**Author**: Siddhant Sutar

Import pandas, NumPy, and the ordered logistic regression module by Fabian Pedregosa (with a few tweaks), obtained from https://github.com/fabianp/minirank/blob/master/minirank/logistic.py, since scikit-learn does not support ordinal regression yet.

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from logistic import ordinal_logistic_fit, ordinal_logistic_predict

Read training and test data.

In [2]:
# Load the training and test splits; both CSV files are read from the
# notebook's working directory.
train_path, test_path = "train.csv", "test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
def tokens(x):
    """Split a comma-separated field into its individual entries.

    The separator is a comma followed by one space, so "A, B" yields
    ["A", "B"]. An empty string yields [""] (a single empty token), which is
    the usual behaviour of ``str.split`` with an explicit separator.
    """
    separator = ', '
    return x.split(separator)

In [4]:
# Presence/absence encoder for the comma-separated name fields. `tokens`
# splits each field on ", " and lowercase=False keeps names exactly as written.
# With use_idf=False, binary=True and no normalisation, every stored value is
# 1.0 when the name occurs in the row.
# norm=None is the documented way to switch row normalisation off. The earlier
# norm=False only worked because old scikit-learn tested the value for
# truthiness; releases that validate parameters reject it.
vectorizer = TfidfVectorizer(
    tokenizer=tokens,
    lowercase=False,
    use_idf=False,
    norm=None,
    binary=True,
)

In [5]:
# Row-by-actor indicator matrix built from the "Cast" column; missing values
# are replaced by an empty string before vectorising.
actors = vectorizer.fit_transform(train["Cast"].fillna(''))
# Skip the first feature name (presumably the empty token that blank rows
# produce) and store the remaining names as UTF-8 encoded strings. Because of
# the skip, position i in actors_list corresponds to matrix column i + 1.
actors_list = [
    name.encode('UTF8')
    for name in vectorizer.get_feature_names()[1:]
]
actors

<47927x76908 sparse matrix of type '<type 'numpy.float64'>'
	with 184622 stored elements in Compressed Sparse Row format>

In [6]:
# Row-by-director indicator matrix built from the "Director" column; missing
# values are replaced by an empty string before vectorising.
directors = vectorizer.fit_transform(train["Director"].fillna(''))
# Skip the first feature name (presumably the empty token that blank rows
# produce) and store the remaining names as UTF-8 encoded strings. Because of
# the skip, position i in directors_list corresponds to matrix column i + 1.
directors_list = [
    name.encode('UTF8')
    for name in vectorizer.get_feature_names()[1:]
]
directors

<47927x21817 sparse matrix of type '<type 'numpy.float64'>'
	with 53373 stored elements in Compressed Sparse Row format>

In [7]:
# Row-by-genre indicator matrix built from the "Genre" column; missing values
# are replaced by an empty string before vectorising.
genres = vectorizer.fit_transform(train["Genre"].fillna(''))
# Skip the first feature name (presumably the empty token that blank rows
# produce) and store the remaining names as UTF-8 encoded strings. Because of
# the skip, position i in genres_list corresponds to matrix column i + 1.
genres_list = [
    name.encode('UTF8')
    for name in vectorizer.get_feature_names()[1:]
]
genres

<47927x22 sparse matrix of type '<type 'numpy.float64'>'
	with 85569 stored elements in Compressed Sparse Row format>

In [8]:
# Word-presence matrix for the free-text plot summaries, with English stop
# words removed and accented characters folded to ASCII.
stopset = set(stopwords.words('english'))
# NOTE: this rebinds `vectorizer`, replacing the comma-splitting encoder that
# the cast/director/genre cells use, so those cells must run before this one.
# norm=None is the documented way to disable normalisation (norm=False relied
# on a truthiness check in old scikit-learn), and stop_words is passed as a
# list because that is the documented collection type; scikit-learn releases
# that validate parameters reject both False and a set. Sorting only makes the
# list deterministic; the words filtered out are the same.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm=None,
    strip_accents='ascii',
    stop_words=sorted(stopset),
    binary=True,
)
plot = vectorizer.fit_transform(train["FullPlot"].fillna(''))
# Unlike the name-field vocabularies, the full feature-name list is kept here
# (no leading entry is dropped).
keywords_list = vectorizer.get_feature_names()
plot

<47927x61781 sparse matrix of type '<type 'numpy.float64'>'
	with 1374809 stored elements in Compressed Sparse Row format>

Ordered logit regression model

In [9]:
# Hand-picked attributes describing the movie to score.
g = ["Biography", "Drama", "History"]
d = ["Steven Spielberg"]
a = ["Liam Neeson", "Ben Kingsley", "Ralph Fiennes", "Caroline Goodall"]
keywords = [] #["corruption", "vampire", "girl"]

# Each selection is paired with its indicator matrix and vocabulary list.
# Order matters: genres first, then directors, then actors.
selections = [
    (g, genres, genres_list),
    (d, directors, directors_list),
    (a, actors, actors_list),
]

# One dense 0/1 vector per selected name. The vocabulary lists were built with
# the first feature name dropped, so list position i is matrix column i + 1.
features = [
    matrix[:, vocabulary.index(str(name)) + 1].toarray().flatten()
    for names, matrix, vocabulary in selections
    for name in names
]

In [10]:
# `features` holds one vector per selected attribute, so the stacked matrix is
# (attributes x rows); transposing gives the (rows x attributes) design matrix,
# which is then made dense for the fitting routine.
X = csr_matrix(features).T.todense()
# Ordinal target: the binned rating column of the training data.
y = train["OrderedRating"]
# Fit the ordered logit model; returns the coefficients and the thresholds.
w, theta = ordinal_logistic_fit(X, y)

  jac=f_grad, hessp=f_hess, options=options, callback=callback)


Ordered rating classes, given as half-open intervals of the original rating: 0 = [0, 5), 1 = [5, 6), 2 = [6, 7), 3 = [7, 8), 4 = [8, 9), 5 = [9, 10)

In [11]:
# Score a hypothetical movie that has every selected attribute: an all-ones
# vector with one entry per feature used to fit the model.
all_selected = np.ones(len(features))
pred = ordinal_logistic_predict(w, theta, all_selected)
print(pred)

5
