**Author**: Siddhant Sutar

Import libraries.

In [17]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

Import training and test samples.

In [18]:
# Load the training rows and the test rows from CSV files in the working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

Helper function that splits a comma-separated field (such as a cast list) into individual tokens.

In [19]:
def tokens(x):
    """Split a comma-separated string into its individual entries.

    The separator is the exact two-character sequence ``', '``; no whitespace
    stripping is performed. An empty string yields ``['']`` (a single empty
    token), not an empty list.
    """
    separator = ', '
    return x.split(separator)

Initialize vectorizer.

In [20]:
# Column names of the combined feature matrix. The actor cell assigns it and the
# director and genre cells append to it.
features_list = []
# With use_idf=False, no normalisation and binary=True, each stored entry is simply
# 1.0 when the token occurs in the row. `tokens` splits each field on ', ' and
# lowercase=False keeps names case-sensitive.
# `norm` accepts 'l1', 'l2' or None; None (rather than False) is the documented way
# to disable normalisation and is the only form accepted by scikit-learn releases
# that validate constructor parameters.
vectorizer = TfidfVectorizer(tokenizer=tokens, lowercase=False, use_idf=False, norm=None, binary=True)

Create feature vectors for actors.

In [21]:
# Missing cast entries are replaced by '' before vectorising, so every row is a string.
cast_text = train["Cast"].fillna('')
actors = vectorizer.fit_transform(cast_text)
# Drop the first vocabulary entry -- presumably the '' token produced by rows with a
# missing cast (TODO confirm that it always sorts first / is always present).
actor_names = vectorizer.get_feature_names()[1:]
features_list = [actor.encode('UTF8') for actor in actor_names]
actors

<47927x76908 sparse matrix of type '<type 'numpy.float64'>'
	with 184622 stored elements in Compressed Sparse Row format>

Create feature vectors for directors.

In [22]:
# Re-fit the same vectorizer on the director column (missing values become '').
director_text = train["Director"].fillna('')
directors = vectorizer.fit_transform(director_text)
directors_list = vectorizer.get_feature_names()
# Director columns are prefixed with "dir_" in features_list; the first vocabulary
# entry (presumably the '' token from missing values) is skipped.
features_list.extend(["dir_" + director.encode('UTF8') for director in directors_list[1:]])
directors

<47927x21817 sparse matrix of type '<type 'numpy.float64'>'
	with 53373 stored elements in Compressed Sparse Row format>

Create feature vectors for genres.

In [23]:
# Re-fit the same vectorizer on the genre column (missing values become '').
genre_text = train["Genre"].fillna('')
genres = vectorizer.fit_transform(genre_text)
genres_list = vectorizer.get_feature_names()
# Append the genre names as plain strings, skipping the first vocabulary entry
# (presumably the '' token from missing values).
features_list.extend([str(genre) for genre in genres_list[1:]])
genres

<47927x22 sparse matrix of type '<type 'numpy.float64'>'
	with 85569 stored elements in Compressed Sparse Row format>

Create feature vectors for plot keywords.

In [24]:
# English stop words from NLTK, de-duplicated into a set.
stopset = set(stopwords.words('english'))
# NOTE: this rebinds `vectorizer`. From here on it is a word-level vectorizer for
# free text, not the comma-splitting one used for cast/director/genre, so the
# earlier cells must not be re-run after this one without re-creating it.
# `norm=None` is the documented way to disable normalisation, and `stop_words` is
# documented as 'english', a list or None, so the set is passed as a sorted list.
vectorizer = TfidfVectorizer(use_idf=False, norm=None, strip_accents='ascii', stop_words=sorted(stopset), binary=True)
# Binary keyword-presence matrix for the plot text (missing plots become '').
plot = vectorizer.fit_transform(train["FullPlot"].fillna(''))
keywords_list = vectorizer.get_feature_names()

Join the actor, director, and genre sparse matrices column-wise into a single feature matrix for the model. The plot keyword matrix is not included.

In [25]:
# Drop column 0 of each matrix (matching the [1:] slice applied to the feature names)
# and concatenate the remaining columns side by side: actors, then directors, then genres.
feature_blocks = [block[:, 1:] for block in (actors, directors, genres)]
features = hstack(feature_blocks)

Initialize the linear regression model and fit it to the training features and IMDb ratings.

In [26]:
# Inputs are the combined sparse feature matrix; the target is the IMDb rating column.
X, y = features, train["imdbRating"]
# fit() returns the estimator itself, so construction and fitting can be chained.
lm = LinearRegression().fit(X, y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
# Fitted intercept: the value the model predicts for a row whose feature columns are all zero.
lm.intercept_

5.9168151563836675

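Predict the rating of a sample movie, described by its genres, director, and cast, by adding the coefficient of each of its features to the intercept.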

In [28]:
# Score one movie by hand: start from the intercept and add the learned coefficient
# of every feature that is present for it (all features are 0/1 indicators).
rating = lm.intercept_
# Stored under its own name so the training matrix bound to `features` is not
# overwritten; rebinding `features` here would break a re-run of the fitting cell.
movie_features = ["Biography", "Drama", "History", "dir_Steven Spielberg", "Liam Neeson", "Ben Kingsley", "Ralph Fiennes", "Caroline Goodall"]
for feature_name in movie_features:
    # features_list is built in the same order as the columns of the fitted matrix,
    # so a name's position is also its coefficient index. index() raises ValueError
    # for a name that is not in features_list.
    rating += lm.coef_[features_list.index(feature_name)]
# NOTE(review): the sum is not clamped to the rating scale, so it can exceed 10.

Outcome rating.

In [29]:
# Show the predicted rating (parenthesised form prints identically for a single value).
print(rating)

10.3329293859
