In [145]:
import pandas as pd
import pickle as pkl
from tqdm import tqdm_notebook as timer
import string
import re
import numpy as np

import sklearn.linear_model as lm
import sklearn.metrics as mcs

Using a set of features that include the following:
- top 5 pageranks of the actors (five floating point values) in each movie.
- if the director is one of the top 100 directors or not (101 boolean values).These
are directors of the top 100 movies from the "IMDb top 250". 

You can also
find a list of these movies in the movie_rating.txt file.
train a regression model and predict the ratings of the 3 movies mentioned
above. 

Specify the exact feature set you use and how you compute the numerical
values for these features. Compute and state the goodness of fit for your
regression model.

# Get A,M from File

In [2]:
with open("pyfiles/A.pkl") as afile, open("pyfiles/M.pkl") as mfile:
    A = pkl.load(afile)
    M = pkl.load(mfile)

# Get PageRanks from File

In [3]:
page_ranks = pd.read_csv("page_rank.txt", sep="\t")
page_ranks["Name"] = page_ranks["Name"].apply(string.rstrip)
page_ranks = page_ranks.set_index("Name")

# Find top 100 movies

In [19]:
movies = pd.read_csv("data/top_250.txt", sep="\t", header=None)
top_100 = {movie : i for (i, movie) in enumerate( movies[0].values[1:100] )}

# Get movie ratings

In [107]:
ratings = pd.read_csv("ratings.txt", sep = "\t", header = None, names=["Name", "Rating"])
ratings = ratings.set_index("Name")

# Match each movie to (director == top-100 director)

In [129]:
D = {}

printable = set(string.printable)

def clean_string(s):
    # Remove (*)
    s = re.sub(r'\([^0-9]+\)', '', s)

    # Remove {*}
    s = re.sub(r'\{.*\}', '', s)
    
    # Remove multiple spaces
    s = re.sub(r'\s+', ' ', s)

    # Strip spaces etc
    s = s.lstrip()
    s = s.rstrip()

    return s

all_movies = set( M.keys() )

with open("data/director_movies.txt") as f:
    for line in timer(f, total = 389663, desc = "directors"):
        line = filter(lambda x : x in printable, line.decode('latin1')).encode('ascii')
        splits = line.split("\t\t")
        
        dname = clean_string(splits[0])
        movies = set( map(clean_string, splits[1:]) )
        
        # Filter out unwanted movies
        movies = movies.intersection(all_movies)
        
        if movie in top_100:
            D[movie] = 1




# Find all movies where ratings are available

In [130]:
available_movies = set(M.keys()).intersection(ratings.index.values)

# Run a logistic regression using the features:
1. Top 10 page ranks of actors
2. Boolean indicating whether director was top 100 or not
3. Number of instances where an actor has acted in top 100 movies
4. Average page rank of actors in the movie
5. Average rating of neighbors in the graph

In [183]:
def build_feature_vector(movie):
    arr = [0] * 14
    
    prs = sorted( page_ranks.loc[ M[movie] ]["PageRank"], reverse=True)
    neighbors = [m for a in M[movie] for m in A[a]]
    
    # Top 10 page ranks of actors
    arr[:10] = top10_prs = prs[:10]
    
    # Boolean indicating whether director was top 100 or not
    arr[10] = D.get(movie, 0)
    
    # Number of instances where an actor has acted in top 100 movies
    arr[11] = sum( map(lambda x : x in top_100, neighbors) )
    
    # Average page rank of actors in the movie
    arr[12] = np.mean( prs )
    
    # Average rating of neighbors in the graph
    arr[13] = np.mean( ratings.loc[ neighbors ]["Rating"] )
    
    return( np.array( arr ) )

feature_vec = np.vstack(map(build_feature_vector, timer(available_movies)))




In [184]:
target_vec = ratings.loc[ available_movies ]["Rating"]

In [185]:
lr = lm.LinearRegression()

lr.fit(feature_vec, target_vec)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [186]:
test_movies = [
    "Batman v Superman: Dawn of Justice (2016)",
    "Mission: Impossible III (2006)",
    "Minions (2015)"
]

In [187]:
test_vec = map(build_feature_vector, test_movies)

# Run on 3 movies 

In [188]:
lr.predict(test_vec)

array([ 6.7661026 ,  6.6520514 ,  7.28253742])

In [189]:
predicted = lr.predict(feature_vec)

# Goodness of fit

In [190]:
mcs.mean_absolute_error(target_vec, predicted)

0.72653928112780397

In [191]:
mcs.mean_squared_error(target_vec, predicted)

0.87764049096049612