In [248]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
import os
import cv2
import matplotlib.pyplot as plt
import math
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from PIL import Image, ImageChops
from matplotlib import image
from scipy.stats import binned_statistic
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [249]:
# load the pre-cleaned data set from the notebook's working directory
csv_path = "cleaned.csv"
data = pd.read_csv(csv_path)

In [250]:
# columns pulled from the raw data: model inputs, the movie title, and the label
features = [
    "Genre",
    "Rated",
    "Runtime",
    "num_faces",
    "top_director",
    "top_actor",
    "movie_name",
    "tomato_indic",
]

In [251]:
#function to normalize numerical data
def NormalizeData(data):
    """Min-max scale ``data`` so its values lie in the range [0, 1].

    Parameters
    ----------
    data : array-like (e.g. numpy array or pandas Series)
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as ``data - np.min(data)`` produces, holding
    ``(x - min) / (max - min)`` for every element. When every value is
    identical (max == min) the result is all zeros instead of NaN.
    """
    lowest = np.min(data)
    highest = np.max(data)
    span = highest - lowest
    if span == 0:
        # constant input: dividing by a zero range would yield NaN/inf,
        # so map every element to 0.0 while keeping the input's shape
        return (data - lowest) * 0.0
    return (data - lowest) / span

In [252]:
# keep only complete rows for the selected columns, then strip every space
# out of the two text-valued categorical columns
feature_data = data[features].dropna().reset_index(drop=True)
for text_column in ("Genre", "Rated"):
    feature_data[text_column] = feature_data[text_column].str.replace(" ", "", regex=False)

In [253]:
# drop the " min" suffix from each runtime string and store the value as an integer
runtime_text = feature_data['Runtime'].str.replace(" min", "", regex=False)
feature_data['Runtime'] = runtime_text.astype(int)

In [254]:
# turn the comma-separated genre string into a list of genre names
feature_data['Genre'] = feature_data['Genre'].str.split(",")

In [255]:
#convert all categorical to dummies
mlb = MultiLabelBinarizer()
genres = feature_data['Genre']
# one indicator column per genre; a movie listed under several genres gets a 1 in each
genre_dummies = pd.DataFrame(mlb.fit_transform(genres),columns = mlb.classes_,index = feature_data.index)
rated_dummies = pd.get_dummies(feature_data['Rated'])
# explicit copy: the normalization below assigns into this frame, and writing
# into a bare slice of feature_data triggers pandas' SettingWithCopyWarning
runtime = feature_data[['Runtime']].copy()
top_actor = feature_data[['top_actor']]
top_director = feature_data[['top_director']]
runtime['Runtime'] = NormalizeData(runtime['Runtime']) #normalize runtime data to between 0,1
faces = feature_data[['num_faces']]
labels = feature_data['tomato_indic']

In [256]:
# preview the cleaned feature frame (Genre is now a list per movie, Runtime an integer)
feature_data

Unnamed: 0,Genre,Rated,Runtime,num_faces,top_director,top_actor,movie_name,tomato_indic
0,"[Action, Adventure, Drama]",PG-13,138,2,0,0,The Tomorrow War,0
1,"[Horror, Thriller]",R,88,0,0,0,Till Death,1
2,"[Action, Horror, Thriller]",R,103,0,0,0,The Forever Purge,0
3,"[Crime, Drama, Mystery]",R,115,2,1,0,No Sudden Move,1
4,"[Documentary, Music]",PG-13,118,1,0,0,"Summer of Soul (...Or, When the Revolution Cou...",1
...,...,...,...,...,...,...,...,...
3051,"[Animation, Adventure, Family]",PG,82,0,0,0,Dinosaur,1
3052,"[Action, Crime, Thriller]",R,115,1,0,0,Romeo Must Die,0
3053,"[Biography, Drama, Sport]",PG,113,0,0,1,Remember the Titans,1
3054,"[Romance, Comedy, Musical]",PG,93,0,1,0,Love's Labour's Lost,1


In [257]:
# stitch the per-feature frames side by side, with the label as the last column
# NOTE(review): rated_dummies (built from 'Rated') is not part of this concat —
# confirm the rating was meant to be left out of the model
model_parts = [runtime, genre_dummies, top_actor, top_director, faces, labels]
features_labels = pd.concat(model_parts, axis="columns")
features_labels

Unnamed: 0,Runtime,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,Sci-Fi,Short,Sport,Thriller,War,Western,top_actor,top_director,num_faces,tomato_indic
0,0.552486,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,0
1,0.276243,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0.359116,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0.425414,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,2,1
4,0.441989,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3051,0.243094,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3052,0.425414,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3053,0.414365,0,0,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3054,0.303867,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [258]:
# print each column's total; for the 0/1 dummy columns this is the number of
# movies carrying that feature
for column_name in features_labels:
    column_total = sum(features_labels[column_name])
    print(column_name, column_total)

Runtime 1113.1657458563502
Action 608
Adventure 455
Animation 126
Biography 268
Comedy 1164
Crime 508
Documentary 283
Drama 1600
Family 190
Fantasy 192
History 116
Horror 278
Music 144
Musical 26
Mystery 281
News 14
Romance 571
Sci-Fi 168
Short 4
Sport 98
Thriller 495
War 60
Western 20
top_actor 478
top_director 687
num_faces 2486
tomato_indic 1533


In [259]:
# genres backed by only a small number of samples (e.g. Short or News) are removed
sparse_genres = ['News', 'Western', 'Sport', 'Musical', 'Music', 'War', 'Short']
features_labels = features_labels.drop(columns=sparse_genres)

In [260]:
#create train and test set
# fixed seed so the 80/20 split, and every score computed from it, is the same on each run
train,test = train_test_split(features_labels,test_size = 0.2,random_state = 9)
# un-split copies of the inputs and label, used by the cross-validated grid search
full_train_x = features_labels.drop(columns = ['tomato_indic'])
full_train_y = features_labels['tomato_indic']
# inputs are every column except the label
x_train = train.drop(columns = ['tomato_indic'])
x_test = test.drop(columns = ['tomato_indic'])
y_train = train['tomato_indic']
y_test = test['tomato_indic']

In [261]:
# baseline logistic regression fitted on the training split
clf = LogisticRegression(random_state=9)
clf.fit(x_train, y_train)


In [262]:
# candidate classifiers as (display name, estimator) pairs
models = [
    ("Logistic Regression", LogisticRegression(random_state=9)),
    ("SVM", svm.SVC()),
    # seeded so the forest, and therefore the model comparison, is repeatable
    ("Random Forest Classifier", RandomForestClassifier(n_estimators=50, random_state=9)),
    ("MultinomialNB", MultinomialNB(alpha = 2)),
]

In [263]:
# fit every candidate on the training split and keep the one with the highest
# accuracy on the test split
# NOTE(review): the winner is picked on the same test split that is reported,
# so the printed accuracy is an optimistic estimate
best_model = None   # becomes (fitted estimator, accuracy) for the winner
best_name = None
top_score = float("-inf")
for name, estimator in models:
    estimator.fit(x_train,y_train)
    score = estimator.score(x_test,y_test)
    if score > top_score:
        top_score = score
        best_name = name
        best_model = (estimator,score)

if best_model is None:
    # only reachable when `models` is empty
    raise ValueError("no models were evaluated")

# report the readable model name instead of the full estimator repr
print("Best model is {} with accuracy of: {}".format(best_name,np.round(best_model[1],2)))

Best model is LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=9, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) with accuracy of: 0.62


In [264]:
#best model is usually logistic regression 

In [265]:
# learned coefficients of the winning estimator; best_model is an (estimator, score) tuple
# NOTE(review): coef_ is only exposed by the linear models in `models` — presumably
# this cell fails if the SVC or the random forest wins; confirm before relying on it
best_model[0].coef_

array([[ 1.2642203 , -0.30458052, -0.2500301 ,  0.78019018,  0.34913797,
         0.04992414, -0.23807216,  1.61905318,  0.43527066, -0.12163686,
        -0.05044203, -0.18510497, -0.22239886, -0.25237542, -0.01392235,
         0.13908417,  0.02293103,  0.01360906,  0.50017658, -0.06731433]])

In [266]:
#compile feature importance
# pair each coefficient with the column it was fitted on; x_train holds exactly
# the model inputs, whereas features_labels also carries the label column
coefficients = best_model[0].coef_[0]
input_columns = x_train.columns
assert len(input_columns) == len(coefficients), "coefficient/column count mismatch"

#list feature importance, largest positive coefficient first
feature_importance = sorted(zip(input_columns,coefficients),key = lambda pair: pair[1],reverse = True)
feature_importance

[('Documentary', 1.619053176444512),
 ('Runtime', 1.2642202954279445),
 ('Animation', 0.7801901761985361),
 ('top_director', 0.5001765806030065),
 ('Drama', 0.43527066016938143),
 ('Biography', 0.3491379682604382),
 ('Sci-Fi', 0.13908417422835676),
 ('Comedy', 0.04992413734445178),
 ('Thriller', 0.02293103426430117),
 ('top_actor', 0.013609064230116806),
 ('Romance', -0.013922352784423976),
 ('Fantasy', -0.050442028265217806),
 ('num_faces', -0.06731433062610916),
 ('Family', -0.12163686057770638),
 ('History', -0.18510497125499897),
 ('Horror', -0.22239886046677265),
 ('Crime', -0.23807215721021133),
 ('Adventure', -0.25003009853898783),
 ('Mystery', -0.2523754208350398),
 ('Action', -0.3045805180128779)]

The logistic regression coefficients can be read as feature importance: the larger a coefficient's magnitude, the stronger that feature's influence on the prediction, and its sign gives the direction of the effect. Interestingly, whether or not a movie is a Documentary is important, along with how long a movie is (longer movies tend to score better), whether it is an Animation, and whether the director is a top director. Adventure, Mystery, and Action movies tend to do badly in terms of Rotten Tomatoes ratings.

In [268]:
#do hyperparameter tuning on the logistic regression model
model = LogisticRegression(random_state=9, max_iter = 100)

#hyperparameters
solvers = ['newton-cg','lbfgs','liblinear','sag','saga']
penalty = ['l2'] #other penalty types not supported
c_values = [100,10,1.0,0.1,0.01]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1) # stratified 10-fold cross validation, run once
# every solver / penalty / C combination is scored by cross-validated accuracy
grid = dict(solver = solvers,penalty = penalty,C = c_values)
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = cv,scoring = 'accuracy')
grid_result = grid_search.fit(full_train_x,full_train_y)

# format to two decimals directly (plain %f would print six, e.g. 0.620000)
print("Best: %.2f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.620000 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
