In [152]:
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from functools import reduce
import winsound
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection  import GridSearchCV, permutation_test_score, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score 
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [153]:
imdb = pd.read_csv("IMDb movies.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [154]:
imdb.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [155]:
len(imdb)

85855

In [156]:
#remove values with no gross income data or budget data
imdb=imdb[imdb['worlwide_gross_income'].isna()==False]
imdb=imdb[imdb['budget'].isna()==False]
imdb.reset_index(drop=True, inplace=True)

In [157]:
#Remove dollar sign from revenue and budget
worldwide=[]
budget=[]
for index, row in imdb.iterrows():
    try:
        worldwide.append(row['worlwide_gross_income'].split('$ ')[1])
    except:
        worldwide.append(0)
        
    budget.append(row['budget'].split(' ')[1])

imdb['worlwide_gross_income']=[int(x) for x in worldwide]
imdb['budget']=[int(x) for x in budget]

In [158]:
len(imdb)

12762

In [159]:
#split multiple items in column into a list of separate items
imdb['country'] = (imdb['country'].str.split(', '))
imdb['genre'] = (imdb['genre'].str.split(', '))
imdb['language'] = (imdb['language'].str.split(', '))
imdb['writer'] = (imdb['writer'].str.split(', '))
imdb['director'] = (imdb['director'].str.split(', '))

In [160]:
# perform sentiment analysis on description and title
des_scores=[]
title_scores=[]
analyzer = SentimentIntensityAnalyzer()
for x in imdb['description']:
    try:
        des_scores.append(analyzer.polarity_scores(x)['compound'])
    except TypeError:
        des_scores.append(0)

for x in imdb['title']:
    try:
        title_scores.append(analyzer.polarity_scores(x)['compound'])
    except TypeError:
        title_scores.append(0)
imdb['description_score']=des_scores
imdb['title']=title_scores
imdb.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics,description_score
0,tt0010323,0.0,Das Cabinet des Dr. Caligari,1920,1920-02-27,"[Fantasy, Horror, Mystery]",76,[Germany],[German],[Robert Wiene],...,"Hypnotist Dr. Caligari uses a somnambulist, Ce...",8.1,55601,18000,$ 8811,8811,,237.0,160.0,-0.4215
1,tt0011440,0.0,Markens grøde,1921,1921-12-02,[Drama],107,[Norway],,[Gunnar Sommerfeldt],...,After the Nobel prize winning Knut Hamsun-nove...,6.6,195,250000,,4272,,3.0,3.0,0.1779
2,tt0012190,0.0,The Four Horsemen of the Apocalypse,1921,1923-04-16,"[Drama, Romance, War]",150,[USA],[None],[Rex Ingram],...,An extended family split up in France and Germ...,7.2,3058,800000,$ 9183673,9183673,,45.0,16.0,-0.7579
3,tt0012349,0.0,The Kid,1921,1923-11-26,"[Comedy, Drama, Family]",68,[USA],"[English, None]",[Charles Chaplin],...,"The Tramp cares for an abandoned child, but ev...",8.3,109038,250000,,26916,,173.0,105.0,-0.631
4,tt0014624,0.0,A Woman of Paris: A Drama of Fate,1923,1927-06-06,"[Drama, Romance]",82,[USA],"[None, English]",[Charles Chaplin],...,A kept woman runs into her former fiancé and f...,7.0,4735,351000,,11233,,37.0,24.0,0.6908


In [161]:
#get dummy variables for categories that need them
df = pd.get_dummies(imdb['genre'].apply(pd.Series).stack()).sum(level=0)
df2 = pd.get_dummies(imdb['language'].apply(pd.Series).stack()).sum(level=0)
df3 = pd.get_dummies(imdb['director'].apply(pd.Series).stack()).sum(level=0)
df4 = pd.get_dummies(imdb['writer'].apply(pd.Series).stack()).sum(level=0)
df5 = pd.get_dummies(imdb['country'].apply(pd.Series).stack()).sum(level=0)

In [162]:
df = df.add_prefix('genre_')
df2 = df2.add_prefix('langauge_')
df3 = df3.add_prefix('director_')
df4 = df4.add_prefix('writer_')
df5 = df5.add_prefix('country_')

In [163]:
#add similar column for merging
df['imdb_title_id']=imdb['imdb_title_id']
df2['imdb_title_id']=imdb['imdb_title_id']
df3['imdb_title_id']=imdb['imdb_title_id']
df4['imdb_title_id']=imdb['imdb_title_id']
df5['imdb_title_id']=imdb['imdb_title_id']

In [164]:
#merge all dataframes into one
dfs = [imdb,df,df2,df3,df4,df5]
df_final = reduce(lambda left,right: pd.merge(left,right,on='imdb_title_id'), dfs)

In [165]:
df_final

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,country_UK,country_USA,country_Ukraine,country_United Arab Emirates,country_Uruguay,country_Venezuela,country_Vietnam,country_West Germany,country_Yemen,country_Yugoslavia
0,tt0010323,0.0,Das Cabinet des Dr. Caligari,1920,1920-02-27,"[Fantasy, Horror, Mystery]",76,[Germany],[German],[Robert Wiene],...,0,0,0,0,0,0,0,0,0,0
1,tt0012190,0.0,The Four Horsemen of the Apocalypse,1921,1923-04-16,"[Drama, Romance, War]",150,[USA],[None],[Rex Ingram],...,0,1,0,0,0,0,0,0,0,0
2,tt0012349,0.0,The Kid,1921,1923-11-26,"[Comedy, Drama, Family]",68,[USA],"[English, None]",[Charles Chaplin],...,0,1,0,0,0,0,0,0,0,0
3,tt0014624,0.0,A Woman of Paris: A Drama of Fate,1923,1927-06-06,"[Drama, Romance]",82,[USA],"[None, English]",[Charles Chaplin],...,0,1,0,0,0,0,0,0,0,0
4,tt0015864,0.0,The Gold Rush,1925,1925-10-23,"[Adventure, Comedy, Drama]",95,[USA],"[English, None]",[Charles Chaplin],...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12654,tt9878242,0.0,Subharathri,2019,2019-07-06,"[Drama, Romance]",130,[India],[Malayalam],[Vyasan K.P.],...,0,0,0,0,0,0,0,0,0,0
12655,tt9886872,0.0,Munthiri Monchan,2019,2019-12-06,"[Comedy, Romance]",130,[India],[Malayalam],[Vijith Nambiar],...,0,0,0,0,0,0,0,0,0,0
12656,tt9894394,0.0,Upin & Ipin: Keris Siamang Tunggal,2019,2019-03-21,[Animation],100,[Malaysia],[Malay],"[Adam Bin Amiruddin, Syed Nurfaiz Khalid bin S...",...,0,0,0,0,0,0,0,0,0,0
12657,tt9900782,0.0,Kaithi,2019,2019-10-25,"[Action, Thriller]",145,[India],[Tamil],[Lokesh Kanagaraj],...,0,0,0,0,0,0,0,0,0,0


In [166]:
# get a binary column to represent whether a movie was able to recoup its production costs
profitable = []
for index, row in df_final.iterrows():
    if row['worlwide_gross_income']>row['budget']:
        profitable.append(1)
    else:
        profitable.append(0)
df_final['profitable']=profitable

In [167]:
df_final.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,country_USA,country_Ukraine,country_United Arab Emirates,country_Uruguay,country_Venezuela,country_Vietnam,country_West Germany,country_Yemen,country_Yugoslavia,profitable
0,tt0010323,0.0,Das Cabinet des Dr. Caligari,1920,1920-02-27,"[Fantasy, Horror, Mystery]",76,[Germany],[German],[Robert Wiene],...,0,0,0,0,0,0,0,0,0,0
1,tt0012190,0.0,The Four Horsemen of the Apocalypse,1921,1923-04-16,"[Drama, Romance, War]",150,[USA],[None],[Rex Ingram],...,1,0,0,0,0,0,0,0,0,1
2,tt0012349,0.0,The Kid,1921,1923-11-26,"[Comedy, Drama, Family]",68,[USA],"[English, None]",[Charles Chaplin],...,1,0,0,0,0,0,0,0,0,0
3,tt0014624,0.0,A Woman of Paris: A Drama of Fate,1923,1927-06-06,"[Drama, Romance]",82,[USA],"[None, English]",[Charles Chaplin],...,1,0,0,0,0,0,0,0,0,0
4,tt0015864,0.0,The Gold Rush,1925,1925-10-23,"[Adventure, Comedy, Drama]",95,[USA],"[English, None]",[Charles Chaplin],...,1,0,0,0,0,0,0,0,0,0


In [168]:
#get the feature and label sets
#remove all the features that wouldn't be known until after a movie is produced
X = df_final.drop(['imdb_title_id','title','original_title','date_published',
                   'genre', 'duration', 'country', 'language', 'director', 'writer',
                   'production_company', 'actors', 'description', 'avg_vote', 'votes',
                   'usa_gross_income', 'worlwide_gross_income', 'metascore','reviews_from_users', 
                   'reviews_from_critics', 'description_score'],axis=1)
y = df_final['profitable']

In [169]:
corlist=pd.DataFrame(X.corrwith(y),columns=['coor'])

In [170]:
corlist.reset_index(inplace=True)

In [171]:
df1 = corlist[corlist['coor']>.025]
df2 = corlist[corlist['coor']<-.025]
df = pd.concat([df1,df2])

In [172]:
print(len(X.columns))
print(len(df))

20255
200


In [173]:
X = X[df['index']]
X.drop('profitable',axis=1,inplace=True)

In [174]:
X.dropna(inplace=True)

In [175]:
# generate more samples from the data to balance out binary outcomes
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [176]:
#get a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=52)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=13)

In [177]:
# standardize the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

In [178]:
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [179]:
# predict values using X test and the new model
y_pred = classifier.predict(X_val)

In [180]:
#print the confusion matrix as well as multiple scores to evaluate the model
cm = confusion_matrix(y_val, y_pred)
print(cm)
print('Accuracy: ' + str(accuracy_score(y_val, y_pred)))
print('AUC: ' + str(roc_auc_score(y_val, y_pred)))
print('F1 Score: ' + str(f1_score(y_val, y_pred)))

[[985 467]
 [507 951]]
Accuracy: 0.6652920962199312
AUC: 0.6653190150664897
F1 Score: 0.6613351877607787


In [181]:
#use grid search cv to run multiple random forest classifier models to find best hyperparameters
param_grid = { 
    'max_depth':[3,5,7,11,None],
    'n_estimators': [50, 100, 250,500],
    'max_features': ['auto','sqrt','log2'],
}

CV_rfc = GridSearchCV(estimator=classifier, param_grid=param_grid, cv= 5,n_jobs=-1)
CV_rfc.fit(X_val, y_val)
print(CV_rfc.best_estimator_)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=11, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [182]:
winsound.Beep(440,250)

In [183]:
# predict values using X test and the new model
y_pred = CV_rfc.best_estimator_.predict(X_test)

In [184]:
#print the confusion matrix as well as multiple scores to evaluate the model
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
print('AUC: ' + str(roc_auc_score(y_test, y_pred)))
print('F1 Score: ' + str(f1_score(y_test, y_pred)))

[[ 991  446]
 [ 463 1010]]
Accuracy: 0.6876288659793814
AUC: 0.6876533341270212
F1 Score: 0.689655172413793


In [185]:
# perform multiple random permutations to find p-value 
clf = CV_rfc.best_estimator_
cv = StratifiedKFold(2, shuffle=True, random_state=35)

score_orig, perm_scores_orig, pvalue_orig = permutation_test_score(
    clf, X_test, y_test, scoring="accuracy", cv=cv, n_permutations=1000)

In [186]:
pvalue_orig

0.000999000999000999

In [187]:
winsound.Beep(440,250)

In [188]:
logisticRegr = LogisticRegression(random_state=21,n_jobs=-1)
logisticRegr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=21,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [189]:
y_pred = logisticRegr.predict(X_val)

In [190]:
#print the confusion matrix as well as multiple scores to evaluate the model
cm = confusion_matrix(y_val, y_pred)
print(cm)
print('Accuracy: ' + str(accuracy_score(y_val, y_pred)))
print('AUC: ' + str(roc_auc_score(y_val, y_pred)))
print('F1 Score: ' + str(f1_score(y_val, y_pred)))

[[ 884  568]
 [ 411 1047]]
Accuracy: 0.663573883161512
AUC: 0.6634612114410094
F1 Score: 0.6814188089814512


In [211]:
#use grid search cv to run multiple random forest classifier models to find best hyperparameters
param_grid = {
    'max_iter':[50,100,200,None],
    'solver':['newton-cg', 'lbfgs', 'liblinear','sag','saga'],
    'penalty':['none','l1','l2','elasticnet'],
    'C':[100, 10, 1.0, 0.1, 0.01]
}

CV_logreg = GridSearchCV(estimator=logisticRegr, param_grid=param_grid, cv= 5,n_jobs=-1)
CV_logreg.fit(X_val, y_val)
print(CV_logreg.best_estimator_)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=50,
                   multi_class='auto', n_jobs=-1, penalty='l1', random_state=21,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)




In [212]:
y_pred = CV_logreg.predict(X_test)

In [213]:
#print the confusion matrix as well as multiple scores to evaluate the model
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
print('AUC: ' + str(roc_auc_score(y_test, y_pred)))
print('F1 Score: ' + str(f1_score(y_test, y_pred)))

[[ 795  642]
 [ 383 1090]]
Accuracy: 0.647766323024055
AUC: 0.646611165204722
F1 Score: 0.6801872074882994


In [214]:
winsound.Beep(440,250)

In [223]:
# perform multiple random permutations to find p-value 
clf = CV_logreg.best_estimator_
cv = StratifiedKFold(2, shuffle=True, random_state=35)

score_orig, perm_scores_orig, pvalue_orig = permutation_test_score(
    clf, X_test, y_test, scoring="accuracy", cv=cv, n_permutations=1000,n_jobs=None)



In [224]:
pvalue_orig

0.000999000999000999