In [1]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import math
from sklearn import preprocessing

In [2]:
#define normalization function
def normalize(df):
    """Return a copy of `df` with each column divided by its own maximum.

    Columns whose maximum is 0 are left untouched, which avoids dividing by
    zero and keeps their original dtype.

    The input frame is not modified: the scaling is applied to a copy, so
    re-running a cell that calls this function always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns support ``max()`` and division.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as `df`.
    """
    normalized = df.copy()
    for column in normalized.columns:
        peak = normalized[column].max()
        # A zero maximum would mean dividing by zero; keep the column as-is.
        if peak != 0:
            normalized[column] = normalized[column] / peak
    return normalized

In [3]:
# Import the gathered training data.
theMovies_DF=pd.read_csv('../DataSources/theMovieFeatures.csv')
# Drop columns that (1) have missing data for some movies, (2) hold string values,
# or (3) are the year column.
theMovies_DF=theMovies_DF.drop(columns=['Revenue', 'Budget', 'Prod Company', 'Title', 'Year'])
# Number of columns left after the drop (this count includes the 'Best Picture' label).
len(theMovies_DF.columns)

54

In [4]:
# Preview the first rows of the training table before any scaling is applied.
theMovies_DF.head()

Unnamed: 0,Best Picture,imdbRating,Popularity,Rated_APPROVED,Rated_Approved,Rated_G,Rated_Not Rated,Rated_PG,Rated_PG-13,Rated_Passed,...,Genre_Short,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Best Actor,Best Actress,Best Director,Best Supporting Actor,Best Supporting Actress
0,0,8.3,11.825,1,0,0,0,0,0,0,...,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0
1,0,7.8,7.708,0,0,0,0,0,0,0,...,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0
2,1,7.2,3.571,0,0,0,0,0,0,1,...,0,0,0,0,0,1.0,0.0,1.0,1.0,0.0
3,0,7.6,1.283,0,1,0,0,0,0,0,...,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0
4,0,5.8,1.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Normalize the data: normalize() divides every column with a non-zero maximum by
# that maximum. The 'Best Picture' label column is passed through it as well.
# NOTE: the un-normalized frame is overwritten by this assignment, so the raw
# column maxima are not kept for later cells.
theMovies_DF=normalize(theMovies_DF)
theMovies_DF.head()

Unnamed: 0,Best Picture,imdbRating,Popularity,Rated_APPROVED,Rated_Approved,Rated_G,Rated_Not Rated,Rated_PG,Rated_PG-13,Rated_Passed,...,Genre_Short,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Best Actor,Best Actress,Best Director,Best Supporting Actor,Best Supporting Actress
0,0.0,0.892473,0.310213,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.83871,0.202209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.774194,0.09368,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,0.817204,0.033658,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.623656,0.036727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#set features
X = theMovies_DF.drop(columns=['Best Picture']).values
#set label
y = theMovies_DF['Best Picture'].values

# Recursive Feature Elimination (RFE): fit the estimator, discard the least
# important feature, and refit, removing one feature per round (step=1) until
# only the 10 most influential features remain.
# For LogisticRegression the importance comes from the fitted coefficients
# (smallest magnitude is dropped first), not from p-values.
# max_iter=10000 caps the solver's iterations for each fit; it is not the
# number of RFE rounds.
estimator = LogisticRegression(solver='sag',max_iter=10000)
# n_features_to_select is keyword-only in scikit-learn >= 1.0; passing it
# positionally raises a TypeError there.
selector = RFE(estimator, n_features_to_select=10, step=1)
selector.fit(X,y)
# ranking_ holds one rank per feature column; rank 1 marks a selected feature.
selector.ranking_

array([43,  1, 25, 38, 29, 33, 40, 19, 26, 20, 28, 41, 18,  2, 23, 22,  1,
        6,  1, 17,  1, 21, 24,  8, 11, 34, 32, 16, 15, 12,  3, 35, 27,  4,
        1, 37, 14, 42,  1,  1,  9, 10,  1, 30, 13, 39, 36,  7,  1, 44,  1,
       31,  5])

In [7]:
# Feature matrix: every column except the label.
X = theMovies_DF.drop(columns=['Best Picture']).values
# Target vector: the label column.
y = theMovies_DF['Best Picture'].values

# Logistic regression on all 53 features (no feature elimination).
# fit() returns the fitted estimator, so construction and fitting are chained.
estimator = LogisticRegression(solver='sag', max_iter=10000).fit(X, y)
# Keep the first row of coef_ and the first entry of intercept_ for later scoring.
coeffs, intercept = estimator.coef_[0], estimator.intercept_[0]


In [8]:
# Coefficients of the top ten most influential features, read from the estimator
# that RFE fitted on the selected features.
# They enter the logistic function p = e^z / (1 + e^z),
# where z = m1*x1 + m2*x2 + ... and each m is a coefficient.
coeffs_top10=(selector.estimator_.coef_[0])
print ('coefficients',selector.estimator_.coef_)

coefficients [[ 0.99171302  1.13318027 -1.09708863 -0.78532234 -0.86061053 -0.84362255
   0.82393898 -0.87209629  1.18148171  3.48796522]]


In [9]:
# Work out which feature names RFE kept: ranking_ has one entry per feature
# column, and the selected features are the ones ranked 1.
features = theMovies_DF.drop(columns=['Best Picture']).columns.tolist()
feature_ranking = list(selector.ranking_)
Rank_DF = zip(feature_ranking, features)
parameters = [parameter for rank, parameter in Rank_DF if rank == 1]
    

In [10]:
# Create a DataFrame pairing the top 10 feature names with their coefficients and
# display it sorted from the largest coefficient to the smallest.
# Assumes `parameters` and `coeffs_top10` list the features in the same order — TODO confirm.
top_10=pd.DataFrame({"Feature": parameters,
                    "Coefficient": coeffs_top10})
top_10.sort_values('Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
9,Best Director,3.487965
8,Best Actor,1.181482
1,Release Date_05,1.13318
0,Popularity,0.991713
6,Genre_Musical,0.823939
3,Release Date_09,-0.785322
5,Genre_Music,-0.843623
4,Genre_Fantasy,-0.860611
7,Genre_Sci-Fi,-0.872096
2,Release Date_07,-1.097089


In [11]:
# Create a DataFrame of all 53 features and their coefficients (regular logistic
# regression, not RFE), displayed from the largest coefficient to the smallest.
all_feat=pd.DataFrame({"Feature": features,
                    "Coefficient": coeffs})
all_feat.sort_values('Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
50,Best Director,3.704131
48,Best Actor,1.308699
16,Release Date_05,1.196039
39,Genre_Musical,0.840018
33,Genre_Family,0.764428
52,Best Supporting Actress,0.752381
1,Popularity,0.730021
30,Genre_Crime,0.710218
36,Genre_History,0.599222
24,Runtime,0.539202


# Predicting 2018 Best Picture

In [47]:
# Load the 2018 nominee features.
contenders=pd.read_csv('../DataSources/theMovieFeatures2018.csv')
# Keep the titles aside; the 'Title' column is dropped below and added back after scaling.
contender_titles=contenders['Title']
contenders=contenders.drop(columns=['Revenue', 'Budget', 'Title'])

# NOTE(review): `columns` does not appear to be read by any later visible cell — confirm it is needed.
columns=contenders.columns
contenders

Unnamed: 0,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG-13,Rated_R,Release Date_02,...,Genre_Action,Genre_Adventure,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_History,Genre_Music,Genre_Romance,Genre_Sci-Fi
0,0,1,0,0,0,8.3,170.104,1,0,0,...,0,0,1,0,0,1,0,1,0,0
1,0,0,0,0,0,7.4,53.369,1,0,1,...,1,1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,7.5,31.767,0,1,0,...,0,0,1,0,1,1,0,0,0,0
3,0,0,1,0,0,7.9,139.806,0,1,0,...,0,0,1,1,0,1,1,0,0,0
4,1,0,0,0,0,8.0,34.205,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,8.0,105.901,0,1,0,...,0,0,0,0,0,1,0,1,1,0
6,0,0,0,0,0,7.1,45.84,0,1,0,...,0,0,1,1,0,1,1,0,0,0
7,0,0,0,1,0,8.3,56.313,1,0,0,...,0,0,1,1,0,1,0,1,0,0


In [48]:
# Normalize the contender data, then re-attach the titles saved earlier.
# NOTE(review): normalize() divides each column by the maximum of the frame it is
# given, so the nominees are scaled by their own maxima rather than by the values
# used for the training data — confirm this per-year scaling is intended.
contenders=normalize(contenders)
contenders['Title']=contender_titles
contenders

Unnamed: 0,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG-13,Rated_R,Release Date_02,...,Genre_Adventure,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_History,Genre_Music,Genre_Romance,Genre_Sci-Fi,Title
0,0.0,1.0,0.0,0.0,0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,Bohemian Rhapsody
1,0.0,0.0,0.0,0.0,0,0.891566,0.313743,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Black Panther
2,0.0,0.0,0.0,0.0,0,0.903614,0.18675,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,BlacKkKlansman
3,0.0,0.0,1.0,0.0,0,0.951807,0.821885,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,The Favourite
4,1.0,0.0,0.0,0.0,0,0.963855,0.201083,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Roma
5,0.0,0.0,0.0,0.0,0,0.963855,0.622566,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,A Star Is Born
6,0.0,0.0,0.0,0.0,0,0.855422,0.269482,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,Vice
7,0.0,0.0,0.0,1.0,0,1.0,0.33105,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Green Book


In [49]:
def getprediction(df,title,coefficients=None,bias=None):
    """Return the modelled probability that the film called `title` wins.

    The probability is the logistic function of
    ``sum(coefficient * feature value) + bias``, summed over every column of
    `df` that has an entry in the coefficient table. Columns without a
    coefficient (for example 'Title') are skipped, and model features that are
    absent from `df` contribute nothing to the sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with a 'Title' column; exactly one row must match `title`.
    title : str
        Title of the film to score.
    coefficients : pandas.DataFrame, optional
        Table with 'Feature' and 'Coefficient' columns. Defaults to the
        notebook-level `all_feat`.
    bias : float, optional
        Intercept of the model. Defaults to the notebook-level `intercept`.

    Returns
    -------
    float
        The predicted probability.

    Raises
    ------
    ValueError
        If `title` matches no row, or more than one row, of `df`.
    """
    if coefficients is None:
        coefficients = all_feat
    if bias is None:
        bias = intercept

    row = df[df['Title']==title]
    if len(row) != 1:
        raise ValueError(
            "expected exactly one row titled %r, found %d" % (title, len(row))
        )

    # Look coefficients up by column name rather than by position; the first
    # entry wins if a feature name is listed more than once.
    weights = {}
    for feature, weight in zip(coefficients['Feature'], coefficients['Coefficient']):
        weights.setdefault(feature, weight)

    logit = 0.0
    for c in df.columns:
        # Only columns the model knows about take part in the score.
        if c in weights:
            logit += float(row[c].iloc[0]) * float(weights[c])
    logit += float(bias)

    # Numerically stable logistic function: never exponentiate a large
    # positive number, so extreme scores cannot raise OverflowError.
    if logit >= 0:
        return 1.0 / (1.0 + math.exp(-logit))
    odds = math.exp(logit)
    return odds / (1.0 + odds)

In [50]:
# Build one record per 2018 nominee: its title and the model's win probability.
predictions = [
    {'Title': film, 'Liklihood of Winning': getprediction(contenders, film)}
    for film in contenders['Title'].tolist()
]

Predictions = pd.DataFrame(predictions)

In [51]:
# Order the nominees from the highest predicted probability to the lowest and display them.
Predictions=Predictions.sort_values('Liklihood of Winning', ascending=False)
Predictions

Unnamed: 0,Liklihood of Winning,Title
4,0.829628,Roma
3,0.186584,The Favourite
0,0.16309,Bohemian Rhapsody
6,0.079448,Vice
5,0.039632,A Star Is Born
7,0.033079,Green Book
2,0.028929,BlacKkKlansman
1,0.01752,Black Panther


# Predicting 2016 Best Picture

In [41]:
# Load the 2016 nominee features.
films2016=pd.read_csv('../DataSources/theMovieFeatures2016.csv')

# Keep the titles aside; the 'Title' column is dropped below and added back after scaling.
films2016_titles=films2016['Title']
films2016=films2016.drop(columns=['Revenue', 'Budget', 'Title'])

# NOTE(review): `columns` does not appear to be read by any later visible cell — confirm it is needed.
columns=films2016.columns
films2016.head()

Unnamed: 0,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG,Rated_PG-13,Rated_R,...,Genre_Drama,Genre_History,Genre_Music,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,1,0,1,0,0,8.0,17.561,0,1,0,...,1,0,1,1,0,1,0,0,0,0
1,0,0,0,0,0,7.9,20.369,0,1,0,...,1,0,0,0,1,0,1,1,0,0
2,0,0,0,0,0,8.1,24.197,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,7.6,12.255,0,0,1,...,1,0,0,0,0,0,0,1,0,1
4,0,0,0,0,0,7.8,19.442,1,0,0,...,1,1,0,0,0,0,0,0,0,0


In [42]:
# Normalize the 2016 data, then re-attach the titles saved earlier.
# NOTE(review): normalize() divides each column by the maximum of the frame it is
# given, so these nominees are scaled by their own maxima rather than by the values
# used for the training data — confirm this per-year scaling is intended.
films2016=normalize(films2016)
films2016['Title']=films2016_titles
films2016.head()

Unnamed: 0,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG,Rated_PG-13,Rated_R,...,Genre_History,Genre_Music,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western,Title
0,1.0,0.0,1.0,0.0,0.0,0.987654,0.725751,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,La La Land
1,0.0,0.0,0.0,0.0,0.0,0.975309,0.841799,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,Arrival
2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lion
3,0.0,0.0,0.0,0.0,0.0,0.938272,0.506468,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Hell or High Water
4,0.0,0.0,0.0,0.0,0.0,0.962963,0.803488,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hidden Figures


In [43]:
# Score every 2016 nominee, then list them from most to least likely winner.
predictions2016 = [
    {'Title': film, 'Liklihood of Winning': getprediction(films2016, film)}
    for film in films2016['Title'].tolist()
]

Predictions2016 = pd.DataFrame(predictions2016).sort_values('Liklihood of Winning', ascending=False)
Predictions2016

Unnamed: 0,Liklihood of Winning,Title
0,0.832702,La La Land
7,0.295216,Manchester by the Sea
6,0.147674,Hacksaw Ridge
3,0.137059,Hell or High Water
8,0.110952,Fences
5,0.092862,Moonlight
2,0.088566,Lion
4,0.058225,Hidden Figures
1,0.031279,Arrival


# Predicting 2017 Best Picture

In [44]:
# Load the 2017 nominee features.
films2017=pd.read_csv('../DataSources/theMovieFeatures2017.csv')
# Keep the titles aside; the 'Title' column is dropped below and added back after scaling.
films2017_titles=films2017['Title']
films2017=films2017.drop(columns=['Revenue', 'Budget', 'Title'])

# NOTE(review): `columns` does not appear to be read by any later visible cell — confirm it is needed.
columns=films2017.columns
films2017.head()

Unnamed: 0,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG-13,Rated_R,Release Date_02,...,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Fantasy,Genre_History,Genre_Horror,Genre_Mystery,Genre_Romance,Genre_Thriller,Genre_War
0,1,0,0,0,0,7.4,21.814,0,1,0,...,0,0,1,1,0,0,0,1,1,0
1,0,1,0,0,0,7.4,14.713,1,0,0,...,0,0,1,0,1,0,0,0,0,1
2,0,0,0,0,0,7.9,21.118,1,0,0,...,0,0,1,0,1,0,0,0,1,1
3,0,0,0,0,0,7.5,13.06,0,1,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,7.2,13.776,1,0,0,...,0,0,1,0,1,0,0,0,1,0


In [45]:
# Normalize the 2017 data, then re-attach the titles saved earlier.
# NOTE(review): normalize() divides each column by the maximum of the frame it is
# given, so these nominees are scaled by their own maxima rather than by the values
# used for the training data — confirm this per-year scaling is intended.
films2017=normalize(films2017)
films2017['Title']=films2017_titles
films2017.head()

Unnamed: 0,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG-13,Rated_R,Release Date_02,...,Genre_Crime,Genre_Drama,Genre_Fantasy,Genre_History,Genre_Horror,Genre_Mystery,Genre_Romance,Genre_Thriller,Genre_War,Title
0,1.0,0.0,0.0,0.0,0,0.902439,0.906763,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,The Shape of Water
1,0.0,1.0,0.0,0.0,0,0.902439,0.611589,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,Darkest Hour
2,0.0,0.0,0.0,0.0,0,0.963415,0.877832,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,Dunkirk
3,0.0,0.0,0.0,0.0,0,0.914634,0.542877,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Phantom Thread
4,0.0,0.0,0.0,0.0,0,0.878049,0.57264,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,The Post


In [46]:
# Score every 2017 nominee, then list them from most to least likely winner.
predictions2017 = [
    {'Title': film, 'Liklihood of Winning': getprediction(films2017, film)}
    for film in films2017['Title'].tolist()
]

Predictions2017 = pd.DataFrame(predictions2017).sort_values('Liklihood of Winning', ascending=False)
Predictions2017

Unnamed: 0,Liklihood of Winning,Title
0,0.401277,The Shape of Water
1,0.321734,Darkest Hour
7,0.276217,"Three Billboards Outside Ebbing, Missouri"
4,0.061198,The Post
2,0.059952,Dunkirk
6,0.052956,Lady Bird
3,0.039513,Phantom Thread
8,0.03779,Get Out
5,0.028213,Call Me by Your Name
