# Prediction

Having collected, cleaned and organized the data with information about ratings, cast, genre and past Oscar performances, we will now try to predict the winners and nominees. The approach that we take is the following. 

Because we have seen that a movie almost has to be in one of the three major categories -- drama, romance, comedy or biograpjy -- we will filter our data to only consider the movie that fall into oine of thes egenres. 
We will then use a variety of different models to predict the probabibility of win for the different movies. The movies will be sorted in descending order of win probabilities. 

Finally, we will take a aweighted average of al the prediction, weighted by the recall score, and use the grand average as our final prediction. 

In [1]:
# import necessary libraries
import pandas as pd
import collections
import numpy as np
import requests
import wikipedia
import re
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from urllib.request import urlopen
from scrapy import selector
import datetime as dt
import pickle
from skimage import io
from IPython.display import clear_output
%matplotlib inline

# scikit learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor  
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score


In [2]:
DF_main = pd.read_csv('my_data/df_main_FINAL.csv', index_col=[0])
all_genres = pickle.load(open("my_data/all_genres_FINAL","rb"))
oscar_categories = pickle.load(open("my_data/major_oscar_categories","rb"))
oscar_genres_columns = ['sci-fi', 'family', 'adventure', 'crime', 
                'biography', 'history', 'musical', 'romance', 'sport', 'comedy', 
                'horror', 'drama', 'mystery', 'war', 'thriller']



ID_columns = ['imdbID', 'title', 'year']
basic_columns = ['cast_size', 'genre_span']

    
print("Oscar genres:\n",oscar_genres_columns)
print("Oscar categories:\n", oscar_categories)



scores = ['n_votes','imdb_rating', 'metscore', 'rotten_tomatoes']
awards_pre = ['precount_wins', 'precount_noms', 'other_wins', 'other_noms']
awards_post = ['win', 'nom']

predictor_columns = basic_columns + oscar_genres_columns + scores + awards_pre

print("Predictor columns:\n",predictor_columns)
target_columns = ['win', 'nom']
filters = ((DF_main.drama==1)|((DF_main.comedy==1)|(DF_main.romance==1)|(DF_main.biography==1)))&(DF_main.action==0)
DF_main = DF_main[filters]


Oscar genres:
 ['sci-fi', 'family', 'adventure', 'crime', 'biography', 'history', 'musical', 'romance', 'sport', 'comedy', 'horror', 'drama', 'mystery', 'war', 'thriller']
Oscar categories:
 ['picture', 'director', 's_actor', 's_actress', 'actor', 'actress', 'screenplay']
Predictor columns:
 ['cast_size', 'genre_span', 'sci-fi', 'family', 'adventure', 'crime', 'biography', 'history', 'musical', 'romance', 'sport', 'comedy', 'horror', 'drama', 'mystery', 'war', 'thriller', 'n_votes', 'imdb_rating', 'metscore', 'rotten_tomatoes', 'precount_wins', 'precount_noms', 'other_wins', 'other_noms']


In [3]:
print(f'{len(DF_main.query("year == 2019"))} 2019 movies are being considered:\n')
# print(list(DF_main.query("year == 2019").title))

192 2019 movies are being considered:



In [4]:
def convert_to_pctile(X, columns):
    
    for col in columns:
        x = np.array(X[col])
        X[col] = [(len(np.where(x<=y)[0])/len(x)) for y in x]
    return X

def normalize_by_max(X, columns):
    for col in columns:
        x = np.array(X[col])
        mx = np.max(x)
        X[col] = x/mx
    return X

def top_N_each_year(df,N,feature):
    df_ = pd.DataFrame()
    for year in list(set(df.year)):
        # print(year)
        df_ = df_.append(df[df.year == year].sort_values(by = feature, ascending=False).head(N))
    return df_

def normalize_by_year(df, columns, _how = 'max'):
    
    if type(columns)!= list:
        print("columns must be list")
        return dict()
    years = list(set(df.year))

    print(years)
    
    if _how == 'pctile':
        df = convert_to_pctile(df, columns)
        return df

    for column in columns:
        cols = ['year'] + [column]
        for year in years:
            if year%10 == 0:
                clear_output() 
            print(year, column)
            temp_df = df[(df.year == year)]
            temp_df = temp_df[column]
            ids = temp_df.index
            
            if _how == 'max':
                df.loc[ids,column] = df.loc[ids,column]/temp_df.max()
            if _how == 'minmax':
                min_max_scaler = preprocessing.MinMaxScaler()
                vals = df.loc[ids,column].values
                # vals.shape = (len(vals),1)
                print(vals.shape)
                vals = vals[:,np.newaxis]
                print(vals.shape)
                scaled_array = min_max_scaler.fit_transform(vals)
                df.loc[ids,column] = scaled_array
            
                
    return df

In [5]:
print(predictor_columns)

['cast_size', 'genre_span', 'sci-fi', 'family', 'adventure', 'crime', 'biography', 'history', 'musical', 'romance', 'sport', 'comedy', 'horror', 'drama', 'mystery', 'war', 'thriller', 'n_votes', 'imdb_rating', 'metscore', 'rotten_tomatoes', 'precount_wins', 'precount_noms', 'other_wins', 'other_noms']


In [6]:
print(DF_main.columns)
normalize_columns = ['n_votes', 'imdb_rating', 'metscore', 'rotten_tomatoes',
                     'cast_size', 'genre_span', 'cast_size', 'running_time']
DF = normalize_by_year(DF_main, predictor_columns, 'max')
DF.fillna('0',inplace=True)


2010 other_noms
2011 other_noms
2012 other_noms
2013 other_noms
2014 other_noms
2015 other_noms
2016 other_noms
2017 other_noms
2018 other_noms
2019 other_noms


In [7]:
# DF = DF[(DF.year >=1960)&(DF.year < 2019)]
# DFX = DF.loc[:,ID_columns + predictor_columns + target_columns]
# print(ID_columns + predictor_columns + target_columns)
# DFX.columns
# DFX.info()

In [8]:
DFX_2019 = DF[DF.year == 2019]
DFX_2019 = DFX_2019.loc[:,ID_columns + predictor_columns + target_columns]
X_2019 = DFX_2019[predictor_columns]
print("X_2019 shape:",X_2019.shape)

X_2019 shape: (192, 25)


In [9]:
# Pre 2019 (predictor) movies

# Predictor matrix
DF1 = DF[(DF.year >=1960)&(DF.year < 2019)]
DFX = DF1.loc[:,ID_columns + predictor_columns + target_columns]
X = DFX[predictor_columns]
print("X shape:",X.shape)


#Target matrix
awards_map = {'W':1, 'N':1, 'WN':1, 'O':0}
category = 'picture'
y = DF1[category].map(awards_map).values
y = y[:,np.newaxis]
print("y shape:", y.shape)


# y = (DFX.win > 1)|(DFX.nom>3) # 2 wins of 4 nominations
# y = np.array([int(z) for z in y])
# DFX['target'] = y
# y = y[:,np.newaxis]
# print("y shape:", y.shape)

# Now for 2019 (target) movies
DFX_2019 = DF[DF.year == 2019]
DFX_2019 = DFX_2019.loc[:,ID_columns + predictor_columns + target_columns]
X_2019 = DFX_2019[predictor_columns]
print("X_2019 shape:",X_2019.shape)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify = y)
print("Class 0:", len(y_train[np.where(y_train == 0)]))
print("Class 1:", len(y_train[np.where(y_train == 1)]))
print("2019:", )



X shape: (8981, 25)
y shape: (8981, 1)
X_2019 shape: (192, 25)
Class 0: 6082
Class 1: 204
2019:


In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8981 entries, 0 to 13933
Data columns (total 25 columns):
cast_size          8981 non-null float64
genre_span         8981 non-null float64
sci-fi             8981 non-null object
family             8981 non-null object
adventure          8981 non-null float64
crime              8981 non-null object
biography          8981 non-null float64
history            8981 non-null object
musical            8981 non-null float64
romance            8981 non-null float64
sport              8981 non-null object
comedy             8981 non-null float64
horror             8981 non-null object
drama              8981 non-null float64
mystery            8981 non-null object
war                8981 non-null object
thriller           8981 non-null object
n_votes            8981 non-null float64
imdb_rating        8981 non-null float64
metscore           8981 non-null float64
rotten_tomatoes    8981 non-null float64
precount_wins      8981 non-null float64

# Generic Network

In [11]:
def classifier(classifier_dict, random_state = 47):
    
    name = classifier_dict['classifier']
    model = classifier_dict['classifier'](random_state = random_state)
    if 'hyperparameters' in classifier_dict:
        hyperparams = classifier_dict['hyperparameters']['params']
        cv = classifier_dict['hyperparameters']['cv']
        model = GridSearchCV(model, hyperparameters, cv=cv, verbose=0)
        return model
    

In [12]:
# Classifier definition for logistic regression
classifier_dict = {'name':'LogisticRegression'}
# classifier_dict['name'] = 'LogisticRegression'
classifier_dict['classifier'] = LogisticRegression

# Hyperparameters
classifier_dict['hyperparameters'] = {}
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)
cv = 5
classifier_dict['hyperparameters']['params'] = hyperparameters
classifier_dict['hyperparameters']['cv'] = cv
# Penalty
classifier_dict['penalty'] = ['l1', 'l2']

classifier(classifier_dict, random_state = 29)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=29, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
     

In [13]:
model = classifier(classifier_dict, random_state = 29)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
x = x.iloc[::-1,:]
print(model.best_params_)

NameError: name 'x' is not defined

In [None]:
name = []
accuracy = []
recall = []
precision = []

# Logistic Regression

Because we are interested in obtaining probabilities for each film earning a nomination, and inferring the winner from these probabilities, the only linear model that was considered was Logistic Regression. Because Logistic regression uses the logistic function to model a binary dependent variable, the output of the model can be naturally interpreted as the probability of nomination or not. 

In [None]:
log = LogisticRegression(random_state=0)
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)
log_cv = GridSearchCV(log, hyperparameters, cv=5, verbose=0)
log_cv.fit(X, y)
y_pred = log_cv.predict(X_2019)
y_prob = log_cv.predict_proba(X_2019)[:,1]
DFX_2019.loc[:,'predicted_probability_log'] = y_prob
x = DFX_2019.sort_values(by='predicted_probability_log', ascending=False).head(10)
clear_output()

### Performance evaluation
log_cv.fit(X_train, y_train)
y_pred = log_cv.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
x = x.iloc[::-1,:]
print(log_cv.best_params_)

plt.barh(x['title'], x['predicted_probability_log'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

acc_score = log_cv.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('logreg')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)

In [None]:
# name.append('log')
# accuracy.append(0.96)
# recall.append(0.25)
# precision.append(0.58)
# print(name, accuracy, recall, precision)

## K Nearest neighbors classifier 
K nearest neighbor is a nonlinear classifier that provided straightforward approach to classify movies based on their proximity to previous winners and nominees. We used GridSearchCV() method from the utils package of scikit-learn module. Although several variations were available, we only performed grid search on the number parameters, whose optimal value was found to be 15. 

In [None]:
# KNN Training
knn = KNeighborsClassifier(n_neighbors=5)
param_grid = {'n_neighbors': np.arange(1, 30)} 
knn_cv = GridSearchCV(knn, param_grid, cv=5) 
# knn.fit(X, y) 
knn_cv.fit(X, y)

# KNN prediction
y_pred = knn_cv.predict(X_2019)
y_prob = knn_cv.predict_proba(X_2019)
DFX_2019.loc[:,'predicted_probability_knn'] = y_prob[:,1]
x = DFX_2019.sort_values(by='predicted_probability_knn', ascending=False).head(10)
clear_output()


### Performance evaluation
knn_cv.fit(X_train, y_train)
y_pred = knn_cv.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
x = x.iloc[::-1,:]
print(knn_cv.best_params_)
print(knn_cv.score(X, y))
plt.barh(x['title'], x['predicted_probability_knn'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


acc_score = knn_cv.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('knn')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)


# Random Forest Classifier

In [None]:
rfc=RandomForestClassifier() # n_estimators=100)
param_grid = {'n_estimators': np.arange(50,100,5)} 
rfc_cv = GridSearchCV(rfc, param_grid, cv=5) 

rfc_cv.fit(X,y)
y_pred = rfc_cv.predict(X_2019)
y_prob = rfc_cv.predict_proba(X_2019)
DFX_2019.loc[:,'predicted_probability_rfc'] = y_prob[:,1]
x = DFX_2019.sort_values(by='predicted_probability_rfc', ascending=False).head(10)

### Performance evaluation
rfc_cv.fit(X_train, y_train)
y_pred = rfc_cv.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
x = x.iloc[::-1,:]
print(rfc_cv.best_params_)
plt.barh(x['title'], x['predicted_probability_rfc'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


acc_score = rfc_cv.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('rfc')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)



# Ensemble Methods: Bag of KNN

In [None]:
bag = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=10), n_estimators=100, random_state=42)
bag.fit(X, y)
y_pred = bag.predict(X_2019)
y_prob = bag.predict_proba(X_2019)
DFX_2019.loc[:,'predicted_probability_bag'] = y_prob[:,1]
clear_output()
x = DFX_2019.sort_values(by='predicted_probability_bag', ascending=False).head(10)


### Performance evaluation
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
x = x.iloc[::-1,:]
# print(bag.best_params_)
plt.barh(x['title'], x['predicted_probability_bag'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


acc_score = bag.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('bag')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)


# Adaptive Boosting (Adaboost)

In [None]:
ada = AdaBoostClassifier(n_estimators=100, random_state=47)
ada.fit(X, y)
y_pred = ada.predict(X_2019)
y_prob = ada.predict_proba(X_2019)

DFX_2019.loc[:,'predicted_probability_ada'] = y_prob[:,1]
clear_output()
print(ada.score(X,y))
x = DFX_2019.sort_values(by='predicted_probability_ada', ascending=False).head(10)
# print(bag.best_params_)
x = x.iloc[::-1,:]


### Performance evaluation
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
# print(ada.best_params_)
plt.barh(x['title'], x['predicted_probability_ada'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

acc_score = ada.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('ada')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)


# Neural Network

In [None]:
mlp = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,10), max_iter=1000, random_state=41)
mlp.fit(X, y)
y_pred = mlp.predict(X_2019)
y_prob = mlp.predict_proba(X_2019)
# print(y_prob)
# print(classifier.score)
DFX_2019.loc[:,'predicted_probability_mlp'] = y_prob[:,1]
clear_output()
print(mlp.score(X, y))
x = DFX_2019.sort_values(by='predicted_probability_mlp', ascending=False).head(10)
x = x.iloc[::-1,:]


### Performance evaluation
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
plt.barh(x['title'], x['predicted_probability_mlp'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

acc_score = mlp.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('mlp')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)

# Ensemble of Neural Networks

In [None]:
mlp = MLPClassifier(solver='sgd')
mlp_bag = BaggingClassifier(base_estimator=mlp, n_estimators=100, random_state=42).fit(X, y)
y_pred = mlp_bag.predict(X_2019)
y_prob = mlp_bag.predict_proba(X_2019)
DFX_2019.loc[:,'predicted_probability_mlpbag'] = y_prob[:,1]
x = DFX_2019.sort_values(by='predicted_probability_mlpbag', ascending=False).head(10)
x = x.iloc[::-1,:]


### Performance evaluation
mlp_bag.fit(X_train, y_train)
y_pred = mlp_bag.predict(X_test)
target_names = ['class 0', 'class 1']
clear_output() 
plt.barh(x['title'], x['predicted_probability_mlpbag'])
plt.xlabel('probability')
print(classification_report(y_test, y_pred, target_names=target_names))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

acc_score = mlp_bag.score(X, y)
prec_score = precision_score(y_test, y_pred, labels= 'class 1')
print("Accuracy:", acc_score)
print("Precision:",prec_score)
name.append('mlp_bag')
precision.append(prec_score)
accuracy.append(acc_score)
print(name, accuracy, precision)

In [None]:
print(name)
print(accuracy)
print(precision)
print(recall)


In [None]:
performance_df = pd.DataFrame({'classifier': name, 'accuracy':accuracy, 'precision':precision})
performance_df.set_index('classifier', inplace=True)
performance_df
pickle.dump(performance_df,open('my_data/performance_df', "wb" ))
print(performance_df)

# Grand Average Prediction

In [None]:


field = 'accuracy'
DFX_2019['final_proba'] = 1
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_log']*performance_df.loc['logreg',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_knn']*performance_df.loc['knn',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_rfc']*performance_df.loc['rfc',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_bag']*performance_df.loc['bag',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_ada']*performance_df.loc['ada',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_mlp']*performance_df.loc['mlp',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_mlpbag']*performance_df.loc['mlp_bag',field] 



 

DFX_2019.columns
x = DFX_2019.sort_values(by='final_proba', ascending=False).head(10)
# print(bag.best_params_)
# x = x.iloc[::-1,:]
plt.bar(x['title'], x['final_proba'])
plt.xticks(rotation=90)
plt.ylabel('predicted\nprobability')
plt.xticks(rotation=45, horizontalalignment = 'right')



In [None]:


field = 'precision'
DFX_2019['final_proba'] = 1
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_log']*performance_df.loc['logreg',field] 
# DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_knn']*performance_df.loc['knn',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_rfc']*performance_df.loc['rfc',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_bag']*performance_df.loc['bag',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_ada']*performance_df.loc['ada',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_mlp']*performance_df.loc['mlp',field] 
DFX_2019['final_proba'] = DFX_2019['final_proba'] * DFX_2019['predicted_probability_mlpbag']*performance_df.loc['mlp_bag',field] 





DFX_2019.columns
x = DFX_2019.sort_values(by='final_proba', ascending=False).head(10)
# print(bag.best_params_)
# x = x.iloc[::-1,:]
plt.bar(x['title'], x['final_proba'])
plt.xticks(rotation=90)
plt.ylabel('predicted\nprobability')
plt.xticks(rotation=45, horizontalalignment = 'right')





# Deep Learning Prediction

In [None]:
print(X.shape)
print(y.shape)
print(type(X.values))
print(type(np.array(y)))
len(y[np.where(y==0)])
y[1:10]
X.values[3,]

In [None]:

# model = Sequential()
# model.add(Dense(32, input_dim=784))
# model.add(Activation('relu'))

import tensorflow as tf
import tensorflow.keras.backend as K

X_k = K.constant(X.values)
X_2019_k = K.constant(X_2019.values)
y_k = K.constant(y)

ndim = X.shape[1]
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(36, activation = tf.nn.relu, input_dim=ndim))
model.add(tf.keras.layers.Dense(36, activation = tf.nn.relu))
model.add(tf.keras.layers.Dense(2, activation = tf.nn.softmax))
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])
# model.fit(X, y, epochs = 5)
model.fit(X_k, y_k, epochs = 5)
y_pred = model.predict([X_2019_k])

             

In [None]:
y_prob = y_pred
DFX_2019.loc[:,'predicted_probability_deep'] = y_prob[:,1]
x = DFX_2019.sort_values(by='predicted_probability_deep', ascending=False).head(10)
x = x.iloc[::-1,:]

## Performance evaluation
plt.barh(x['title'], x['predicted_probability_deep'])
plt.xlabel('probability')


In [None]:
y_pred = mlp_bag.predict(X_2019)
