In [1]:
#import dependencies

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import time
from pprint import pprint as pp
import sqlite3
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVC, SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense



In [2]:
# create connection to database
warnings.filterwarnings('ignore')

con = sqlite3.connect("db/movies.db")

sql = f"""
   SELECT * FROM movie_data
   """

# bring in db to pandas dataframe
movieDf = pd.read_sql(sql, con)
movieDf.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,gross_margin,successful,rating_numeric,director_score,actor_1_score,actor_2_score,actor_3_score,imdb_num_code,release_date,success_bins
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,0.688365,1,3,3.091853,1.520767,1.747604,2.066294,tt0499549,2009-12-10,extreme success
1,Color,Stephen Sommers,106.0,106.0,208.0,855.0,Jason Flemyng,3000.0,11146409.0,Action|Adventure|Horror|Sci-Fi,...,-3.037175,0,4,1.766773,3.041534,5.825346,2.066294,tt0118956,1998-01-30,no success
2,Color,Terrence Malick,222.0,150.0,0.0,855.0,Michael Greyeyes,23000.0,12712093.0,Biography|Drama|History|Romance,...,-1.359958,0,3,1.32508,8.364217,0.582535,2.066294,tt0402399,2005-12-25,no success
3,Color,Brian Robbins,76.0,98.0,48.0,722.0,Joel David Moore,21000.0,61112916.0,Comedy|Family|Fantasy,...,0.181842,1,2,2.65016,9.884984,1.747604,2.066294,tt0393735,2006-03-09,moderate success
4,Color,Brad Peyton,178.0,94.0,62.0,722.0,Dwayne Johnson,14000.0,103812241.0,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi,...,0.239011,1,2,1.32508,2.661342,1.747604,2.066294,tt1397514,2012-01-19,average success


In [3]:
# review columns
movieDf.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'revenue',
       'gross_margin', 'successful', 'rating_numeric', 'director_score',
       'actor_1_score', 'actor_2_score', 'actor_3_score', 'imdb_num_code',
       'release_date', 'success_bins'],
      dtype='object')

In [4]:
# create table for model purposes
modelDf = movieDf[["duration","facenumber_in_poster","rating_numeric","budget","revenue","gross_margin", "successful","director_score","actor_1_score","actor_2_score","actor_3_score"]]
modelDf = modelDf.dropna()

modelDf.head(100)

Unnamed: 0,duration,facenumber_in_poster,rating_numeric,budget,revenue,gross_margin,successful,director_score,actor_1_score,actor_2_score,actor_3_score
0,178.0,0.0,3,237000000.0,523505847.0,0.688365,1,3.091853,1.520767,1.747604,2.066294
1,106.0,0.0,4,45000000.0,-33853591.0,-3.037175,0,1.766773,3.041534,5.825346,2.066294
2,150.0,0.0,3,30000000.0,-17287907.0,-1.359958,0,1.325080,8.364217,0.582535,2.066294
3,98.0,0.0,2,50000000.0,11112916.0,0.181842,1,2.650160,9.884984,1.747604,2.066294
4,94.0,3.0,2,79000000.0,24812241.0,0.239011,1,1.325080,2.661342,1.747604,2.066294
5,146.0,4.0,4,100000000.0,-4671063.0,-0.048999,0,0.441693,0.760383,0.582535,2.066294
6,90.0,0.0,3,25000000.0,-6139597.0,-0.325528,0,2.650160,0.380192,1.747604,0.688765
7,121.0,0.0,4,83000000.0,-16137932.0,-0.241362,0,2.208466,1.520767,2.912673,2.066294
8,101.0,1.0,4,42000000.0,-28791977.0,-2.179885,0,0.883387,0.760383,0.582535,2.066294
9,110.0,1.0,4,45000000.0,25001065.0,0.357153,1,0.883387,0.760383,0.582535,2.066294


In [7]:
# Assign X (data) and y (target)
X = modelDf[["duration", "rating_numeric","director_score","actor_1_score","actor_2_score","actor_3_score"]]
y = modelDf["successful"].values.reshape(-1, 1)
print(X.shape, y.shape)

(3738, 6) (3738, 1)


In [8]:
# split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1158, train_size=0.9)

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

In [9]:
# transform the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Logistic Regression Model 

In [10]:
#build logistic model 
model = LogisticRegression()

#fit on training data
model.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Use our model to predict a value
predicted = model.predict(X_test_scaled)

# Score the prediction with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
testing_score = model.score(X_test_scaled,y_test)

print(f"testing_score: {testing_score}")
# pd.DataFrame({"Prediction": predictions, "Actual": y_test})
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

testing_score: 0.5561497326203209
Mean Squared Error (MSE): 0.44385026737967914
R-squared (R2 ): -0.7795230451731254


# Deep Neural Network

In [12]:
# creating the empty squential network
modelNN = Sequential()

In [13]:
# Add the first layer where the input dimensions are the 6 columns of the training data
modelNN.add(Dense(units=18, activation='relu', input_dim=6))

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
# Add the hidden layer
modelNN.add(Dense(units=18, activation='relu'))

In [15]:
# Add output layer 
modelNN.add(Dense(units=2, activation='softmax'))

In [16]:
# Compile the model using the adaptive learning rate optimizer "adam", spare_categorical_crossentropy
# for the loss function since we did not one-hot encode the labels and used accuracy for the training metrics.
modelNN.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
# Use the training data to fit (train) the model
modelNN.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 0s - loss: 0.7059 - acc: 0.5161
Epoch 2/100
 - 0s - loss: 0.6889 - acc: 0.5312
Epoch 3/100
 - 0s - loss: 0.6857 - acc: 0.5446
Epoch 4/100
 - 0s - loss: 0.6831 - acc: 0.5550
Epoch 5/100
 - 0s - loss: 0.6814 - acc: 0.5547
Epoch 6/100
 - 0s - loss: 0.6800 - acc: 0.5639
Epoch 7/100
 - 0s - loss: 0.6796 - acc: 0.5618
Epoch 8/100
 - 0s - loss: 0.6792 - acc: 0.5606
Epoch 9/100
 - 0s - loss: 0.6771 - acc: 0.5696
Epoch 10/100
 - 0s - loss: 0.6765 - acc: 0.5696
Epoch 11/100
 - 0s - loss: 0.6761 - acc: 0.5707
Epoch 12/100
 - 0s - loss: 0.6753 - acc: 0.5675
Epoch 13/100
 - 0s - loss: 0.6743 - acc: 0.5705
Epoch 14/100
 - 0s - loss: 0.6732 - acc: 0.5794
Epoch 15/100
 - 0s - loss: 0.6727 - acc: 0.5746
Epoch 16/100
 - 0s - loss: 0.6733 - acc: 0.5710
Epoch 17/100
 - 0s - loss: 0.6725 - acc: 0.5770
Epoch 18/100
 - 0s - loss: 0.6715 - acc: 0.5817
Epoch 19/100
 - 0s - loss: 0.6706 - acc: 0.5803
Epoch 20/100
 - 0s - loss: 0.6705 - acc: 0.5844
Epoch 21/100
 - 0s - loss: 0.6701 - acc: 0.5829
E

<tensorflow.python.keras.callbacks.History at 0x1a2b71c320>

In [18]:
# evaluate model by using test data
model_loss, model_accuracy = modelNN.evaluate(
    X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 0.6855 - acc: 0.5642
Loss: 0.6854826613543505, Accuracy: 0.5641711354255676


# Grid Search / Cross Validation

In [19]:
# import dependencies from sklearn, build model using "rbf" kernel for the support vector classifier and bring in
# hyperparameters "C" and "gamma"

from sklearn.model_selection import GridSearchCV
modelGS = SVC(kernel='linear')
param_grid = {'C': [.01, 1, 10, 100, 1000],
              'gamma': [0.0001, 0.001, 0.01,]}
grid = GridSearchCV(modelGS, param_grid, verbose=3)

In [None]:
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ................ C=0.01, gamma=0.0001, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ................ C=0.01, gamma=0.0001, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.0001 ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ................ C=0.01, gamma=0.0001, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.001 .............................................
[CV] ................. C=0.01, gamma=0.001, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.001 .............................................
[CV] ................. C=0.01, gamma=0.001, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.001 .............................................
[CV] ................. C=0.01, gamma=0.001, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.01 ..............................................
[CV] .................. C=0.01, gamma=0.01, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.01 ..............................................
[CV] .................. C=0.01, gamma=0.01, score=0.540, total=   0.1s
[CV] C=0.01, gamma=0.01 ..............................................
[CV] .................. C=0.01, gamma=0.01, score=0.540, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] .

In [24]:
# Linear SVC - best parameters and score for the data
print(grid.best_params_)
print(grid.best_score_)

{'C': 1000, 'gamma': 0.005}
0.5633174791914387


In [None]:
modelGS2 = SVC(kernel='rbf')
param_grid = {'C': [.01, 1, 10, 100, 1000],
              'gamma': [0.0001,0.0005, 0.001,0.005, 0.01, 0.05]}
grid2 = GridSearchCV(modelGS2, param_grid, verbose=3)

In [None]:
grid2.fit(X_train_scaled, y_train)

In [None]:
# store dataframe in a sqlite db
con = sqlite3.connect("db/movies.db")
movies.to_sql("movie_data", con, if_exists="replace", index=False)
# commit the changes and close the connection
con.commit()
con.close()