In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import time
from pprint import pprint as pp
import sqlite3
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVC 

warnings.filterwarnings('ignore')

# importing movie table
movies = pd.read_csv("movie_metadata.csv")
movies.head()

# create revenue column
movies["revenue"] = movies["gross"] - movies["budget"]

# create gross margin column
movies["gross_margin"] = movies["revenue"] / movies["gross"]

# success/failure column
movies["sf"] = np.where(movies["gross_margin"] >= 0 , 1, 0)

movies.head(100)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,revenue,gross_margin,sf
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,523505847.0,0.688365,1
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,9404152.0,0.030394,1
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,-44925825.0,-0.224546,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,198130642.0,0.442127,1
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,12.0,7.1,,0,,,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000,-190641321.0,-2.609427,0
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0,78530303.0,0.233353,1
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,PG,260000000.0,2010.0,553.0,7.8,1.85,29000,-59192738.0,-0.294774,0
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000,208991599.0,0.455328,1
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000,51956980.0,0.172067,1


In [2]:
movies.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'revenue',
       'gross_margin', 'sf'],
      dtype='object')

In [3]:
modelDf = movies[["duration","facenumber_in_poster","content_rating","budget","revenue","gross_margin", "sf"]]
modelDf = modelDf.dropna()

# decided to declare approved as PG. Not Rated, Passed and Unrated declared as 0 or null

modelDf["rating_numeric"] = modelDf["content_rating"].map({"G":1,"TV-G":1,"PG":2,"TV-PG":2,
                                                           "GP":2,"PG-13":3,"TV-14":3,"R":4,"TV-MA":4,
                                                           "M":4,"NC-17":5,"X":5,"Not Rated":0,"Approved":2,
                                                           "Unrated":0,"Passed":0})
modelDf.head(100)
# modelDf["content_rating"].value_counts()

Unnamed: 0,duration,facenumber_in_poster,content_rating,budget,revenue,gross_margin,sf,rating_numeric
0,178.0,0.0,PG-13,237000000.0,523505847.0,0.688365,1,3
1,169.0,0.0,PG-13,300000000.0,9404152.0,0.030394,1,3
2,148.0,1.0,PG-13,245000000.0,-44925825.0,-0.224546,0,3
3,164.0,0.0,PG-13,250000000.0,198130642.0,0.442127,1,3
5,132.0,1.0,PG-13,263700000.0,-190641321.0,-2.609427,0,3
6,156.0,0.0,PG-13,258000000.0,78530303.0,0.233353,1,3
7,100.0,1.0,PG,260000000.0,-59192738.0,-0.294774,0,2
8,141.0,4.0,PG-13,250000000.0,208991599.0,0.455328,1,3
9,153.0,3.0,PG,250000000.0,51956980.0,0.172067,1,2
10,183.0,0.0,PG-13,250000000.0,80249062.0,0.242996,1,3


In [4]:
# Assign X (data) and y (target)
X = modelDf[["duration", "rating_numeric"]]
y = modelDf["sf"].values.reshape(-1, 1)
print(X.shape, y.shape)

(3834, 2) (3834, 1)


In [5]:
# split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1158)

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)
# y_scaler = StandardScaler().fit(y_train)

In [6]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
# y_train_scaled = y_scaler.transform(y_train)
# y_test_scaled = y_scaler.transform(y_test)

# Logistic Regression Model 

In [7]:
#build logistic model 
model = LogisticRegression()

#fit on training data
model.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Use our model to predict a value
predicted = model.predict(X_test_scaled)

# Score the prediction with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
testing_score = model.score(X_test_scaled,y_test)

print(f"testing_score: {testing_score}")
# pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# print(f"Mean Squared Error (MSE): {mse}")
# print(f"R-squared (R2 ): {r2}")

testing_score: 0.5172054223149114


# Support Vector Machine

In [9]:
#Support Vector Machine Model
model_SVC = SVC(kernel='linear')
model_SVC.fit(X_train_scaled, y_train)
predictions = model_SVC.predict(X_test_scaled)

In [10]:
con = sqlite3.connect("db/movies.db")

sql = f"""
   SELECT * FROM movie_data
   """

movie_data = pd.read_sql(sql, con)
movie_data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,gross_margin,successful,rating_numeric,director_score,actor_1_score,actor_2_score,actor_3_score,imdb_num_code,release_date,success_bins
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,0.688365,1,3,3.091853,1.520767,1.747604,2.066294,tt0499549,2009-12-10,extreme success
1,Color,Stephen Sommers,106.0,106.0,208.0,855.0,Jason Flemyng,3000.0,11146409.0,Action|Adventure|Horror|Sci-Fi,...,-3.037175,0,4,1.766773,3.041534,5.825346,2.066294,tt0118956,1998-01-30,no success
2,Color,Terrence Malick,222.0,150.0,0.0,855.0,Michael Greyeyes,23000.0,12712093.0,Biography|Drama|History|Romance,...,-1.359958,0,3,1.32508,8.364217,0.582535,2.066294,tt0402399,2005-12-25,no success
3,Color,Brian Robbins,76.0,98.0,48.0,722.0,Joel David Moore,21000.0,61112916.0,Comedy|Family|Fantasy,...,0.181842,1,2,2.65016,9.884984,1.747604,2.066294,tt0393735,2006-03-09,moderate success
4,Color,Brad Peyton,178.0,94.0,62.0,722.0,Dwayne Johnson,14000.0,103812241.0,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi,...,0.239011,1,2,1.32508,2.661342,1.747604,2.066294,tt1397514,2012-01-19,average success


In [None]:
# store dataframe in a sqlite db
con = sqlite3.connect("db/movies.db")
movies.to_sql("movie_data", con, if_exists="replace", index=False)
# commit the changes and close the connection
con.commit()
con.close()