In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
import sqlite3


In [2]:
# create connection to database

con = sqlite3.connect("db/movies.db")
sql = f"""
   SELECT * FROM movie_data
   """

# bring in db to pandas dataframe
movies = pd.read_sql(sql, con)
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,production_companies_count,genres_count,popularity,vote_count,vote_average,total_actor_starpower,release_month,holiday_month,tot_noms,tot_wins
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,4.0,4,185.070892,12114.0,7.2,5.334665,12,1,0,0
1,Color,Stephen Sommers,106.0,106.0,208.0,855.0,Jason Flemyng,3000.0,11146409.0,Action|Adventure|Horror|Sci-Fi,...,3.0,4,6.922458,155.0,6.0,10.933174,1,0,0,0
2,Color,Terrence Malick,222.0,150.0,0.0,855.0,Michael Greyeyes,23000.0,12712093.0,Biography|Drama|History|Romance,...,5.0,3,7.694502,336.0,6.4,11.013046,12,1,0,0
3,Color,Brian Robbins,76.0,98.0,48.0,722.0,Joel David Moore,21000.0,61112916.0,Comedy|Family|Fantasy,...,3.0,2,4.878907,138.0,4.5,13.698882,3,0,1,0
4,Color,Brad Peyton,178.0,94.0,62.0,722.0,Dwayne Johnson,14000.0,103812241.0,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi,...,3.0,3,9.46307,1050.0,5.8,6.47524,1,0,0,0


# Testing models:
## Prepping the data to be tested by the QDA model

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import mean_squared_error, r2_score, roc_curve, roc_auc_score, f1_score, precision_recall_curve, average_precision_score, auc


In [4]:
# review columns
for col in movies.columns.sort_values().values:
    print(col)

actor_1_facebook_likes
actor_1_name
actor_1_score
actor_2_facebook_likes
actor_2_name
actor_2_score
actor_3_facebook_likes
actor_3_name
actor_3_score
aspect_ratio
budget
cast_total_facebook_likes
color
content_rating
country
director_facebook_likes
director_name
director_score
duration
facenumber_in_poster
genres
genres_count
gross
gross_margin
holiday_month
imdb_num_code
imdb_score
language
movie_facebook_likes
movie_imdb_link
movie_title
num_critic_for_reviews
num_user_for_reviews
num_voted_users
plot_keywords
popularity
production_companies_count
rating_numeric
release_date
release_month
revenue
success_bins
successful
title_year
tot_noms
tot_wins
total_actor_starpower
vote_average
vote_count


In [5]:
# create table for model purposes
# Note that I haven't included any predictors that won't be known before a movie release
movies = movies[["duration","facenumber_in_poster","rating_numeric","gross_margin", 
                   "successful","director_score","production_companies_count","total_actor_starpower","release_month",
                   "holiday_month","tot_wins","tot_noms", "total_actor_starpower", "vote_average", "vote_count"]]
movies = movies.dropna()
movies.head(10)


Unnamed: 0,duration,facenumber_in_poster,rating_numeric,gross_margin,successful,director_score,production_companies_count,total_actor_starpower,release_month,holiday_month,tot_wins,tot_noms,total_actor_starpower.1,vote_average,vote_count
0,178.0,0.0,3,0.688365,1,3.091853,4.0,5.334665,12,1,0,0,5.334665,7.2,12114.0
1,106.0,0.0,4,-3.037175,0,1.766773,3.0,10.933174,1,0,0,0,10.933174,6.0,155.0
2,150.0,0.0,3,-1.359958,0,1.32508,5.0,11.013046,12,1,0,0,11.013046,6.4,336.0
3,98.0,0.0,2,0.181842,1,2.65016,3.0,13.698882,3,0,0,1,13.698882,4.5,138.0
4,94.0,3.0,2,0.239011,1,1.32508,3.0,6.47524,1,0,0,0,6.47524,5.8,1050.0
5,146.0,4.0,4,-0.048999,0,0.441693,4.0,3.409212,5,0,1,2,3.409212,5.5,434.0
6,90.0,0.0,3,-0.325528,0,2.65016,1.0,2.81656,9,0,0,0,2.81656,4.9,134.0
7,121.0,0.0,4,-0.241362,0,2.208466,1.0,6.499734,11,1,0,0,6.499734,5.5,488.0
8,101.0,1.0,4,-2.179885,0,0.883387,8.0,3.409212,8,0,0,0,3.409212,3.2,106.0
9,110.0,1.0,4,0.357153,1,0.883387,4.0,3.409212,8,0,0,0,3.409212,6.5,1950.0


In [6]:
movies["successful"].value_counts()

1    1950
0    1687
Name: successful, dtype: int64

# Set features (X) and y, create model, make train and testing data

In [20]:
# Assign X (data) and y (target)
# Change X if you want to test other parameters; this set of 9 is the final set our group decided to test
X = movies[['duration','rating_numeric', 'director_score','production_companies_count',
            "holiday_month","tot_wins","tot_noms", "total_actor_starpower"]]


y = movies["successful"].values.reshape(-1, 1)
print(X.shape, y.shape)



(3637, 9) (3637, 1)


In [21]:
# split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1158, train_size=0.80, stratify=y)

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

  return self.partial_fit(X, y)


In [22]:
pd.DataFrame(y_test)[0].value_counts()

1    390
0    338
Name: 0, dtype: int64

In [23]:
# transform the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  
  This is separate from the ipykernel package so we can avoid doing imports until


# Trying the PCA analysis with QDA model

In [24]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets
from sklearn.svm import SVC

In [25]:
# define model; CHANGE FOR YOUR MODEL
# for SVC:
# modelSVC = SVC(C=5, gamma=0.1, kernel='linear')

model = QuadraticDiscriminantAnalysis()


In [26]:
# Function to test PCA with a range of component values

for n in range(2,10):
    
    pca = decomposition.PCA(n_components = n)
    pca.fit(X_train_scaled, y_train)
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    model.fit(X_train_pca, y_train)
    print("n="+str(n),model.score(X_test_pca, y_test),"")

n=2 0.5233516483516484 
n=3 0.5453296703296703 
n=4 0.5494505494505495 
n=5 0.5508241758241759 
n=6 0.5590659340659341 
n=7 0.5631868131868132 
n=8 0.5604395604395604 
n=9 0.5576923076923077 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [27]:
pca.explained_variance_

array([2.95628000e+00, 1.24169826e+00, 1.14192284e+00, 1.02345283e+00,
       9.62340975e-01, 8.14366245e-01, 6.83859691e-01, 1.79174066e-01,
       5.55092382e-32])

In [28]:
X_train_scaled

array([[-0.04891249,  0.86840406,  0.59554403, ...,  0.46143871,
         0.32595804,  0.32595804],
       [ 0.39352046,  0.86840406, -0.60204865, ..., -0.44615102,
         0.12671206,  0.12671206],
       [-0.49134544, -0.24005904, -0.84156719, ..., -0.44615102,
        -0.80268005, -0.80268005],
       ...,
       [ 0.34927716,  0.86840406, -0.12301158, ..., -0.44615102,
         0.82302606,  0.82302606],
       [-0.75680521,  0.86840406, -0.12301158, ...,  0.00764385,
        -0.59627092, -0.59627092],
       [-0.57983203,  0.86840406, -0.84156719, ..., -0.44615102,
         1.02943519,  1.02943519]])

In [29]:
import numpy as np

In [30]:
i =np.identity(len(X_train_scaled[0]))
i

array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [31]:
pd.DataFrame(pca.transform(i))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.210989,-0.497175,0.365514,0.194074,-0.106053,0.06274,0.722122,0.019785,2.523175e-18
1,0.052545,-0.334212,-0.395914,0.248189,-0.699497,-0.380777,-0.181405,7e-06,1.3002e-16
2,0.227009,-0.495734,0.143928,-0.045869,-0.04932,0.629701,-0.529401,-0.025973,-1.860102e-16
3,0.083859,-0.237689,-0.117461,0.664398,0.612942,-0.25736,-0.195743,0.031652,-7.293738000000001e-17
4,0.128889,-0.12267,0.659488,-0.274216,0.005025,-0.590968,-0.330066,-0.002931,4.58092e-17
5,0.429575,0.403729,0.171517,0.285598,-0.194036,0.101399,-0.067587,0.699274,-5.491701e-17
6,0.439577,0.387499,0.15438,0.306386,-0.158196,0.065334,-0.031499,-0.713179,-3.119788e-17
7,0.499891,-0.073045,-0.305747,-0.321384,0.171208,-0.110728,0.079205,0.012554,-0.7071068
8,0.499891,-0.073045,-0.305747,-0.321384,0.171208,-0.110728,0.079205,0.012554,0.7071068


In [32]:
# These column names are equivalent to rows 0-9 in pca.transform above
movies.columns

Index(['duration', 'facenumber_in_poster', 'rating_numeric', 'gross_margin',
       'successful', 'director_score', 'production_companies_count',
       'total_actor_starpower', 'release_month', 'holiday_month', 'tot_wins',
       'tot_noms', 'total_actor_starpower', 'vote_average', 'vote_count'],
      dtype='object')