In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_palette('husl')

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import sqlite3


# Data: movie_data from sqllite (Angie's experimentation with So Jung's code)

In [4]:
# create connection to database

con = sqlite3.connect("db/movies.db")
sql = f"""
   SELECT * FROM movie_data
   """

# bring in db to pandas dataframe
movies = pd.read_sql(sql, con)
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,production_companies_count,genres_count,popularity,vote_count,vote_average,total_actor_starpower,release_month,holiday_month,tot_noms,tot_wins
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,4.0,4,185.070892,12114.0,7.2,5.334665,12,1,0,0
1,Color,Stephen Sommers,106.0,106.0,208.0,855.0,Jason Flemyng,3000.0,11146409.0,Action|Adventure|Horror|Sci-Fi,...,3.0,4,6.922458,155.0,6.0,10.933174,1,0,0,0
2,Color,Terrence Malick,222.0,150.0,0.0,855.0,Michael Greyeyes,23000.0,12712093.0,Biography|Drama|History|Romance,...,5.0,3,7.694502,336.0,6.4,11.013046,12,1,0,0
3,Color,Brian Robbins,76.0,98.0,48.0,722.0,Joel David Moore,21000.0,61112916.0,Comedy|Family|Fantasy,...,3.0,2,4.878907,138.0,4.5,13.698882,3,0,1,0
4,Color,Brad Peyton,178.0,94.0,62.0,722.0,Dwayne Johnson,14000.0,103812241.0,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi,...,3.0,3,9.46307,1050.0,5.8,6.47524,1,0,0,0


In [6]:
# review columns
for col in movies.columns.sort_values().values:
    print(col)

actor_1_facebook_likes
actor_1_name
actor_1_score
actor_2_facebook_likes
actor_2_name
actor_2_score
actor_3_facebook_likes
actor_3_name
actor_3_score
aspect_ratio
budget
cast_total_facebook_likes
color
content_rating
country
director_facebook_likes
director_name
director_score
duration
facenumber_in_poster
genres
genres_count
gross
gross_margin
holiday_month
imdb_num_code
imdb_score
language
movie_facebook_likes
movie_imdb_link
movie_title
num_critic_for_reviews
num_user_for_reviews
num_voted_users
plot_keywords
popularity
production_companies_count
rating_numeric
release_date
release_month
revenue
success_bins
successful
title_year
tot_noms
tot_wins
total_actor_starpower
vote_average
vote_count


In [8]:
# create table for model purposes
# Note that I haven't included any predictors that won't be known before a movie release
movies = movies[["duration","facenumber_in_poster","rating_numeric","gross_margin", 
                   "successful","director_score","production_companies_count","total_actor_starpower","release_month",
                   "holiday_month","tot_wins","tot_noms", "total_actor_starpower", "vote_average", "vote_count"]]
movies = movies.dropna()
movies.head(10)

Unnamed: 0,duration,facenumber_in_poster,rating_numeric,gross_margin,successful,director_score,production_companies_count,total_actor_starpower,total_actor_starpower.1,release_month,holiday_month,tot_wins,tot_noms,total_actor_starpower.2,total_actor_starpower.3,vote_average,vote_count
0,178.0,0.0,3,0.688365,1,3.091853,4.0,5.334665,5.334665,12,1,0,0,5.334665,5.334665,7.2,12114.0
1,106.0,0.0,4,-3.037175,0,1.766773,3.0,10.933174,10.933174,1,0,0,0,10.933174,10.933174,6.0,155.0
2,150.0,0.0,3,-1.359958,0,1.32508,5.0,11.013046,11.013046,12,1,0,0,11.013046,11.013046,6.4,336.0
3,98.0,0.0,2,0.181842,1,2.65016,3.0,13.698882,13.698882,3,0,0,1,13.698882,13.698882,4.5,138.0
4,94.0,3.0,2,0.239011,1,1.32508,3.0,6.47524,6.47524,1,0,0,0,6.47524,6.47524,5.8,1050.0
5,146.0,4.0,4,-0.048999,0,0.441693,4.0,3.409212,3.409212,5,0,1,2,3.409212,3.409212,5.5,434.0
6,90.0,0.0,3,-0.325528,0,2.65016,1.0,2.81656,2.81656,9,0,0,0,2.81656,2.81656,4.9,134.0
7,121.0,0.0,4,-0.241362,0,2.208466,1.0,6.499734,6.499734,11,1,0,0,6.499734,6.499734,5.5,488.0
8,101.0,1.0,4,-2.179885,0,0.883387,8.0,3.409212,3.409212,8,0,0,0,3.409212,3.409212,3.2,106.0
9,110.0,1.0,4,0.357153,1,0.883387,4.0,3.409212,3.409212,8,0,0,0,3.409212,3.409212,6.5,1950.0


In [10]:
movies["successful"].value_counts()

1    1950
0    1687
Name: successful, dtype: int64

In [11]:
# Assign X (data) and y (target)
# Change X if you want to test other features
# Only testing features that will be predictable before a movie is released
X = movies[["rating_numeric","director_score","production_companies_count","total_actor_starpower","release_month",
             "holiday_month","tot_wins","tot_noms", "total_actor_starpower", "vote_average", "vote_count"]]
y = movies["successful"].values.reshape(-1, 1)
print(X.shape, y.shape)

(3637, 17) (3637, 1)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,rating_numeric,director_score,production_companies_count,total_actor_starpower,total_actor_starpower.1,total_actor_starpower.2,total_actor_starpower.3,release_month,holiday_month,tot_wins,tot_noms,total_actor_starpower.4,total_actor_starpower.5,total_actor_starpower.6,total_actor_starpower.7,vote_average,vote_count
816,3,3.533546,4.0,15.162407,15.162407,15.162407,15.162407,12,1,0,5,15.162407,15.162407,15.162407,15.162407,6.7,1195.0
59,2,3.091853,2.0,20.176518,20.176518,20.176518,20.176518,10,0,0,4,20.176518,20.176518,20.176518,20.176518,7.2,1269.0
937,1,1.766773,2.0,5.156816,5.156816,5.156816,5.156816,7,0,1,1,5.156816,5.156816,5.156816,5.156816,5.8,94.0
1452,3,3.091853,5.0,3.02902,3.02902,3.02902,3.02902,7,0,0,0,3.02902,3.02902,3.02902,3.02902,5.9,997.0
1962,3,1.766773,2.0,4.300319,4.300319,4.300319,4.300319,9,0,0,0,4.300319,4.300319,4.300319,4.300319,5.1,65.0


In [13]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

  return self.partial_fit(X, y)


In [14]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

  """Entry point for launching an IPython kernel.
  


In [16]:
y_train

array([[1],
       [1],
       [0],
       ...,
       [1],
       [0],
       [1]])

# PCA

In [17]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets
from sklearn.svm import SVC

In [18]:
pca = decomposition.PCA(n_components = 2)

In [19]:
pca.fit(X_train_scaled, y_train)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [20]:
X_train_pca = pca.transform(X_train_scaled)

In [21]:
len(X_train_pca[0])

2

In [22]:
modelSVC = SVC(C=5, gamma=0.1, kernel='linear')

In [23]:
modelSVC.fit(X_train_pca, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
X_test_pca = pca.transform(X_test_scaled)

In [25]:
X_test_pca

array([[-1.33761043,  1.82644449],
       [ 2.20199287, -0.20326244],
       [ 4.80308019,  1.24317044],
       ...,
       [-1.08852407, -0.44092198],
       [-2.3567328 ,  0.20757807],
       [-0.90647416, -0.42856314]])

In [26]:
modelSVC.score(X_test_pca, y_test) #drops from 0.58 to 0.52 with Angie's smaller set of parameters

0.5208791208791209

In [27]:

modelSVC = SVC(C=5, gamma=0.1, kernel='linear')


for n in range(2,12):
    
    pca = decomposition.PCA(n_components = n)
    pca.fit(X_train_scaled, y_train)
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    modelSVC.fit(X_train_pca, y_train)
    print("n="+str(n),modelSVC.score(X_test_pca, y_test),"")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


n=2 0.5208791208791209 
n=3 0.5571428571428572 


  y = column_or_1d(y, warn=True)


n=4 0.6164835164835165 


  y = column_or_1d(y, warn=True)


n=5 0.6164835164835165 


  y = column_or_1d(y, warn=True)


n=6 0.6417582417582418 


  y = column_or_1d(y, warn=True)


n=7 0.6351648351648351 


  y = column_or_1d(y, warn=True)


n=8 0.6428571428571429 


  y = column_or_1d(y, warn=True)


n=9 0.6417582417582418 


  y = column_or_1d(y, warn=True)


n=10 0.6483516483516484 


  y = column_or_1d(y, warn=True)


n=11 0.6483516483516484 


#### test values from So Jung's PCA:
n=2 0.5840425531914893 
n=3 0.7691489361702127 
n=4 0.8521276595744681 
n=5 0.8776595744680851 
n=6 0.8840425531914894 
n=7 0.8882978723404256 
n=8 0.8829787234042553 
n=9 0.8882978723404256 

#### test values from Angie's PCA (less features and only pre-release predictive features):
n=2 0.5208791208791209 
n=3 0.5571428571428572 
n=4 0.6164835164835165 
n=5 0.6164835164835165 
n=6 0.6417582417582418 
n=7 0.6351648351648351 
n=8 0.6428571428571429 
n=9 0.6417582417582418 
n=10 0.6483516483516484 *** Highest values with PCA + SVG
n=11 0.6483516483516484 ***

In [279]:
pca.explained_variance_

array([5.91765603, 2.37985045, 2.05118039, 1.76001338, 1.16751937,
       1.01701426, 0.96839769, 0.92942712, 0.8801705 ])

In [280]:
X_train_scaled

array([[-0.22903476,  1.56881398, -0.26249603, ...,  0.79747789,
         0.62938141, -0.11293428],
       [-0.22600066,  0.29410078, -0.34136865, ..., -0.06291859,
         0.62938141, -0.21409354],
       [-0.25364472, -0.64009402, -0.43226197, ...,  0.12828063,
        -0.69884327, -0.66133247],
       ...,
       [-0.24825076, -0.57890779, -0.39746523, ...,  0.22388023,
        -0.69884327, -0.12937109],
       [-0.23004613, -0.56403614, -0.3306133 , ..., -1.2101139 ,
         0.62938141, -0.70585809],
       [-0.25566746,  0.80398606,  2.72981325, ..., -1.01891468,
         0.62938141,  0.81835047]])

In [281]:
import numpy as np

In [289]:
i =np.identity(len(X_train_scaled[0]))
i

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.,

In [294]:
pd.DataFrame(pca.transform(i))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.142795,-0.046575,0.010097,-0.082484,-0.115905,-0.304237,-0.091815,0.410482,-0.800696
1,0.178205,0.434103,0.079351,-0.187046,-0.146432,-0.103633,0.155388,0.039384,0.052366
2,0.192279,0.430406,0.070592,-0.111222,-0.068376,-0.00938,-0.028772,-0.02254,0.058857
3,0.192138,0.325134,0.037966,-0.000136,0.079852,0.106043,-0.196992,-0.046425,-0.028508
4,0.222127,0.500799,0.083295,-0.164628,-0.10791,-0.063411,0.063899,0.012833,0.057551
5,0.270319,-0.019618,-0.022676,0.202658,0.071556,0.246141,0.164669,-0.100368,-0.195938
6,0.302967,-0.049412,-0.000873,0.230809,0.044179,0.130312,0.221708,-0.094129,-0.100758
7,0.296407,-0.12833,-0.045898,0.149637,0.078334,-0.205821,-0.101243,0.066801,0.082419
8,0.352887,-0.113051,-0.05657,0.15214,-0.038262,-0.041853,-0.075026,0.06799,-0.015111
9,0.214862,-0.207981,0.095138,-0.462341,0.282605,-0.131433,-0.142754,-0.033364,0.103764


In [285]:
modelSVC = SVC(C=5, gamma=0.1, kernel='linear')