In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_palette('husl')

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import sqlite3

import warnings 
warnings.filterwarnings('ignore')

# Data: movie_data from sqllite (Angie's experimentation with So Jung's code)

In [3]:
# create connection to database

con = sqlite3.connect("db/movies.db")
sql = f"""
   SELECT * FROM movie_data
   """

# bring in db to pandas dataframe
movies = pd.read_sql(sql, con)
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,rel_mon_04,rel_mon_05,rel_mon_06,rel_mon_07,rel_mon_08,rel_mon_09,rel_mon_10,rel_mon_11,rel_mon_12,release_month
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,0,0,0,0,0,0,0,0,1,12
1,Color,Stephen Sommers,106.0,106.0,208.0,855.0,Jason Flemyng,3000.0,11146409.0,Action|Adventure|Horror|Sci-Fi,...,0,0,0,0,0,0,0,0,0,1
2,Color,Terrence Malick,222.0,150.0,0.0,855.0,Michael Greyeyes,23000.0,12712093.0,Biography|Drama|History|Romance,...,0,0,0,0,0,0,0,0,1,12
3,Color,Brian Robbins,76.0,98.0,48.0,722.0,Joel David Moore,21000.0,61112916.0,Comedy|Family|Fantasy,...,0,0,0,0,0,0,0,0,0,3
4,Color,Brad Peyton,178.0,94.0,62.0,722.0,Dwayne Johnson,14000.0,103812241.0,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# review columns
for col in movies.columns.sort_values().values:
    print(col)

actor_1_facebook_likes
actor_1_name
actor_1_score
actor_2_facebook_likes
actor_2_name
actor_2_score
actor_3_facebook_likes
actor_3_name
actor_3_score
aspect_ratio
budget
cast_total_facebook_likes
color
content_rating
country
director_facebook_likes
director_name
director_score
duration
facenumber_in_poster
genres
genres_count
gross
gross_margin
holiday_month
imdb_num_code
imdb_score
language
movie_facebook_likes
movie_imdb_link
movie_title
num_critic_for_reviews
num_user_for_reviews
num_voted_users
plot_keywords
popularity
production_companies_count
rating_numeric
rel_mon_01
rel_mon_02
rel_mon_03
rel_mon_04
rel_mon_05
rel_mon_06
rel_mon_07
rel_mon_08
rel_mon_09
rel_mon_10
rel_mon_11
rel_mon_12
release_date
release_month
revenue
success_bins
successful
title_year
tot_noms
tot_wins
total_actor_starpower
vote_average
vote_count


In [5]:
# create table for model purposes
# Note that I haven't included any predictors that won't be known before a movie release
movies = movies[["duration","facenumber_in_poster","rating_numeric","gross_margin", 
                   "successful","director_score","production_companies_count","total_actor_starpower","release_month",
                   "holiday_month","tot_wins","tot_noms", "total_actor_starpower", "vote_average", "vote_count"]]
movies = movies.dropna()
movies.head(10)

Unnamed: 0,duration,facenumber_in_poster,rating_numeric,gross_margin,successful,director_score,production_companies_count,total_actor_starpower,release_month,holiday_month,tot_wins,tot_noms,total_actor_starpower.1,vote_average,vote_count
0,178.0,0.0,3,0.688365,1,3.091853,4.0,5.334665,12,1,0,0,5.334665,7.2,12114.0
1,106.0,0.0,4,-3.037175,0,1.766773,3.0,10.933174,1,0,0,0,10.933174,6.0,155.0
2,150.0,0.0,3,-1.359958,0,1.32508,5.0,11.013046,12,1,0,0,11.013046,6.4,336.0
3,98.0,0.0,2,0.181842,1,2.65016,3.0,13.698882,3,0,0,1,13.698882,4.5,138.0
4,94.0,3.0,2,0.239011,1,1.32508,3.0,6.47524,1,0,0,0,6.47524,5.8,1050.0
5,146.0,4.0,4,-0.048999,0,0.441693,4.0,3.409212,5,0,1,2,3.409212,5.5,434.0
6,90.0,0.0,3,-0.325528,0,2.65016,1.0,2.81656,9,0,0,0,2.81656,4.9,134.0
7,121.0,0.0,4,-0.241362,0,2.208466,1.0,6.499734,11,1,0,0,6.499734,5.5,488.0
8,101.0,1.0,4,-2.179885,0,0.883387,8.0,3.409212,8,0,0,0,3.409212,3.2,106.0
9,110.0,1.0,4,0.357153,1,0.883387,4.0,3.409212,8,0,0,0,3.409212,6.5,1950.0


In [6]:
movies["successful"].value_counts()

1    1950
0    1687
Name: successful, dtype: int64

In [7]:
# Assign X (data) and y (target)
# Change X if you want to test other features
# Only testing features that will be predictable before a movie is released
X = movies[['duration','rating_numeric', 'director_score','production_companies_count','release_month',
            "holiday_month","tot_wins","tot_noms", "total_actor_starpower"]]


y = movies["successful"].values.reshape(-1, 1)
print(X.shape, y.shape)

(3637, 10) (3637, 1)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,duration,rating_numeric,director_score,production_companies_count,release_month,holiday_month,tot_wins,tot_noms,total_actor_starpower,total_actor_starpower.1
816,121.0,3,3.533546,4.0,12,1,0,5,15.162407,15.162407
59,101.0,2,3.091853,2.0,10,0,0,4,20.176518,20.176518
937,87.0,1,1.766773,2.0,7,0,1,1,5.156816,5.156816
1452,94.0,3,3.091853,5.0,7,0,0,0,3.02902,3.02902
1962,98.0,3,1.766773,2.0,9,0,0,0,4.300319,4.300319


In [9]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [10]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [10]:
y_train

array([[1],
       [1],
       [0],
       ...,
       [1],
       [0],
       [1]])

# PCA

In [11]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets
from sklearn.svm import SVC

In [12]:
pca = decomposition.PCA(n_components = 2)

In [13]:
pca.fit(X_train_scaled, y_train)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [14]:
X_train_pca = pca.transform(X_train_scaled)

In [15]:
len(X_train_pca[0])

2

In [16]:
modelSVC = SVC(C=5, gamma=0.1, kernel='linear')

In [17]:
modelSVC.fit(X_train_pca, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
X_test_pca = pca.transform(X_test_scaled)

In [19]:
X_test_pca

array([[-0.19784944,  0.92440353],
       [-0.01984668, -0.64270658],
       [ 4.47649433, -0.87707538],
       ...,
       [-0.79858821,  0.21766371],
       [-1.47281903,  0.04773381],
       [-0.66395651,  0.15313774]])

In [20]:
modelSVC.score(X_test_pca, y_test) #drops from 0.58 to 0.52 with Angie's smaller set of parameters

0.5208791208791209

In [22]:

modelSVC = SVC(C=5, gamma=0.1, kernel='linear')


for n in range(2,10):
    
    pca = decomposition.PCA(n_components = n)
    pca.fit(X_train_scaled, y_train)
    X_train_pca = pca.transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    modelSVC.fit(X_train_pca, y_train)
    print("n="+str(n),modelSVC.score(X_test_pca, y_test),"")
    

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


n=2 0.5208791208791209 
n=3 0.5208791208791209 


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


n=4 0.5208791208791209 
n=5 0.5208791208791209 


  y = column_or_1d(y, warn=True)


n=6 0.5362637362637362 


  y = column_or_1d(y, warn=True)


n=7 0.5362637362637362 


  y = column_or_1d(y, warn=True)


n=8 0.5384615384615384 


  y = column_or_1d(y, warn=True)


n=9 0.5373626373626373 


In [23]:
pca.explained_variance_

array([2.98048541, 1.62725739, 1.2566536 , 0.99418027, 0.96501193,
       0.91895043, 0.72789793, 0.36175394, 0.17147748])

In [24]:
X_train_scaled

array([[ 0.44244261, -0.23482105,  0.8340484 , ...,  1.82917454,
         1.86022398,  1.86022398],
       [-0.40232616, -1.33320357,  0.59498492, ...,  1.37574677,
         2.92452764,  2.92452764],
       [-0.9936643 , -2.43158609, -0.12220554, ...,  0.01546343,
        -0.26357977, -0.26357977],
       ...,
       [ 2.17421859,  0.86356147, -0.36126902, ..., -0.43796434,
         0.82847163,  0.82847163],
       [-1.07814118, -3.5299686 , -0.83939599, ..., -0.43796434,
        -1.00762596, -1.00762596],
       [-0.95142586,  0.86356147, -0.36126902, ..., -0.43796434,
        -0.88397618, -0.88397618]])

In [25]:
import numpy as np

In [26]:
i =np.identity(len(X_train_scaled[0]))
i

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [27]:
pd.DataFrame(pca.transform(i))

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.211142,0.213117,-0.329873,0.407297,0.057023,0.392347,0.690138,0.040743,-0.012407
1,0.044547,-0.087048,-0.364461,0.484531,-0.52947,-0.577008,-0.071144,-0.067057,0.001393
2,0.218833,0.086371,-0.428549,0.123766,-0.081203,0.538778,-0.669299,-0.027863,0.00818
3,0.074298,-0.078283,-0.324503,0.194885,0.83431,-0.359723,-0.131973,-0.030106,-0.035684
4,0.166911,0.647054,0.062036,-0.041383,-0.008017,-0.21846,-0.113439,0.697833,-0.016528
5,0.166448,0.6532,0.071261,-0.117126,0.008251,-0.151493,-0.021707,-0.709042,0.024449
6,0.420297,-0.112862,0.453656,0.320705,0.019622,0.028811,-0.0944,-0.039873,-0.700172
7,0.430679,-0.124508,0.422894,0.320964,0.06156,0.009122,-0.078166,0.001599,0.712159
8,0.491085,-0.172954,-0.194713,-0.402951,-0.068691,-0.104536,0.111599,0.020701,-0.010511
9,0.491085,-0.172954,-0.194713,-0.402951,-0.068691,-0.104536,0.111599,0.020701,-0.010511


In [29]:
# These column names are equivalent to rows 0-9 in pca.transform above
movies.columns

Index(['duration', 'facenumber_in_poster', 'rating_numeric', 'gross_margin',
       'successful', 'director_score', 'production_companies_count',
       'total_actor_starpower', 'release_month', 'holiday_month', 'tot_wins',
       'tot_noms', 'total_actor_starpower', 'vote_average', 'vote_count'],
      dtype='object')