# Netflix Machine Learning - Modeling

In [1]:
# import libraries
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# import encoded dataset
data = pd.read_csv(r"C:\Users\Shinemet\Ironhack\Projects\Project7\Work_files\data_clean_encoded.csv")

In [3]:
# check data
data.head()

Unnamed: 0,year,kind,rating,vote,runtime,genre1,country1,language1,cast1,cast2,director1,writer1
0,2003.0,2,7.7,474.0,50.0,6,62,16,867,4968,4121,3319
1,1994.0,7,8.1,18.0,83.0,8,62,16,3627,438,3672,5036
2,1992.0,1,5.5,93.0,95.0,0,37,33,2513,1667,4077,2610
3,2004.0,1,5.3,13432.0,60.0,6,62,16,3047,1545,4022,4892
4,1991.0,1,4.5,2177.0,96.0,4,62,16,639,3350,1124,2893


### Data prep for modeling

In [4]:
# convert target variable to process models
data.rating = data.rating.astype(int)
data.head()

Unnamed: 0,year,kind,rating,vote,runtime,genre1,country1,language1,cast1,cast2,director1,writer1
0,2003.0,2,7,474.0,50.0,6,62,16,867,4968,4121,3319
1,1994.0,7,8,18.0,83.0,8,62,16,3627,438,3672,5036
2,1992.0,1,5,93.0,95.0,0,37,33,2513,1667,4077,2610
3,2004.0,1,5,13432.0,60.0,6,62,16,3047,1545,4022,4892
4,1991.0,1,4,2177.0,96.0,4,62,16,639,3350,1124,2893


In [5]:
# split dataset into independent and dependent variables
X = data.drop('rating', axis=1)
y = data['rating']

In [6]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# import models
from sklearn import ensemble, datasets, tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Feature Selection

In [8]:
# Find the optimal number of features with SFM
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

SFM = SelectFromModel(estimator=RandomForestClassifier())
s = SFM.fit(X, y)

n_features = s.transform(X).shape[1]
n_features

7

In [9]:
# Get feature names 
feature_idx = s.get_support()
feature_name = X.columns[feature_idx]
feature_name

Index(['year', 'vote', 'runtime', 'cast1', 'cast2', 'director1', 'writer1'], dtype='object')

In [10]:
# split dataset with features
X = data[feature_name]
y = data['rating']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Data scaling

### DecisionTree (for reference)

In [11]:
# fitting DecisionTree for reference
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

print('Test accuracy: %.3f' %metrics.accuracy_score(y_test, y_pred))

Test accuracy: 0.336


### BaggingClassifier

#### with default parameters

In [12]:
# fitting BaggingClassifier to the dataset
from sklearn.ensemble import BaggingClassifier

bc = BaggingClassifier()
bc.fit(X_train, y_train)

# model is overfitting training dataset
print('Training Accuracy : %.3f' %bc.score(X_train, y_train))
print('Test Accuracy : %.3f' %bc.score(X_test, y_test))

Training Accuracy : 0.985
Test Accuracy : 0.368


#### with hyperparameter

In [13]:
# GridsearchCV with DecisionTree (default base estimator)
params = {'base_estimator': [DecisionTreeClassifier()],
         'n_estimators': np.arange(100,400,10)
         }

bc_grid = GridSearchCV(BaggingClassifier(random_state=42), 
                       param_grid=params, 
                       scoring='accuracy', 
                       cv=3, n_jobs=-1, verbose=1)
bc_grid.fit(X_train, y_train)

print(bc_grid.best_params_)
print(bc_grid.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
{'base_estimator': DecisionTreeClassifier(), 'n_estimators': 390}
0.4011971510298374


In [14]:
# Accuracy score on test data is still low
bc = BaggingClassifier(n_estimators=390)
bc.fit(X_train, y_train)

print('Training Accuracy : %.3f' %bc.score(X_train, y_train))
print('Test Accuracy : %.3f' %bc.score(X_test, y_test))

Training Accuracy : 1.000
Test Accuracy : 0.415


#### with hyperparameter based on a different estimator

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

params = {'base_estimator': [StandardScaler(), LogisticRegression(random_state=42)],
          'n_estimators': [390]}

bagging_classifier_grid = GridSearchCV(BaggingClassifier(random_state=1, n_jobs=-1), 
                                       param_grid=params,
                                       cv=3, n_jobs=-1, verbose=1)
bagging_classifier_grid.fit(X_train, y_train)

print('Train Accuracy : %.3f' %bagging_classifier_grid.best_estimator_.score(X_train, y_train))
print('Test Accuracy : %.3f' %bagging_classifier_grid.best_estimator_.score(X_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' %bagging_classifier_grid.best_score_)
print('Best Parameters : ', bagging_classifier_grid.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits




Train Accuracy : 0.341
Test Accuracy : 0.346
Best Accuracy Through Grid Search : 0.342
Best Parameters :  {'base_estimator': LogisticRegression(random_state=42), 'n_estimators': 390}


In [29]:
# Accuracy score on test data with hyperparameters
bc = BaggingClassifier({'bootstrap': False, 'bootstrap_features': True, 'max_samples': 0.5, 'n_estimators': 100})
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)

print('Training Accuracy : %.3f' %bc.score(X_train, y_train))
print('Test Accuracy : %.3f' %bc.score(X_test, y_test))

AttributeError: 'dict' object has no attribute 'fit'

### Comparison metrics

*The model is not fitting the dataset as is, we need to choose a different model.*