<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Film Linguistics
## Notebook 3a - Support Vector Regression
#### Stephen Strawbridge, Cohort #1019

In [1]:
#Import necessary packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import from sklearn.
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import LinearSVR, SVR

In [2]:
#Read in cleaned dataset
df = pd.read_csv('./CSVs/cleaned_df.csv');

In [3]:
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,MovieID,IDSubtitleFile,IDSubtitle,MovieName,MovieYear,SubDownloadsCnt,TotalWords,HarmVirtue,HarmVirtue-ratio,HarmVice,...,Relig-ratio,Death,Death-ratio,Informal,Informal-ratio,Swear,Swear-ratio,Filler,Filler-ratio,years_old
0,6247,141146,102946,Jan Dara,2001,4094,4352,3,0.000689,10,...,0.002757,14,0.003217,67,0.015395,34,0.007813,1,0.00023,20
1,70811,1953655263,5113335,Pocahontas,1995,81,6315,8,0.001267,39,...,0.003959,25,0.003959,68,0.010768,2,0.000317,0,0.0,26
2,38254,1952519756,3855025,Space Chimps,2008,45,7435,10,0.001345,8,...,0.002824,17,0.002286,340,0.04573,5,0.000672,62,0.008339,13
3,150,1953609111,5067084,A Bug's Life,1998,213,7932,8,0.001009,12,...,0.001009,18,0.002269,461,0.058119,8,0.001009,24,0.003026,23
4,4150,1953020828,4460230,Babe,1995,186,6176,4,0.000648,12,...,0.0034,10,0.001619,171,0.027688,5,0.00081,7,0.001133,26


## Section 1 - Create Features
---

#### Create Features for Model

In [4]:
#First create list of all ratio columns to use
ratio_cols = [col for col in df.columns if 'ratio' in col]


#Create list of all the dummified genres to include in features
genres = ['drama', 'comedy', 'romance', 'action', 'crime', 'mystery', 'western', 'animation', 'thriller',
         'biography', 'adventure', 'horror', 'fantasy', 'musical', 'war', 'sport', 'documentary', 'family',
         'short', 'music', 'history', 'adult', 'news']

#All of features to include in model
other_feats = ['years_old']

#Create features list
features = ratio_cols + genres + other_feats

#Create dataframe for features
X = df[features]

In [5]:
#Create array for target variable ('rating_rank')
y = df['rating_rank']

### Apply polynomial features
---

In [6]:
#Instantiate polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)

#Apply polynomial features transformer to X
X = poly.fit_transform(X)

In [7]:
#Check out new dataframe
X_poly_df = pd.DataFrame(X, columns=poly.get_feature_names(features))
X_poly_df.head()

Unnamed: 0,HarmVirtue-ratio,HarmVice-ratio,FairnessVirtue-ratio,FairnessVice-ratio,IngroupVirtue-ratio,IngroupVice-ratio,AuthorityVirtue-ratio,AuthorityVice-ratio,PurityVirtue-ratio,PurityVice-ratio,...,history^2,history adult,history news,history years_old,adult^2,adult news,adult years_old,news^2,news years_old,years_old^2
0,0.000689,0.002298,0.0,0.0,0.000919,0.00023,0.016774,0.0,0.000689,0.001379,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400.0
1,0.001267,0.006176,0.000158,0.0,0.00095,0.001108,0.003325,0.000475,0.000633,0.001108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,676.0
2,0.001345,0.001076,0.000403,0.0,0.000672,0.000403,0.003228,0.000672,0.000269,0.000269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,169.0
3,0.001009,0.001513,0.000126,0.0,0.00063,0.000252,0.002774,0.0,0.0,0.000252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,529.0
4,0.000648,0.001943,0.000486,0.000162,0.001457,0.000486,0.001295,0.000648,0.000162,0.000324,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,676.0


In [8]:
# Train-test-split data.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

## Section 2 - PCA
---

#### Standardize Data (as needed in PCA)

In [9]:
# Instantiate our StandardScaler.
ss = StandardScaler()

# Standardize X_train.
X_train = ss.fit_transform(X_train)

# Standardize X_test.
X_test = ss.transform(X_test)

#### Apply PCA

In [10]:
# Instantiate PCA.
pca = PCA(n_components=300, random_state=42)

# Fit PCA on the training data.
pca.fit(X_train)

# Transform PCA on the training data.
Z_train = pca.transform(X_train)

# Transform PCA on the testing data.
Z_test = pca.transform(X_test)

#For presentation purposes, create dataframe with z-scores for each column
df_pca = pd.DataFrame(Z_train)
df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,8.114775,-3.372211,0.44694,-0.19897,-12.958314,9.444238,4.167551,1.333543,-0.987725,-4.864393,...,-1.378509,-0.025673,0.912764,0.288093,0.46188,-2.071623,0.193236,1.437602,0.687258,-0.124483
1,-16.299773,3.839617,-20.319406,-14.167889,0.380537,4.084092,-1.432914,-3.358613,0.895895,-5.362388,...,0.247957,-0.056257,-0.481707,0.596248,-0.443106,-0.365659,0.047342,-0.464613,0.458605,-0.208534
2,10.913103,-21.013487,-4.283065,-4.884279,5.837939,-7.312906,-2.705243,-2.170001,-4.213721,3.722573,...,-0.036314,0.103989,0.166303,0.268613,0.278798,0.134578,-0.267698,-0.173607,-0.014045,0.100165
3,-15.064743,-0.238585,-5.446797,-3.895622,1.999216,-0.457907,-1.680499,0.728353,1.478193,-3.045113,...,0.231288,0.087348,0.493549,-0.017588,0.051604,-0.09397,0.152818,-0.018044,0.003252,0.167136
4,8.812461,13.176378,10.531943,-8.71508,-11.064069,-3.449811,0.47293,7.549452,0.692496,-3.573096,...,0.224667,0.110588,1.440846,-0.580409,0.571338,-0.054263,1.192533,-0.552069,-0.1629,1.333707


Each one of these columns (Z-features) is a combination of all of the other original features in the dataframe<br>
Each cell is an individual weight.

In [11]:
#Check out explained variance ratios of 20 most important principal components
pca.explained_variance_ratio_[:20]

array([0.05767599, 0.05480548, 0.03722313, 0.03291898, 0.03086933,
       0.0261695 , 0.02023609, 0.01983845, 0.01901599, 0.0188649 ,
       0.01834533, 0.01792859, 0.01713732, 0.01698997, 0.01677413,
       0.0165149 , 0.01587925, 0.01532409, 0.01491745, 0.01472048])

#### Unfortunately, PCA did not appear to improve our model.  As illustrated in the explained variance ratios above,  there were no clear principal components that held predictive value in the dataset.

## Section 3 - Support Vector Regression Model
---

In [15]:
#From these model parameters, we are going to make a dictionary to save our best parameters
#Note that we should NOT re-run this cell, unless we want to create new dataframe of parameters
model_params = {}
count = 0

In [13]:
# Instantiate and fit a gridsearch model for this SVR
svr = SVR()

# Create a paramaters grid
p_grid = {
    'C':np.linspace(0, 1.5, 20),
    'kernel':['rbf', 'polynomial'], 
    'degree':[0.25, 0.5, 1]
}

# Instantiate a GridSearch 
gssvr = GridSearchCV(estimator=svr, param_grid=p_grid, cv=5, n_jobs=4)

# Fit on training data.
gssvr.fit(Z_train, y_train)

GridSearchCV(cv=5, estimator=SVR(), n_jobs=4,
             param_grid={'C': array([0.        , 0.07894737, 0.15789474, 0.23684211, 0.31578947,
       0.39473684, 0.47368421, 0.55263158, 0.63157895, 0.71052632,
       0.78947368, 0.86842105, 0.94736842, 1.02631579, 1.10526316,
       1.18421053, 1.26315789, 1.34210526, 1.42105263, 1.5       ]),
                         'degree': [0.25, 0.5, 1],
                         'kernel': ['rbf', 'polynomial']})

In [16]:
#Create dataframe of best results (best parameters and the respective score)
count += 1
gssvr.best_params_['score'] = gssvr.best_score_ 
model_params[f'model_{count}'] = gssvr.best_params_

model_df = pd.DataFrame.from_dict(model_params, orient='index')
model_df

Unnamed: 0,C,degree,kernel,score
model_1,0.789474,0.25,rbf,0.314678


In [17]:
#Save best param/score results
#model_df.to_csv('./Best_Params/SVR/SVR_Jan21_params.csv')

In [18]:
#Look at R-squared scores
train_score = round(gssvr.score(Z_train, y_train), 5)
test_score = round(gssvr.score(Z_test, y_test), 5)

print(f'Training R-squared: {train_score}')
print(f'Testing R-squared: {test_score}')

Training R-squared: 0.49676
Testing R-squared: 0.2956


In [19]:
#Baseline RMSE (source code leveraged from https://numpy.org/doc/stable/reference/generated/numpy.zeros_like.html)
null_pred = np.zeros_like(y_test)
null_pred = null_pred + y_test.mean()
base_RMSE = round(metrics.mean_squared_error(y_test, null_pred, squared=False), 5)

print(f'Baseline RMSE: {base_RMSE}')

Baseline RMSE: 1.1321


In [20]:
#Create predictions and residuals variables
preds = gssvr.predict(Z_test)
resids = y_test - preds

test_RMSE = round(metrics.mean_squared_error(y_test, preds, squared=False), 5)
print(f'Testing RMSE: {test_RMSE}')

Testing RMSE: 0.95015


In [21]:
#How much better is our testing RMSE than baseline RMSE
under_base = round(base_RMSE/test_RMSE, 4)
print(f'Model is {under_base} times better than baseline model')

Model is 1.1915 times better than baseline model


In [None]:
#Plot residuals

plt.figure(figsize=(16,9))
sns.scatterplot(x=y_test, y=resids)
plt.title('Residuals of Predictions', fontsize=20)
plt.ylabel('Error');
plt.axhline(0, color='r');

#### Interesting Note:  The residuals seem to follow a *linear* pattern.  Throughout all the data, the model seems to generally seems to underestimate very poorly ranked movies and very highly ranked movies (e.g. model gives predictions that are more closely centered around the mean).

In [None]:
plt.figure(figsize=(8,8))
plt.title('Actuals vs Predictions', fontsize=20);

sns.regplot(x=y_test, y=preds, data=Z_test, marker='.');

plt.xlim(1.5, 9.5)
plt.ylim(1.5, 9.5)
plt.xlabel('Actuals', fontsize=12);
plt.ylabel('Predictions', fontsize=12);

#### As seen in graph above, our predictions fall in the range of about 4 to 8.5 rating rank.  Although this is a small range as compared to the actual range, it does make sense, as the distribution of actual rating ranks is very steep (e.g. only a very small percentage of movies fall below a 4 or above an 8.5 on the rating rank).