In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline,FeatureUnion
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score, cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

# Load the cleaned ratings table from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='ratings_clean.csv')

In [4]:
# Remove the record that sits at position 12958 of the freshly loaded CSV.
# The frame comes straight from pd.read_csv (default RangeIndex), so label
# 12958 is the same row as position 12958 on the first run. Dropping by label
# with errors='ignore' keeps the cell idempotent: the previous positional form
# (data.index[12958]) removed a *different* row every time the cell was re-run.
# NOTE(review): presumably this row carries a malformed `year` value -- confirm.
data = data.drop(index=12958, errors='ignore')

# Cast `year` to a numeric dtype; pd.to_numeric raises if a value cannot be parsed.
data['year'] = pd.to_numeric(data['year'])

# Preview the cleaned frame. Kept as the last expression so the notebook renders it
# (a bare expression in the middle of a cell produces no output).
data.head()

In [18]:
# Pairwise correlation matrix of the numeric (and boolean) columns.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns silently: DataFrame.corr() raises when the frame still
# holds string columns. Older pandas versions gave this same numeric-only
# result implicitly, so the displayed table is unchanged there.
data.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,year,metascore,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,...,Adventure,Family,Fantasy,Mystery,Thriller,Biography,Animation,Action,Film-Noir,Sport
year,1.0,-0.184218,-0.267356,-0.029548,-0.254643,-0.20105,-0.056979,-0.053484,-0.041972,-0.004929,...,-0.039308,-0.05971,-0.020452,0.016827,0.034234,0.047307,0.018872,0.014834,-0.144721,-0.023802
metascore,-0.184218,1.0,0.692238,0.194167,0.646379,0.61677,0.174051,0.217833,0.254563,0.199447,...,-0.030274,-0.028321,-0.051549,-0.029548,-0.077741,0.084543,0.066184,-0.156396,0.065426,-0.02578
weighted_average_vote,-0.267356,0.692238,1.0,0.350695,0.973778,0.922052,0.289876,0.342934,0.406406,0.368057,...,0.014653,-0.03127,-0.044601,-0.041977,-0.094404,0.137943,0.079066,-0.069044,0.056338,0.016058
total_votes,-0.029548,0.194167,0.350695,1.0,0.331762,0.297199,0.861012,0.91733,0.955956,0.894911,...,0.225531,0.010939,0.073252,0.024909,0.029374,0.011298,0.056896,0.201573,9e-05,-0.005947
mean_vote,-0.254643,0.646379,0.973778,0.331762,1.0,0.944884,0.279591,0.330119,0.39177,0.349576,...,0.012688,-0.016133,-0.043167,-0.058264,-0.11161,0.138438,0.081835,-0.079197,0.052334,0.021208
median_vote,-0.20105,0.61677,0.922052,0.297199,0.944884,1.0,0.259404,0.301037,0.348909,0.305669,...,-0.001761,-0.036339,-0.03964,-0.05357,-0.101915,0.121779,0.063656,-0.08195,0.044389,0.00826
votes_10,-0.056979,0.174051,0.289876,0.861012,0.279591,0.259404,1.0,0.936734,0.77563,0.57534,...,0.128455,0.008483,0.037868,0.004057,-0.006798,0.008735,0.03729,0.091831,0.003339,-0.005007
votes_9,-0.053484,0.217833,0.342934,0.91733,0.330119,0.301037,0.936734,1.0,0.905246,0.672633,...,0.126012,-0.00411,0.027349,0.018818,0.006817,0.02548,0.04055,0.088372,0.005532,-0.00511
votes_8,-0.041972,0.254563,0.406406,0.955956,0.39177,0.348909,0.77563,0.905246,1.0,0.881288,...,0.186772,0.001466,0.046616,0.026416,0.026324,0.036971,0.06249,0.149334,0.005089,-0.002308
votes_7,-0.004929,0.199447,0.368057,0.894911,0.349576,0.305669,0.57534,0.672633,0.881288,1.0,...,0.25873,0.012272,0.078052,0.026959,0.050791,0.017935,0.071352,0.249748,-0.003556,-0.0022


In [5]:
# Feature matrix: every column except the two identifier columns and the target.
X = data.drop(columns=['imdb_title_id', 'title', 'metascore'])
# Target: `metascore`, kept as a one-column DataFrame (double brackets).
y = data.loc[:, ['metascore']]

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical between runs. train_test_split is already imported in the
# notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [7]:
# Split the column names of X by dtype so each group can get its own preprocessing.
# Everything that is not object-typed is treated as numeric ...
numeric_features = X.select_dtypes(exclude=['object']).columns
# ... and object-typed columns are treated as categorical (may be empty if X
# has no such columns).
categorical_features = X.select_dtypes(include=['object']).columns

In [8]:
# Preprocessing for numeric features: fill missing values with the column mean,
# then standardise each column.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [9]:
# Preprocessing for categorical features (only relevant when X contains any):
# replace missing entries with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [10]:
# Route each column group through its matching transformer and concatenate the
# results: numeric columns -> numeric_transformer, object columns ->
# categorical_transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [40]:
# Two end-to-end models that share the same preprocessing step.
# Both pipelines are handed the same `preprocessor` object defined above.

# Lasso regression (alpha = 0.5) on the preprocessed features.
laspipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', Lasso(alpha=.5)),
    ]
)

# k-nearest-neighbours regression (k = 5) on the preprocessed features.
knnpipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('knn', KNeighborsRegressor(n_neighbors=5)),
    ]
)


In [42]:
# Train both pipelines on the training split (Lasso first, then KNN).
for pipe in (laspipe, knnpipe):
    pipe.fit(X_train, y_train)

# Display the last fitted pipeline, as the original cell did via fit()'s return value.
knnpipe

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['year', 'weighted_average_vote', 'total_votes', 'mean_vote',
       'median_vote', 'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6',
       'votes_5', 'votes_4', 'votes_3', 'votes_2', 'votes_1',
       'allgenders_18age_avg_vote...
       'Sci-Fi', 'Music', 'Musical', 'Comedy', 'Romance', 'Crime', 'War',
       'Western', 'Horror', 'Adventure', 'Family', 'Fantasy', 'Mystery',
       'Thriller', 'Biography', 'Animation', 'Action', 'Film-Noir', 'Sport'],
      dtype='object')),
                                      

In [43]:
# Baseline regressors: ordinary least squares and a decision tree whose leaves
# must contain at least 3 training samples.
lm_re = LinearRegression()
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at every split and may break ties between equally good
# splits differently from run to run otherwise.
dt_re = DecisionTreeRegressor(min_samples_leaf=3, random_state=1)

# Fit both baselines on the training split.
# NOTE(review): these are fit on the raw X_train, not through `preprocessor`,
# so no imputation/scaling/encoding is applied here -- confirm that is intended.
lm_re.fit(X_train, y_train)
dt_re.fit(X_train, y_train)

DecisionTreeRegressor(min_samples_leaf=3)

In [14]:
# Hold-out RMSE of the Lasso pipeline on the test split.
# The cell previously referenced `clf`, which is not defined anywhere in the
# visible notebook and fails with NameError on a fresh kernel; `laspipe` is the
# fitted Lasso pipeline defined above.
# NOTE(review): the recorded output came from the old `clf` object -- re-run to
# refresh it and confirm laspipe is the intended model.
yhat = laspipe.predict(X_test)
np.sqrt(mean_squared_error(y_test, yhat))

12.177259012410774

In [15]:
# Cross-validated Mean Absolute Error of the Lasso pipeline on the training split.
# `laspipe` replaces the undefined name `clf` (NameError on a fresh kernel).
# scikit-learn reports this metric negated ("neg_..."), so the sign is flipped
# back after averaging over the folds.
MAE = cross_val_score(laspipe, X_train, y_train, scoring='neg_mean_absolute_error')
-np.mean(MAE)

9.954055146050106

In [16]:
# Cross-validated Mean Squared Error of the Lasso pipeline on the training split.
# `laspipe` replaces the undefined name `clf` (NameError on a fresh kernel).
# scikit-learn reports this metric negated ("neg_..."), so the sign is flipped
# back after averaging over the folds.
MSE = cross_val_score(laspipe, X_train, y_train, scoring='neg_mean_squared_error')
-np.mean(MSE)

154.01177689430605

In [17]:
# Cross-validated Root Mean Squared Error of the Lasso pipeline on the training split.
# `laspipe` replaces the undefined name `clf` (NameError on a fresh kernel).
# scikit-learn reports this metric negated ("neg_..."), so the sign is flipped
# back after averaging the per-fold RMSE values.
RMSE = cross_val_score(laspipe, X_train, y_train, scoring='neg_root_mean_squared_error')
-np.mean(RMSE)

12.409150667849222