## *Part 3: Modeling on the MovieLens dataset*
***

In [101]:
#system specific
import os
import sys
assert sys.version_info >= (3,5)
#import the libraries
import pandas as pd
import numpy as np
import random
#visualizations 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#consistent size plots
from pylab import rcParams
# apply all plot-size defaults in one update call
rcParams.update({
    'figure.figsize': (12,5),
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.labelsize': 12,
})
#handle unwanted warnings
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)

In [102]:
#load the data from movies.csv in the working directory
#the file is plain comma-separated, so the default (fast) C parser is used instead
#of engine='python', which is considerably slower on a file of this size
movies = pd.read_csv('movies.csv')

In [103]:
#preview the first five rows of the loaded frame
movies.head(n=5)

Unnamed: 0,MovieID,UserID,Rating,Age,Occupation,Count,Genre_enc_1,Genre_enc_2,Gender_enc,zipcode_sum,weekday_enc
0,1,1,5,1,10,1,2,3,0,12,2
1,48,1,5,1,10,1,2,3,0,12,2
2,150,1,5,1,10,1,7,7,0,12,3
3,260,1,4,1,10,1,0,1,0,12,3
4,527,1,5,1,10,1,7,16,0,12,2


In [104]:
#work on an independent copy of the whole frame so that `movies` stays unmodified
sample = movies.copy(deep=True)

In [105]:
#column names, non-null counts, dtypes and memory usage of the working copy
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923889 entries, 0 to 923888
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   MovieID      923889 non-null  int64
 1   UserID       923889 non-null  int64
 2   Rating       923889 non-null  int64
 3   Age          923889 non-null  int64
 4   Occupation   923889 non-null  int64
 5   Count        923889 non-null  int64
 6   Genre_enc_1  923889 non-null  int64
 7   Genre_enc_2  923889 non-null  int64
 8   Gender_enc   923889 non-null  int64
 9   zipcode_sum  923889 non-null  int64
 10  weekday_enc  923889 non-null  int64
dtypes: int64(11)
memory usage: 77.5 MB


In [106]:
#number of rows in the working copy
sample.shape[0]

923889

In [107]:
#preview the first five rows of the working copy
sample.head(n=5)

Unnamed: 0,MovieID,UserID,Rating,Age,Occupation,Count,Genre_enc_1,Genre_enc_2,Gender_enc,zipcode_sum,weekday_enc
0,1,1,5,1,10,1,2,3,0,12,2
1,48,1,5,1,10,1,2,3,0,12,2
2,150,1,5,1,10,1,7,7,0,12,3
3,260,1,4,1,10,1,0,1,0,12,3
4,527,1,5,1,10,1,7,16,0,12,2


In [108]:
#drop the Count column before building the model inputs
#- a new frame is assigned instead of mutating in place, and a missing column is
#  tolerated, so re-running this cell does not raise KeyError
#- no blanket warnings filter here: an empty message pattern matches every warning
#  and would silence all of them for the rest of the notebook
sample = sample.drop(columns='Count', errors='ignore')

In [109]:
#store the target as floating point (float64) values
sample['Rating'] = sample['Rating'].astype(np.float64)

In [110]:
#separate the predictors (every column except MovieID and Rating) from the target
# NOTE(review): UserID stays in X as a plain numeric feature - confirm this is intended
X = sample.drop(columns=['MovieID','Rating']).to_numpy()
y = sample['Rating'].to_numpy()

In [111]:
#split into the train and test dataset
from sklearn.model_selection import train_test_split

In [112]:
#hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [113]:
#(rows, features) of the training and test inputs
(X_train.shape, X_test.shape)

((692916, 8), (230973, 8))

In [114]:
#import linear model
from sklearn.linear_model import LinearRegression

In [115]:
#ordinary least squares baseline with an intercept term
# NOTE(review): the scikit-learn docs say n_jobs only helps for multi-target or
# otherwise large problems - probably no effect for this single-target fit; confirm
lin_reg = LinearRegression(fit_intercept=True, n_jobs=-1)

In [116]:
#train the baseline on the training portion of the hold-out split
lin_reg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [117]:
#fitted intercept of the linear model
lin_reg.intercept_

3.441869822778767

In [118]:
#fitted coefficients, one per column of X in order (the columns of `sample`
#without MovieID and Rating)
lin_reg.coef_

array([ 4.49369423e-06,  4.65236626e-03,  1.09577220e-03,  1.49006286e-02,
        1.51196905e-03, -3.87363952e-02, -8.93586800e-04, -4.96554773e-03])

In [119]:
#correlation of every column with the Rating target
sample.corr().loc[:, 'Rating']

MovieID       -0.064032
UserID         0.010418
Rating         1.000000
Age            0.053454
Occupation     0.007550
Genre_enc_1    0.057247
Genre_enc_2    0.033631
Gender_enc    -0.017553
zipcode_sum   -0.003664
weekday_enc   -0.010453
Name: Rating, dtype: float64

In [120]:
#predict on the test inputs and round each prediction to the nearest whole rating
predict_rating = lin_reg.predict(X_test).round(0)

In [121]:
#evaluate the model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [122]:
#mean squared error of the rounded predictions on the hold-out test set
error = mean_squared_error(y_true=y_test, y_pred=predict_rating)

In [123]:
#mean squared error on the test set (its square root, the RMSE, is printed in the next cell)
error

1.3502444008607066

In [124]:
#report the root of the hold-out mean squared error
baseline_rmse = np.sqrt(error)
print('Baseline Linear Model RMSE: %.3f' % baseline_rmse)

Baseline Linear Model RMSE: 1.162


<b> This was a plain linear regression applied to the dataset. A single random train/test split could be biased, as there could be many more 4 and 5 ratings than lower ratings such as 1, 2 and 3. A better approach is to use stratified k-fold sampling and average the score across the folds using cross-validation. This way we get a better estimate of how well the model generalizes. </b>

## *Stratified Kfold split and cross validation*

In [125]:
#import the library
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [129]:
#Stratified k-fold split --> model fit --> predictions --> mean squared error per fold
from sklearn.base import clone

#fixed seed so the shuffled folds, and therefore the reported RMSE, are reproducible
skf = StratifiedKFold(n_splits=20,shuffle=True,random_state=42)
mse_score = []
for train_index, test_index in skf.split(X, y):
    #fold-local names so the hold-out split (X_train, X_test, y_train, y_test)
    #is not overwritten by the last fold
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    #fit an unfitted copy of the estimator so lin_reg keeps the coefficients
    #it learned on the hold-out training set
    fold_model = clone(lin_reg)
    fold_model.fit(X_fold_train,y_fold_train)
    #predict and round to the nearest whole rating
    skf_rating_predict = np.round(fold_model.predict(X_fold_test))
    #show the first ten expected and predicted ratings of this fold
    print('Expected')
    print(y_fold_test[:10])
    print('Predicted')
    print(skf_rating_predict[:10])
    print('\n')
    #calculate the mean squared error of this fold
    skf_score = mean_squared_error(y_fold_test,skf_rating_predict)

    #append the mean squared error per split
    mse_score.append(skf_score)

Expected
[3. 3. 5. 4. 3. 3. 4. 5. 4. 5.]
Predicted
[3. 3. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[3. 4. 4. 3. 5. 4. 4. 4. 5. 5.]
Predicted
[3. 4. 3. 3. 3. 3. 4. 4. 4. 4.]


Expected
[5. 5. 5. 5. 3. 4. 5. 3. 5. 3.]
Predicted
[3. 4. 4. 3. 4. 4. 4. 4. 4. 4.]


Expected
[4. 4. 4. 4. 4. 3. 3. 5. 5. 4.]
Predicted
[4. 3. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[4. 4. 4. 4. 5. 5. 4. 4. 4. 5.]
Predicted
[3. 3. 3. 4. 4. 4. 4. 4. 4. 4.]


Expected
[5. 4. 3. 3. 5. 4. 5. 5. 5. 4.]
Predicted
[4. 3. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[5. 5. 3. 4. 4. 5. 5. 5. 5. 4.]
Predicted
[4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[3. 3. 4. 5. 5. 4. 5. 3. 5. 5.]
Predicted
[3. 3. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[1. 3. 3. 5. 3. 4. 5. 5. 5. 3.]
Predicted
[4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[5. 4. 4. 4. 4. 4. 5. 3. 3. 5.]
Predicted
[3. 4. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[5. 4. 4. 4. 4. 3. 5. 3. 5. 3.]
Predicted
[3. 3. 4. 4. 4. 4. 4. 4. 4. 4.]


Expected
[4. 5. 5. 3. 4. 5. 3. 4. 4. 3.]
Predicted
[3. 4. 4. 4. 4

In [130]:
#average the per-fold mean squared errors, then take the square root
stratified_rmse = np.sqrt(np.mean(mse_score))
print('Stratified K Fold RMSE: %.3f' % stratified_rmse)

Stratified K Fold RMSE: 1.163


## *Summary and further improvement*
Other sampling techniques, such as RepeatedStratifiedKFold, can be applied. 
The linear model does not seem to give good results. The error (MAE) is comparatively better if the model is applied to a single movie. This can be verified by using a sample of the movies dataframe restricted to a particular MovieID. 
The correlations between the label and the predictors are also very weak. The weak predictors can be dropped from modeling. 

### *Improvements*
- Use different regressors like SVM
- Use Repeated Stratified k-fold sampling technique
- Better feature engineering
- Rating can be treated as a categorical label instead of a numeric value, so that a classifier such as Logistic Regression can be applied. 
