# Rating Prediction using Linear Regression

In [82]:
import math
import numpy as np
from numpy import linalg as LA
import pandas as pd
from IPython.display import Image, display

In [2]:
np.set_printoptions(precision=2)
pd.set_option('display.precision', 2)

## Move Feature Matrix

In [3]:
movies = pd.read_csv('movies_w_imgurl.csv')

In [4]:
movieGenres = pd.DataFrame(data=movies['genres'].str.split('|').apply(pd.Series, 1).stack(), columns=['genre'])
movieGenres.index = movieGenres.index.droplevel(1)

In [41]:
genres = movieGenres.groupby('genre').count()

In [53]:
movieWeights = pd.DataFrame(data=movies['movieId'])
for genre in genres.index:
    df = pd.DataFrame(data=movieGenres[movieGenres['genre']==genre], columns=[genre])
    df[genre] = 1
    movieWeights = movieWeights.join(df, on='movieId')
movieWeights.fillna(0, inplace=True)
movieWeights

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,10,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Make Regression Model for Users

In [54]:
ratings = pd.read_csv('ratings-9_1.csv')
train = ratings[ratings['type'] == 'train'][['userId', 'movieId', 'rating']]
test = ratings[ratings['type'] == 'test'][['userId', 'movieId', 'rating']]

In [55]:
users = [33, 39, 77, 144, 238]

In [61]:
userId = 33
userRatings = train[train['userId'] == userId][['movieId', 'rating']]
userRatings = userRatings.sort_values(by='movieId')

In [63]:
userLRTrain = movieWeights[movieWeights['movieId'].isin(userRatings['movieId'].values)].sort_values(by='movieId')

In [70]:
X = userLRTrain.iloc[:,1:].values
Y = userRatings['rating'].values

In [85]:
from sklearn import linear_model as lm
reg = lm.LinearRegression()

In [86]:
reg.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [110]:
userTestRatings = pd.DataFrame(test[test['userId'] == userId])
userTestRatings

Unnamed: 0,userId,movieId,rating
6187,33,1060,4.0
6198,33,1291,4.0
6199,33,1347,2.0
6208,33,1982,4.0
6212,33,2005,4.0
6215,33,2064,5.0
6257,33,3794,4.0
6292,33,4678,3.0
6303,33,4974,3.0


### Make Prediction for Test Movies

In [111]:
pred = reg.predict(movieWeights[movieWeights['movieId'].isin(userTestRatings['movieId'].values)].iloc[:,1:].values)

In [115]:
userTestRatings['prediction'] = pd.Series(data=pred, index = userTestRatings.index)
userTestRatings

Unnamed: 0,userId,movieId,rating,prediction
6187,33,1060,4.0,3.46
6198,33,1291,4.0,2.88
6199,33,1347,2.0,2.97
6208,33,1982,4.0,3.99
6212,33,2005,4.0,2.57
6215,33,2064,5.0,3.22
6257,33,3794,4.0,3.49
6292,33,4678,3.0,2.88
6303,33,4974,3.0,3.63


In [116]:
mae = (userTestRatings['rating'] - userTestRatings['prediction']).abs().mean()
rmse = math.sqrt((userTestRatings['rating'] - userTestRatings['prediction']).pow(2).mean())
print(" MAE:", mae)
print("RMSE:", rmse)

 MAE: 0.790912429602
RMSE: 0.9650362717424512
