In [13]:
import numpy as np
import pandas as pd
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import pairwise_distances
from helperFunctions import *
from __future__ import division
import random

# Column layout shared by u.data and the u{k}.base / u{k}.test split files.
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=header)

# Count the distinct users and items that appear in the full ratings file.
n_users = len(df["user_id"].unique())
n_items = len(df["item_id"].unique())
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

# All-zero users x items frame labelled with the sorted ids; it is handed to
# rating_matrix() later on for every train/test split.
template = pd.DataFrame(
    np.zeros((n_users, n_items)),
    index=np.sort(df["user_id"].unique()),
    columns=np.sort(df["item_id"].unique()),
)

Number of users = 943 | Number of movies = 1682


In [14]:
# Genre lookup table: one "name|id" pair per line. The names are reused as
# column labels when u.item is loaded.
genre = pd.read_csv(
    "u.genre",
    sep='|',
    names=["genre", "index"],
)

In [17]:
# Item table: five descriptive columns followed by one column per genre name
# taken from the genre table. The file is read as latin-1.
items = pd.read_csv(
    "u.item",
    sep='|',
    encoding='latin-1',
    names=["index", "movie", "date", "provider", "imdb"] + genre["genre"].tolist(),
)

In [18]:
# Add a constant column of ones right after the five descriptive columns.
# DataFrame.insert raises ValueError if the column already exists, so guard
# the call to keep this cell safe to re-run on a live kernel.
if 'intercept' not in items.columns:
    items.insert(loc=5, column='intercept', value=1)

In [19]:
# Order the item rows by their "index" column so that row position follows
# ascending item id.
items = items.sort_values(by="index")

In [20]:
# Preview the first rows of the item table to check the column layout.
items.head()

Unnamed: 0,index,movie,date,provider,imdb,intercept,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
# User demographics, ordered by the "index" column (the user id).
users = (
    pd.read_csv("u.user", sep='|', names=["index", "age", "gender", "occupation", "zipcode"])
    .sort_values(by="index")
)

In [22]:
# Preview the first rows of the user table.
users.head()

Unnamed: 0,index,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [23]:
# One-hot encode the categorical columns; drop_first=True omits one level per
# column so the dummies are not collinear with the intercept added below.
users1 = pd.get_dummies(users, columns=["gender", "occupation"], drop_first=True)
# The id and zipcode columns are not used as features.
users1 = users1.drop(labels=["index", "zipcode"], axis=1)
# Explicit constant column, placed first.
users1.insert(loc=0, column="intercept", value=1)

In [36]:
# Preview the encoded user feature table.
users1.head()

Unnamed: 0,intercept,age,gender_M,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,24,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,53,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,23,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,24,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,33,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [37]:
# Design matrix used for the per-movie regressions: only the intercept, age
# and gender indicator are kept (the occupation dummies are left out).
users2 = users1.loc[:, ["intercept", "age", "gender_M"]].values

In [38]:
# Display the user design matrix (columns: intercept, age, gender_M).
users2

array([[ 1, 24,  1],
       [ 1, 53,  0],
       [ 1, 23,  1],
       ..., 
       [ 1, 20,  1],
       [ 1, 48,  0],
       [ 1, 22,  1]])

In [40]:
# NOTE: despite the variable name, this matrix holds pairwise cosine
# *distances* (smaller = more alike), computed on every column left after
# the descriptive ones are dropped (the intercept column, if it was inserted,
# plus the genre flags).
movie_similarity = pairwise_distances(
    items.drop(["index", "movie", "date", "provider", "imdb"], axis=1),
    metric="cosine",
)
# Put +inf on the diagonal so that a movie is never selected as one of its
# own nearest neighbours when the smallest distances are picked.
np.fill_diagonal(movie_similarity, np.inf)

In [42]:
# Display the pairwise cosine-distance matrix (diagonal entries are inf).
movie_similarity

array([[        inf,  0.75      ,  0.64644661, ...,  0.71132487,
         0.29289322,  0.64644661],
       [ 0.75      ,         inf,  0.29289322, ...,  0.71132487,
         0.64644661,  0.64644661],
       [ 0.64644661,  0.29289322,         inf, ...,  0.59175171,
         0.5       ,  0.5       ],
       ..., 
       [ 0.71132487,  0.71132487,  0.59175171, ...,         inf,
         0.59175171,  0.18350342],
       [ 0.29289322,  0.64644661,  0.5       , ...,  0.59175171,
                inf,  0.5       ],
       [ 0.64644661,  0.64644661,  0.5       , ...,  0.18350342,
         0.5       ,         inf]])

In [27]:
# For each movie, the positions of the 5 movies with the smallest distance.
# argpartition(row, 5) guarantees the first five entries are the five
# smallest values, in no particular order among themselves.
most_movie_similarity = [
    np.argpartition(movie_similarity[i, :], 5)[:5]
    for i in range(n_items)
]

In [28]:
# Single estimator reused for every per-movie fit. fit_intercept=False because
# the user feature matrix (users2) already carries an explicit "intercept"
# column of ones.
lr = LinearRegression(fit_intercept=False)

In [34]:
# Evaluate on the five predefined u{t}.base / u{t}.test splits: fit one
# linear model per movie (rating ~ user features), then score on the test
# split. Squared error is accumulated across splits in total_sse.
total_sse = 0
for t in range(1, 6):
    ubase = pd.read_csv("u{}.base".format(t), sep='\t', names=header)
    utest = pd.read_csv("u{}.test".format(t), sep='\t', names=header)
    # rating_matrix comes from elsewhere; presumably it returns a
    # users x items array laid out like `template` with 0 for "no rating"
    # -- TODO confirm against its definition.
    train, test = rating_matrix(ubase, template), rating_matrix(utest, template)

    # One coefficient row per movie, one column per user feature.
    theta = np.zeros((n_items, users2.shape[1]))
    print ("Content based regression on train set {} started ......".format(t))
    for i in range(n_items):
        # Row positions of the users with a positive training rating for movie i.
        mask = np.where(train[:, i] > 0)[0]
        if len(mask) >= users2.shape[1]:
            lr.fit(users2[mask, :], train[mask, i])
            theta[i, :] = lr.coef_
        else:
            # Fewer ratings than coefficients: mark the row as missing and
            # impute it from neighbouring movies in the next loop.
            theta[i, :] = np.nan

    for i in range(n_items):
        if not np.isnan(theta[i, :]).any():
            continue
        # Replace a missing row with the NaN-ignoring mean of its five
        # nearest movies. theta is updated in place, so rows imputed earlier
        # in this loop can contribute to later imputations.
        theta[i, :] = np.nanmean(theta[most_movie_similarity[i], :], axis=0)
        if np.isnan(theta[i, :]).any():
            # Still missing (every neighbour was missing too): report the movie.
            print (i)

    # Predicted rating for every (user, movie) pair.
    rating_hat = np.dot(users2, theta.T)
    # sse comes from elsewhere; presumably the summed squared error over the
    # observed test ratings -- TODO confirm.
    tmp = sse(test, rating_hat)
    print (math.sqrt(tmp / len(utest)))
    total_sse += tmp
    print ("Content based regression on train set {} ended".format(t))

Content based regression on train set 1 started ......
1.1391644770230356
Content based regression on train set 1 ended
Content based regression on train set 2 started ......
1.085667022241299
Content based regression on train set 2 ended
Content based regression on train set 3 started ......
1.0927996884590654
Content based regression on train set 3 ended
Content based regression on train set 4 started ......
1.1039467514870387
Content based regression on train set 4 ended
Content based regression on train set 5 started ......
1.0698597987387015
Content based regression on train set 5 ended


In [35]:
# Pooled RMSE over all five splits. The row count of the full ratings file is
# the denominator, which assumes the five test splits together hold exactly
# that many ratings -- TODO confirm.
math.sqrt(total_sse / len(df))

1.098533401136778