<center><img src="img/logo_hse_black.jpg"></center>

<h1><center>Data Analysis</center></h1>
<h2><center>Seminar: Recsys </center></h2>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

# Data preprocessing

1. Download [data](https://cloud.mail.ru/public/CSjR/mPctRVc2u) with ratings and movies
2. Examine ids of movies and users
3. Examine documentation of scipy.sparse.coo_matrix
3. Encode ids in a proper way to ease creation of rating matrix


In [2]:
df_ratings = pd.read_csv('data/user_ratedmovies.dat', sep='\t')

In [3]:
df_movies = pd.read_csv('data/movies.dat', sep='\t', encoding = "ISO-8859-1")

In [4]:
df_ratings.head()

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30


In [5]:
df_movies.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


In [6]:
df_ratings.userID.min(), df_ratings.userID.max()

(75, 71534)

In [7]:
df_ratings.movieID.min(), df_ratings.movieID.max()

(1, 65133)

Unique values without nan

In [8]:
df_ratings.userID.nunique()

2113

In [9]:
df_ratings.movieID.nunique()

10109

In [10]:
df_movies.loc[:, 'id'].nunique()

10197

### One-hot encoding userID and movieID

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
enc_user = LabelEncoder()
enc_movies = LabelEncoder()

In [13]:
enc_user.fit(df_ratings.userID.values)

LabelEncoder()

In [14]:
enc_movies.fit(df_movies.loc[:, 'id'].values)

LabelEncoder()

In [15]:
df_ratings.loc[:, 'userID'] = \
        enc_user.transform(df_ratings.loc[:, 'userID'].values)

In [16]:
df_ratings.loc[:, 'movieID'] = \
        enc_movies.transform(df_ratings.loc[:, 'movieID'].values)

In [17]:
df_movies.loc[:, 'id'] = \
        enc_movies.transform(df_movies.loc[:, 'id'].values)

In [18]:
from scipy.sparse import coo_matrix

In [19]:
n_movies = enc_movies.classes_.shape[0]
n_users = enc_user.classes_.shape[0]
n_movies, n_users

(10197, 2113)

In [20]:
R = coo_matrix((df_ratings.rating.values, 
                (df_ratings.userID.values, df_ratings.movieID.values)))

In [21]:
R

<2113x10197 sparse matrix of type '<class 'numpy.float64'>'
	with 855598 stored elements in COOrdinate format>

In [22]:
#number of non zero values
R.nnz

855598

In [23]:
n_movies * n_users

21546261

# SVD on Rating matrix
## Compressed representation of movies

1. Find latent representation of movies with scipy.sparse.linalg.svds
2. For each movie find 10 nearest neigbours in that feature space

In [24]:
from scipy.sparse.linalg import svds

In [25]:
U, S, V_T = svds(R, k=10)

In [26]:
U.shape

(2113, 10)

In [27]:
V_T.shape

(10, 10197)

In [28]:
#S is diagonal matrix ant it save like np.array
S

array([ 232.86920461,  261.70125609,  300.14199845,  315.32085391,
        348.46928961,  388.72357327,  395.35976356,  493.67832495,
        581.54459324, 1861.07048692])

In [29]:
V = V_T.T

In [30]:
from sklearn.neighbors import NearestNeighbors

In [31]:
#fine nearest 10 (11 - 1) neighbors for each vector, specify metric 
nn = NearestNeighbors(n_neighbors=11, metric='cosine', n_jobs=-1)
nn.fit(V)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=11, p=2, radius=1.0)

In [32]:
ind = nn.kneighbors(V, n_neighbors=11, return_distance=False)

In [33]:
ind.shape

(10197, 11)

In [34]:
df_movies.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,0,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,1,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,2,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,3,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,4,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


In [35]:
movie_title = df_movies.title.values

In [36]:
df_nn_movies = pd.DataFrame(data=movie_title[ind], 
                            columns=['movie'] + ['nn{}'.format(i+1) for i in range(10)])

In [37]:
ind[0]

array([   0,  565, 2873, 6010, 4007, 4576, 1156,  461,  351, 2404, 7925])

In [38]:
#fine similar movies for cartoon Shrek
idx = df_nn_movies.movie.str.contains('Shrek')
df_nn_movies.loc[idx]

Unnamed: 0,movie,nn1,nn2,nn3,nn4,nn5,nn6,nn7,nn8,nn9,nn10
4007,Shrek,Finding Nemo,Pirates of the Caribbean: The Curse of the Bla...,"Monsters, Inc.",The Lord of the Rings: The Two Towers,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Return of the King,Men in Black,Spider-Man,Star Wars: Episode I - The Phantom Menace,Shrek 2
7437,Shrek 2,"Monsters, Inc.",Finding Nemo,Harry Potter and the Chamber of Secrets,Harry Potter and the Prisoner of Azkaban,Harry Potter and the Sorcerer's Stone,Harry Potter and the Goblet of Fire,Pirates of the Caribbean: The Curse of the Bla...,Shrek,Ice Age,"The Chronicles of Narnia: The Lion, the Witch ..."
9507,Shrek the Third,Happy Feet,Sydney White,The Wild,Alvin and the Chipmunks,Going the Distance,The Spiderwick Chronicles,Journey to the Center of the Earth,Open Season,I Know Who Killed Me,Monster House
10146,Shrek the Halls,Witless Protection,The Big Squeeze,Just a Little Harmless Sex,I Confess,Hush,White Lightning,The Chase,The Longest Yard,FM,Road to Rio


# User-based CF

* Split data to train and test in proportion to 80/20
* Implement similarity function
* Implement User-based CF based on $K$ most similar users. How does MAE changes with $K$ in range [5-25]
* Repeat this process with normalized ratings

In [39]:
from scipy.spatial.distance import cosine
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.spatial.distance import correlation
from sklearn.metrics import pairwise_distances

In [40]:
df_ratings.columns = ['userID', 'movieID', 'rating', 
                      'day', 'month', 'year', 'hour', 'minute', 'second']

In [41]:
df_ratings.loc[:, 'datetime']  = \
        pd.to_datetime(df_ratings.loc[:, ['day', 'month', 'year']])

#### Split train/test using datetime

In [42]:
q = df_ratings.datetime.quantile(.8)

In [43]:
idx = df_ratings.datetime <= q
df_ratings_train = df_ratings.loc[idx]

In [44]:
idx = df_ratings.datetime > q
df_ratings_test = df_ratings.loc[idx]

In [45]:
df_ratings_train.shape

(684810, 10)

In [46]:
df_ratings_test.shape

(170788, 10)

In [47]:
n_movies = enc_movies.classes_.shape[0]
n_users = enc_user.classes_.shape[0]

In [48]:
R_train = coo_matrix((df_ratings_train.rating.values, 
                     (df_ratings_train.userID.values, df_ratings_train.movieID.values)),
                     shape=(n_users, n_movies))

In [49]:
R_train.shape

(2113, 10197)

In [50]:
def my_metric(u, v):
    #find idx where u not eq 0 and v not eq 0
    idx = (u != 0) & (v != 0)
    if any(idx):
        sim_score = 2 - cosine(u[idx], v[idx])
    else:
        sim_score = 0
    return sim_score

In [51]:
R_train = R_train.toarray()

In [52]:
#convert coo_matrix to np.array
type(R_train)

numpy.ndarray

In [53]:
#compute distance using custom my_metric
sim = pdist(R_train, metric=my_metric)

In [54]:
Sim = squareform(sim)

In [55]:
df_ratings_test.head()

Unnamed: 0,userID,movieID,rating,day,month,year,hour,minute,second,datetime
925,5,164,1.0,30,7,2008,18,59,11,2008-07-30
936,5,354,3.0,30,7,2008,18,33,3,2008-07-30
937,5,355,3.0,30,7,2008,18,38,38,2008-07-30
938,5,367,2.5,27,8,2008,4,26,37,2008-08-27
939,5,461,4.5,16,4,2008,19,58,20,2008-04-16


In [56]:
predicted_ratings = []

for i, r in tqdm_notebook(df_ratings_test.iterrows()):
    watched_users = np.where(R_train[:, int(r['movieID'])])[0]
    sim = Sim[int(r['userID']), watched_users]
    sorted_idx = np.argsort(sim)
    for k in range(5, 25):
        
        ratings = R_train[watched_users[sorted_idx[-k:]], int(r['movieID'])]
        
        sim_k = sim[sorted_idx[-k:]]
        
        prediction = ratings.dot(sim_k)/(sim_k.sum()) 

        predicted_ratings.append({'userID': r['userID'],
                                  'movieID': r['movieID'],
                                  'prediction': prediction,
                                  'k': k})

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  del sys.path[0]





In [57]:
df_predicted_ratings = pd.DataFrame(predicted_ratings)

In [58]:
df_predicted_ratings.head()

Unnamed: 0,k,movieID,prediction,userID
0,5,164,2.496698,5
1,6,164,2.580224,5
2,7,164,2.639898,5
3,8,164,2.622487,5
4,9,164,2.719535,5


In [59]:
df_predicted_ratings = \
    df_predicted_ratings.join(df_ratings_test.loc[:, ['movieID', 'userID', 'rating']].set_index(['movieID', 'userID']),
                             on=['movieID', 'userID'])

In [60]:
df_predicted_ratings.loc[:, 'error'] = abs(df_predicted_ratings.prediction - df_predicted_ratings.rating)

In [61]:
df_predicted_ratings.groupby('k').error.mean()

k
5     0.735228
6     0.726475
7     0.719277
8     0.713325
9     0.709649
10    0.706283
11    0.703417
12    0.700787
13    0.698963
14    0.697120
15    0.695238
16    0.693862
17    0.693168
18    0.691776
19    0.690907
20    0.690075
21    0.689213
22    0.688546
23    0.687775
24    0.687164
Name: error, dtype: float64