# Load Libraries

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

# Load Data

In [3]:
path = 'data/ml-latest-small/'

In [4]:
ratings = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [5]:
print(ratings.shape)
print(movies.shape)
print(tags.shape)

(100836, 4)
(9742, 2)
(3683, 4)


# EDA

## ratings

* 몇 명의 유저가 몇 개의 영화에 평점을 주었는지 확인
* 각 유저가 어떤 영화에 평점을 줬는디 sparse matrix 만들기

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
user_list = ratings['userId'].unique()
movie_list = ratings['movieId'].unique()

print('총 유저 수 : ', len(user_list))
print('총 영화 수 : ', len(movie_list))

총 유저 수 :  610
총 영화 수 :  9724


In [8]:
user_movie_matrix = ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

sparse_mt = csr_matrix(user_movie_matrix.values)

In [12]:
user_movie_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
print(sparse_mt)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [15]:
user_info_df = pd.DataFrame(data=[sum(list(user_movie_matrix[int(x)].value_counts())[1:]) for x in user_movie_matrix.columns],
                            index=user_movie_matrix.columns, columns=['movie_rated'])

In [16]:
user_info_df.head(10)

Unnamed: 0_level_0,movie_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
6,314
7,152
8,47
9,46
10,140


In [20]:
user_info_df.describe()

Unnamed: 0,movie_rated
count,610.0
mean,165.304918
std,269.480584
min,20.0
25%,35.0
50%,70.5
75%,168.0
max,2698.0


In [17]:
movie_info_df = pd.DataFrame(data=[sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:]) for x in user_movie_matrix.index],
                            index=user_movie_matrix.index, columns=['users_rated'])

In [18]:
movie_info_df.head(10)

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


In [19]:
movie_info_df.describe()

Unnamed: 0,users_rated
count,9724.0
mean,10.369807
std,22.401005
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,329.0


# Split Data

In [21]:
train, test = train_test_split(ratings, test_size=0.2, random_state=1990)

In [22]:
print(train.shape)
print(test.shape)

(80668, 4)
(20168, 4)


In [23]:
len(list(set(test['userId'].unique()) - set(train['userId'].unique())))

0

In [24]:
len(list(set(test['movieId'].unique()) - set(train['movieId'].unique())))

835

In [25]:
len(test['movieId'].unique())

5213

# Base Model

## 1. Random

In [27]:
ratings_range = np.arange(0.5, 5.5, step=0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [28]:
import random
pred_random = [random.choice(ratings_range) for x in range(len(test))]
pred_random[:10]

[4.0, 4.0, 1.0, 1.5, 1.0, 1.0, 4.5, 0.5, 3.0, 5.0]

In [29]:
test['pred_ratings_random'] = pred_random

In [30]:
test.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random
9651,64,2291,4.0,1161521153,4.0
62378,414,185,2.0,961515354,4.0
8742,60,1242,4.0,1393541757,1.0
33791,230,1682,2.5,1196304802,1.5
66797,431,3948,3.0,1267051796,1.0
89155,577,1968,3.0,945978474,1.0
65993,425,1320,4.0,1085490710,4.5
45565,301,1097,4.0,1211377777,0.5
99150,608,3354,2.5,1117491740,3.0
90611,590,903,4.5,1258421384,5.0


In [32]:
mse = mean_squared_error(y_true=test['rating'].values, y_pred=test['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

3.7571028361761205 1.938324749926111


## 2. Movie-Average

In [44]:
train_movie = train.groupby('movieId').mean()

In [45]:
train_movie.head(10)

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,307.325843,3.896067,1133684000.0
2,314.382979,3.414894,1132195000.0
3,285.44186,3.244186,1025835000.0
4,242.5,2.25,905213300.0
5,303.30303,3.136364,977346400.0
6,301.033708,3.949438,1058212000.0
7,293.073171,3.085366,968949000.0
8,336.166667,2.5,956764100.0
9,364.666667,3.0,950381000.0
10,308.697248,3.490826,1041163000.0


In [46]:
def avg_rating_prediction(train_set, x):
    if x in train_set.index:
        pred_rating = train_set.loc[x]['rating']
    else:
        pred_rating = random.choice(ratings_range)
    return pred_rating

In [47]:
test['pred_rating_movie_average'] = test['movieId'].apply(lambda x: avg_rating_prediction(train_movie, x))

In [48]:
test.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie_average
9651,64,2291,4.0,1161521153,4.0,3.656716
62378,414,185,2.0,961515354,4.0,3.0
8742,60,1242,4.0,1393541757,1.0,4.111111
33791,230,1682,2.5,1196304802,1.5,3.779412
66797,431,3948,3.0,1267051796,1.0,3.45
89155,577,1968,3.0,945978474,1.0,3.816667
65993,425,1320,4.0,1085490710,4.5,3.197368
45565,301,1097,4.0,1211377777,0.5,3.733696
99150,608,3354,2.5,1117491740,3.0,2.6
90611,590,903,4.5,1258421384,5.0,3.961538


In [49]:
mse2 = mean_squared_error(y_true=test['rating'].values, y_pred=test['pred_rating_movie_average'].values)
rmse2 = np.sqrt(mse2)

print(mse2, rmse2)

1.0610871731369051 1.0300908567388147


## 3. User-Average

In [50]:
train_user = train.groupby('userId').mean()
train_user.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1790.632432,4.345946,964986400.0
2,65843.727273,3.863636,1445715000.0
3,5635.823529,2.397059,1306464000.0
4,1978.586957,3.548913,965458700.0
5,334.0,3.648649,847435000.0


In [51]:
test['pred_rating_user_average'] = test['userId'].apply(lambda x: avg_rating_prediction(train_user, x))

In [52]:
test.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie_average,pred_rating_user_average
9651,64,2291,4.0,1161521153,4.0,3.656716,3.796471
62378,414,185,2.0,961515354,4.0,3.0,3.387829
8742,60,1242,4.0,1393541757,1.0,4.111111,3.555556
33791,230,1682,2.5,1196304802,1.5,3.779412,2.837607
66797,431,3948,3.0,1267051796,1.0,3.45,2.625
89155,577,1968,3.0,945978474,1.0,3.816667,3.577236
65993,425,1320,4.0,1085490710,4.5,3.197368,3.530769
45565,301,1097,4.0,1211377777,0.5,3.733696,3.268041
99150,608,3354,2.5,1117491740,3.0,2.6,3.181682
90611,590,903,4.5,1258421384,5.0,3.961538,3.337931


In [57]:
mse3 = mean_squared_error(y_true=test['rating'].values, y_pred=test['pred_rating_user_average'].values)
rmse3 = np.sqrt(mse3)

print(mse3, rmse3)

0.8898362685686264 0.943311331729152


## 4. Genre-Average
장르별 평균 평점

In [60]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [61]:
train_user_movie_matrix = train.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
train_user_movie_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,0.0,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
genres = movies['genres'].str.get_dummies(sep='|')
genres = genres.loc[train_user_movie_matrix.index.unique()]

In [69]:
genres.shape

(8889, 20)

In [68]:
genres.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:
train_movie_avg_ratings = train_user_movie_matrix.copy()
train_movie_avg_ratings = train_movie_avg_ratings.replace(0, np.NaN)
train_movie_avg_ratings = train_movie_avg_ratings.mean(axis=1)

train_movie_avg_ratings.head()

movieId
1    3.896067
2    3.414894
3    3.244186
4    2.250000
5    3.136364
dtype: float64

In [74]:
genres_avg_ratings = pd.DataFrame(index=genres.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings.index:
    genres_avg_rating = train_movie_avg_ratings.loc[genres[genres[genre].isin([1])].index].mean()
    genres_avg_ratings.loc[genre]['avg_ratings'] = genres_avg_rating

In [75]:
genres_avg_ratings.head(10)

Unnamed: 0,avg_ratings
(no genres listed),3.350733
Action,3.102724
Adventure,3.205232
Animation,3.476945
Children,3.079641
Comedy,3.179317
Crime,3.314915
Documentary,3.775623
Drama,3.426599
Fantasy,3.215214


In [78]:
def get_genre_avg_ratings(x):
    genres_list = movies.loc[x]['genres'].split('|')
    rating = 0
    for genre in genres_list:
        rating += genres_avg_ratings.loc[genre]['avg_ratings']
    
    return rating/len(genres_list)

In [79]:
tqdm.pandas()
test['pred_rating_genre'] = test['movieId'].progress_apply(lambda x: get_genre_avg_ratings(x))

100%|██████████| 20168/20168 [00:02<00:00, 8998.18it/s]


In [80]:
test.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie_average,pred_rating_user_average,pred_rating_genre
9651,64,2291,4.0,1161521153,4.0,3.656716,3.796471,3.332948
62378,414,185,2.0,961515354,4.0,3.0,3.387829,3.199269
8742,60,1242,4.0,1393541757,1.0,4.111111,3.555556,3.486652
33791,230,1682,2.5,1196304802,1.5,3.779412,2.837607,3.250731
66797,431,3948,3.0,1267051796,1.0,3.45,2.625,3.179317
89155,577,1968,3.0,945978474,1.0,3.816667,3.577236,3.302958
65993,425,1320,4.0,1085490710,4.5,3.197368,3.530769,3.089446
45565,301,1097,4.0,1211377777,0.5,3.733696,3.268041,3.217506
99150,608,3354,2.5,1117491740,3.0,2.6,3.181682,3.146278
90611,590,903,4.5,1258421384,5.0,3.961538,3.337931,3.326404


In [81]:
mse4 = mean_squared_error(y_true=test['rating'].values, y_pred=test['pred_rating_genre'].values)
rmse4 = np.sqrt(mse4)

print(mse4, rmse4)

1.1184243775958396 1.0575558508163243
