## 간단한 추천시스템 만들기

1. MoveieLens 데이터셋 불러오기
2. 학습셋과 평가셋 나누기
3. 간단한 추천시스템 알고리즘 만들기

   - 평점 예측
   - 평가는 RMSE

### 필요한 라이브러리 정의(Configuration

In [2]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [3]:
ratings_df = pd.read_csv('./ratings.csv', encoding='utf-8')
movies_df = pd.read_csv('./movies.csv', index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv( './tags.csv', encoding='utf-8')

In [4]:
print(ratings_df.shape)
print(ratings_df.head())

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


#### ratings 데이터 정보 확인

- 몇 명의 유저가 몇 개의 영화에 평점을 줬는가

- 각 유저가 어떤 영화에 평점을 줬는지 sparse matrix

In [5]:
num_users = ratings_df['userId'].unique()
num_movies = ratings_df['movieId'].unique()

print("총 유저 수: ", len(num_users)) #유저는 610명
print("총 영화 수: ", len(num_movies)) #총 영화 수 9724

총 유저 수:  610
총 영화 수:  9724


In [6]:
#유저가 각 영화에 어떤 평점을 주었는지

# pivot ratings into movie features
user_movie_matrix = ratings_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
 
# convert dataframe of movie features to scipy sparse matrix
sparse_mat = csr_matrix(user_movie_matrix.values)


In [7]:
print(user_movie_matrix)


userId   1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
movieId                                                    ...                  
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  0.0  ...  4.0  0.0  4.0   
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...  0.0  4.0  0.0   
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
193581   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193583   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193585   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193587   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193609   0.0  0.0  0.0  0.0 

In [8]:
#(x,y)기준으로 x는 unique의 영화, y는 user, 나머지는 평점
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [9]:
#dataframe을 만든다
#유저별로 평점을 준 영화 수
user_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix[int(x)].value_counts())[1:]) for x in user_movie_matrix.columns],
                           index = user_movie_matrix.columns, columns=['movies_rated'])

# sum(list(user_movie_matrix[1].value_counts())[1:])

In [10]:

user_info_df

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [11]:
#영화별로 평점을 받은 수
movie_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:]) for x in user_movie_matrix.index],
                           index = user_movie_matrix.index, columns=['users_rated'])
movie_info_df

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


### Movielens 데이터 셋 나누기

In [12]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)




In [13]:
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=1)

In [14]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(64534, 4)
(16134, 4)
(20168, 4)


#### test set에는 존재하지만, train set에는 없는 영화 혹은 사용자 비율


In [15]:
# 집합 A - 집합 B => 집합 B에는 없고 집합 A에만 있는 item 

# userId
print("사용자: ",len(list(set(test_df['userId'].unique()) - set(train_df['userId'].unique()))))

# movieId
print("영화: ", len(list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))))
print("test set의 전체 영화 수: ",  len(test_df['movieId'].unique()))

사용자:  0
영화:  911
test set의 전체 영화 수:  5171


**train/test만 나눴을 때는 786인데 지금은 911인 것으로 보아
학습량도 충분하지 않고, 예측해야할 것이 많으니 loss값이 다이나믹하게 낮아지진 않을 것 같다.**

In [16]:
movies_not_included = list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))
print(sorted(movies_not_included)[:10])

not_included_df = test_df[test_df.movieId.isin(movies_not_included)].sort_values(by='movieId')
print(not_included_df.head(10))

print("train set에 없고, test set에만 있는 영화 데이터 수: ", not_included_df.shape)

[40, 49, 117, 137, 178, 179, 241, 269, 304, 320]
       userId  movieId  rating  timestamp
84607     544       40     5.0  850688776
29386     202       49     3.0  974925453
97066     604      117     3.0  832080636
99501     609      137     3.0  847221054
27959     191      178     1.0  829760898
632         6      179     1.0  845555362
98493     607      241     4.0  964744490
27975     191      269     3.0  829760898
55216     367      304     5.0  997813416
695         6      304     4.0  845555546
train set에 없고, test set에만 있는 영화 데이터 수:  (1002, 4)


**1002개 의 데이터는 예측을 받지 못했다. 학습셋에 데이터가 없기 때문이다.**

#### 간단한 추천 알고리즘 만들기

1. 랜덤으로 평점 예측
2. 영화 평균 평점기반 예측
3. 사용자 평균 평점기반 예측
4. Rule기반 영화 랭킹 예측

**랜덤으로 평점예측**

In [21]:
#0.5~5.0사이의 숫자를 예측해야할 평점 수만큼 generate
ratings_df = np.arange(0.5,5.5,step=0.5)
ratings_df

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [22]:
import random
pred_random = [random.choice(ratings_df) for x in range(len(test_df))]
pred_random[:10]

[3.0, 2.5, 3.5, 2.0, 2.0, 5.0, 2.5, 4.0, 2.5, 5.0]

In [23]:
test_df['pred_ratings_random'] = pred_random

In [24]:
random_mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_random'].values)
random_rmse = np.sqrt(random_mse)

print(random_mse, random_rmse)

3.6854174930583103 1.9197441217668334


**영화 평균 평점기반 예측**

1. train set의 모든 영화에 대해서 평균 평점 구하기
2. test set예측 시, train set의 영화 평균 평점 활용하고 없다면 random 선택

In [25]:
train_movie_df = train_df.groupby('movieId').mean()

print(train_movie_df.shape)
print(train_movie_df.head())

(8263, 3)
             userId    rating     timestamp
movieId                                    
1        310.875862  3.889655  1.134420e+09
2        322.835821  3.373134  1.141576e+09
3        249.815789  3.407895  9.926473e+08
4        169.666667  2.666667  8.432490e+08
5        316.161290  3.032258  1.018687e+09


In [28]:
def avg_rating_prediction(training_set, x):
    if x in training_set.index:
        pred_rating = training_set.loc[x]['rating']
    else:
        pred_rating = random.choice(ratings_df)
    return pred_rating

In [34]:
test_df['pred_rating_user'] = test_df['userId'].apply(lambda x: avg_rating_prediction(train_movie_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user
99731,610,3527,5.0,1479545223,3.0,3.647059,3.527778
97583,606,1250,3.5,1171376891,2.5,4.133333,4.0
38197,262,213,5.0,840310907,3.5,3.5,3.909091
11474,68,69406,3.0,1261622505,2.0,3.4375,3.75
34105,232,4728,3.0,1218166950,2.0,2.7,4.0


In [37]:
movie_mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_movie'].values)
movie_rmse = np.sqrt(movie_mse)

print(movie_mse, movie_rmse)

1.0836519073833304 1.040986026507239


**사용자 평균 평점기반 예측**

1. train set의 모든 유저가 준 평균 평점
2. test set 예측할 때, 유저가 train set에서 준 평균 평점을 활용. 유저가 없을 경우 random 평점 적용

In [38]:
train_user_df = train_df.groupby('userId').mean()

print(train_user_df.shape)
print(train_user_df.head())

(610, 3)
             movieId    rating     timestamp
userId                                      
1        1935.315068  4.397260  9.649875e+08
2       75953.166667  3.944444  1.445715e+09
3        5833.291667  2.500000  1.306464e+09
4        1934.557143  3.692857  9.647095e+08
5         339.655172  3.620690  8.474351e+08


In [39]:
test_df['pred_rating_user'] = test_df['userId'].apply(lambda x: avg_rating_prediction(train_user_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user
99731,610,3527,5.0,1479545223,3.0,3.647059,3.692951
97583,606,1250,3.5,1171376891,2.5,4.133333,3.648707
38197,262,213,5.0,840310907,3.5,3.5,3.0
11474,68,69406,3.0,1261622505,2.0,3.4375,3.24106
34105,232,4728,3.0,1218166950,2.0,2.7,3.242857


In [40]:
user_mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_user'].values)
user_rmse = np.sqrt(user_mse)

print(user_mse, user_rmse)

0.8926074443084846 0.9447790452314682


**Rule 기반 영화 평점 예측하기**

train set에 포함된 유저의 영화 평균 평점과 영화의 장르를 활용하여, 장르별 평균 평점 계산 -> test set의 영화 장르의 평균 평점으로 예측

In [41]:
# create user_movie matrix by only using train_df
train_user_movie_matrix = train_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

In [42]:
train_user_movie_matrix


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,0.0,3.0,4.0,2.5,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
genres_df = movies_df['genres'].str.get_dummies(sep='|')
print(genres_df.shape)
genres_df = genres_df.loc[train_df.movieId.unique()]
print(genres_df.shape)
genres_df.head()

(9742, 20)
(8263, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
113350,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
2640,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3114,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
368,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
84152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0


In [44]:
# trainset에서 영화별 유저 평점 평균
train_movie_avg_ratings_df = train_user_movie_matrix.copy()
train_movie_avg_ratings_df = train_movie_avg_ratings_df.replace(0, np.NaN)
train_movie_avg_ratings_df = train_movie_avg_ratings_df.mean(axis = 1)

train_movie_avg_ratings_df.head()

movieId
1    3.889655
2    3.373134
3    3.407895
4    2.666667
5    3.032258
dtype: float64

In [45]:
# genres_df에서 해당 장르가 포함된 모든 영화 index를 가져와서, 해당 영화의 유저 평균 평점의 평균을 구해서 장르 평균 평점으로 활용
genres_avg_ratings_df = pd.DataFrame(index=genres_df.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings_df.index:
    genre_avg_rating = train_movie_avg_ratings_df.loc[genres_df[genres_df[genre].isin([1])].index].mean()
    genres_avg_ratings_df.loc[genre]['avg_ratings'] = genre_avg_rating

genres_avg_ratings_df

Unnamed: 0,avg_ratings
(no genres listed),3.28125
Action,3.14584
Adventure,3.237536
Animation,3.473996
Children,3.101268
Comedy,3.180659
Crime,3.325186
Documentary,3.780359
Drama,3.448573
Fantasy,3.251763


In [46]:
def get_genre_avg_ratings(x):
    genres_list = movies_df.loc[x]['genres'].split('|')
    rating = 0
    for genre in genres_list:
        rating += genres_avg_ratings_df.loc[genre]['avg_ratings']
    
    return rating / len(genres_list)

In [48]:
tqdm.pandas()
test_df['pred_rating_genre'] = test_df['movieId'].progress_apply(lambda x: get_genre_avg_ratings(x))

100%|██████████| 20168/20168 [00:08<00:00, 2511.41it/s]


In [49]:
test_df


Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genre
99731,610,3527,5.0,1479545223,3.0,3.647059,3.692951,3.166174
97583,606,1250,3.5,1171376891,2.5,4.133333,3.648707,3.412840
38197,262,213,5.0,840310907,3.5,3.500000,3.000000,3.448573
11474,68,69406,3.0,1261622505,2.0,3.437500,3.241060,3.270600
34105,232,4728,3.0,1218166950,2.0,2.700000,3.242857,3.180659
...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,3.5,4.131579,3.685841,3.150190
4897,31,780,4.0,850466616,2.0,3.482270,3.695652,3.184014
8023,56,410,3.0,835799188,1.0,3.170213,3.857143,3.177897
77467,483,2291,4.0,1415579167,4.5,3.769231,3.607221,3.353626


In [50]:
rule_mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_genre'].values)
rule_rmse = np.sqrt(rule_mse)

print(rule_mse, rule_rmse)

1.1187629293852401 1.0577159020196492


**Rule기반2 : user의 평균 영화 평점을 normalize해서 확인, 평점 측정 수, 표준편차 활용**



In [62]:
#평균 점수를 얼마나 주었는지 그리고 편차가 어떻게 되는지를 통해서
#믿을만한 유저인지에 대한 판단이 가능함
train_user_info_df = pd.DataFrame({
    'avg_ratings': train_df.groupby('userId')['rating'].mean(),
    'std_ratings': train_df.groupby('userId')['rating'].std(),
    'count_ratings': train_df.groupby('userId')['rating'].count(),

})

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.397260,0.756393,146
2,3.944444,0.889297,18
3,2.500000,2.136637,24
4,3.692857,1.340549,140
5,3.620690,1.115277,29
...,...,...,...
606,3.648707,0.725049,696
607,3.694915,0.956386,118
608,3.174286,1.072561,525
609,3.181818,0.394771,22


In [63]:
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

train_user_info_df['weights']=train_user_info_df['count_ratings'].apply(lambda x:(x-avg_count)/(max_count-min_count))

In [64]:
train_user_info_df
#가중치를 통해서 신뢰도에 대한 근거 확충

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.397260,0.756393,146,0.023268
2,3.944444,0.889297,18,-0.050806
3,2.500000,2.136637,24,-0.047334
4,3.692857,1.340549,140,0.019795
5,3.620690,1.115277,29,-0.044441
...,...,...,...,...
606,3.648707,0.725049,696,0.341555
607,3.694915,0.956386,118,0.007064
608,3.174286,1.072561,525,0.242596
609,3.181818,0.394771,22,-0.048492


In [67]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(np_scaled, columns = train_user_info_df.columns, index=train_user_info_df.index)
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.841705,0.354011,0.080440,0.080440
2,0.722783,0.416214,0.006366,0.006366
3,0.343434,1.000000,0.009838,0.009838
4,0.656710,0.627411,0.076968,0.076968
5,0.637757,0.521978,0.012731,0.012731
...,...,...,...,...
606,0.645115,0.339341,0.398727,0.398727
607,0.657250,0.447613,0.064236,0.064236
608,0.520519,0.501986,0.299769,0.299769
609,0.522498,0.184763,0.008681,0.008681


In [68]:
df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings'] *5
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights,normalized_avg_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.841705,0.354011,0.080440,0.080440,4.208524
2,0.722783,0.416214,0.006366,0.006366,3.613917
3,0.343434,1.000000,0.009838,0.009838,1.717172
4,0.656710,0.627411,0.076968,0.076968,3.283550
5,0.637757,0.521978,0.012731,0.012731,3.188784
...,...,...,...,...,...
606,0.645115,0.339341,0.398727,0.398727,3.225575
607,0.657250,0.447613,0.064236,0.064236,3.286252
608,0.520519,0.501986,0.299769,0.299769,2.602597
609,0.522498,0.184763,0.008681,0.008681,2.612489


In [69]:
test_df['pred_rating_normalized'] = test_df['userId'].apply(lambda x:df_normalized.loc[x]['normalized_avg_ratings'])
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genre,pred_rating_normalized
99731,610,3527,5.0,1479545223,3.0,3.647059,3.692951,3.166174,3.283673
97583,606,1250,3.5,1171376891,2.5,4.133333,3.648707,3.412840,3.225575
38197,262,213,5.0,840310907,3.5,3.500000,3.000000,3.448573,2.373737
11474,68,69406,3.0,1261622505,2.0,3.437500,3.241060,3.270600,2.690281
34105,232,4728,3.0,1218166950,2.0,2.700000,3.242857,3.180659,2.692641
...,...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,3.5,4.131579,3.685841,3.150190,3.274336
4897,31,780,4.0,850466616,2.0,3.482270,3.695652,3.184014,3.287220
8023,56,410,3.0,835799188,1.0,3.170213,3.857143,3.177897,3.499278
77467,483,2291,4.0,1415579167,4.5,3.769231,3.607221,3.353626,3.171098


In [71]:
normal_mse= mean_squared_error(y_true = test_df['rating'].values, y_pred = test_df['pred_rating_normalized'].values)
normal_rmse = np.sqrt(normal_mse)

print(normal_mse, normal_rmse)

1.128213849523443 1.0621741145045114


In [72]:
#train/test:8,2, train,val,test : 6,2,2,를 한 것을 비교
print('random 기반')
print(3.7410005950019833 ,1.9341666409598692)
print(random_mse, random_rmse)
print('-' *100)

print('영화 평균 평점기반 예측')
print(1.0495663597492553,1.0244834599686103)
print(movie_mse, movie_rmse)
print('-'*100)

print('사용자 평균 평점기반 예측')
print(0.8905889036428333, 0.9437101798978504)
print(user_mse, user_rmse)
print('-' * 100)

print('Rule 기반 영화 평점 예측하기')
print(1.120579096060227,1.05857408624065)
print(rule_mse, rule_rmse)
print('-'*100)

print('정규화를 활용하여 Rule기반 영화 평점 예측하기')
print(1.120579096060227, 1.05857408624065)
print(normal_mse, normal_rmse)


random 기반
3.7410005950019833 1.9341666409598692
3.6854174930583103 1.9197441217668334
----------------------------------------------------------------------------------------------------
영화 평균 평점기반 예측
1.0495663597492553 1.0244834599686103
1.0836519073833304 1.040986026507239
----------------------------------------------------------------------------------------------------
사용자 평균 평점기반 예측
0.8905889036428333 0.9437101798978504
0.8926074443084846 0.9447790452314682
----------------------------------------------------------------------------------------------------
Rule 기반 영화 평점 예측하기
1.120579096060227 1.05857408624065
1.1187629293852401 1.0577159020196492
----------------------------------------------------------------------------------------------------
정규화를 활용하여 Rule기반 영화 평점 예측하기
1.120579096060227 1.05857408624065
1.128213849523443 1.0621741145045114


### 결론

**첫번쨰**

사용자기반으로 추천을 하는 것이 가장 error가 낮은 이유는 결국 사람이 영화에 평점을 매기기 때문입니다.

그것을 통해서, 사용자가 평점을 매길 때 보편적으로 잘 주는지 평균적으로 주는지 아니면 잘 안 주는지에 대해서도 나눌 수 있기 때문입니다.

<br>

**두번째**

데이터셋이 작기 때문에 validation으로 나누게 되면 충분한 학습이 되지 않아서 validation을 하지 않은 것보다 성능이 좋지 못한다.

무분별하게 나누는 것보단 비교를 통해서 각 상황별로 쓰는 게 맞는 것 같다.


<br>

검색을 해보니 데이터가 적은데 나누고 싶을 때는 KFold validation이 좋다고 해서 시도를 했다.

