# Simple RS System

- Data: MovieLens

- Train/ Test 나누기

- 평가: RMSE


## 필요한 라이브러리 정의 및 Data Load

In [None]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [None]:
# Load the three MovieLens CSV files from the current working directory.
ratings = pd.read_csv('./ratings.csv')
movies = pd.read_csv('./movies.csv')
tags = pd.read_csv('./tags.csv')

**이번 실습에는 평점 데이터를 중점으로 활용하겠습니다**

In [None]:
# Distinct user ids that appear in the ratings table.
users = ratings['userId'].unique()
# Distinct movie ids that appear in the ratings table. They get their own name
# so the `movies` DataFrame read from movies.csv stays available; later cells
# still read movies['genres'].
movie_ids = ratings['movieId'].unique()

print('총 유저 수:', len(users))
print('총 영화 수:', len(movie_ids))

총 유저 수: 610
총 영화 수: 9724


pivot함수: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html

In [None]:
# Reshape the long ratings table into a wide movie-by-user grid
# (rows: movieId, columns: userId); cells without a rating become 0.
user_movie_matrix = ratings.pivot(index='movieId', columns='userId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)

# Compressed-sparse-row version of the same grid, built from the raw ndarray.
sparse_mat = csr_matrix(user_movie_matrix.values)


In [None]:
# Show the dense movie-by-user rating grid.
print(user_movie_matrix)

userId   1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
movieId                                                    ...                  
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  0.0  ...  4.0  0.0  4.0   
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...  0.0  4.0  0.0   
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
193581   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193583   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193585   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193587   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193609   0.0  0.0  0.0  0.0 

In [None]:
print(sparse_mat)
# Each printed entry is keyed by (movie row position, user column position).

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [None]:
# Number of movies each user has rated.
# Unrated cells were filled with 0 and real ratings run from 0.5 to 5.0, so the
# count of non-zero cells in a user's column is the number of movies rated.
# Counting directly avoids relying on 0 being the most frequent value in every
# column, which a value_counts()[1:] slice would silently assume.
user_info_df = (user_movie_matrix != 0).sum(axis=0).to_frame('movies_rated')



In [None]:
# Display the per-user count of rated movies.
user_info_df

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [None]:
# Number of users that rated each movie.
# Unrated cells were filled with 0 and real ratings run from 0.5 to 5.0, so the
# count of non-zero cells in a movie's row is the number of users who rated it.
# Counting directly avoids relying on 0 being the most frequent value in every
# row, which a value_counts()[1:] slice would silently assume.
movie_info_df = (user_movie_matrix != 0).sum(axis=1).to_frame('users_rated')

In [None]:
# Display the per-movie count of users who rated it.
movie_info_df

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


### 학습

**데이터 나누기**


In [None]:
# Hold out 20% of the rating rows for evaluation; the fixed random_state
# keeps the split identical between runs.
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=1234)

In [None]:
# (rows, columns) of each split.
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


**test set에는 존재하지만 train set에는 없는 영화 및 사용자 수**

In [None]:
# Set difference A - B keeps the items found only in A, so
# (test ids) - (train ids) is what the training split never saw.

# userId: users that occur only in the test split
print("사용자: ", len(set(test_df['userId'].unique()).difference(train_df['userId'].unique())))

# movieId: movies that occur only in the test split, then all test movies
print("영화: ", len(set(test_df['movieId'].unique()).difference(train_df['movieId'].unique())))
print("test set의 전체 영화 수: ", test_df['movieId'].unique().size)

사용자:  0
영화:  786
test set의 전체 영화 수:  5171


In [None]:
# Movie ids that occur in the test split but never in the training split.
movies_not_included = list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))
print(sorted(movies_not_included)[:10])

# Test rows that refer to those unseen movies, ordered by movieId.
not_included_df =  test_df[test_df.movieId.isin(movies_not_included)].sort_values(by = 'movieId')
print(not_included_df.head(10))

# Prints the (rows, columns) shape of that subset, not just a row count.
print('train set에 없고, test set에만 있는 영화 데이터 수: ', not_included_df.shape)

[49, 117, 137, 178, 241, 320, 359, 478, 488, 495]
       userId  movieId  rating   timestamp
29386     202       49     3.0   974925453
97066     604      117     3.0   832080636
99501     609      137     3.0   847221054
27959     191      178     1.0   829760898
98493     607      241     4.0   964744490
96182     603      320     3.0   953925390
728         6      359     3.0   845556412
92825     599      478     2.5  1498515125
73214     474      488     3.0  1047569232
96218     603      495     5.0   953927108
train set에 없고, test set에만 있는 영화 데이터 수:  (852, 4)


## 간단한 추천 알고리즘 

**1. 랜덤으로 평점 예측하기**

**2. 영화 평균 평점기반 예측하기**

**3. 사용자 평균 평점기반 예측하기**

**4. Rule 기반 영화 평점 예측하기**

   - *test에 있고, train에 없는 경우*

### 랜덤으로 평점 예측하기

In [None]:
# Candidate rating values from 0.5 to 5.0 in half-point steps; the random
# baseline draws one of these for every rating it has to predict.

ratings_range = np.arange(0.5,5.5,step = 0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [None]:
import random

# Seed the generator so the random baseline (and the RMSE computed from it)
# is reproducible between runs, in line with the fixed random_state used for
# the train/test split.
random.seed(1234)

# One uniformly drawn rating value per test row.
pred_random = [random.choice(ratings_range) for _ in range(len(test_df))]
pred_random[:10]

[3.0, 1.0, 0.5, 0.5, 3.5, 3.5, 1.5, 1.5, 2.5, 5.0]

In [None]:
# Store the random predictions next to the true ratings (positional alignment).
test_df['pred_ratings_random'] = pred_random

In [None]:
# Score the random baseline: mean squared error against the true test
# ratings, and its square root (RMSE).
mse  = mean_squared_error(y_true = test_df['rating'].values, y_pred = test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

3.7130726894089645 1.9269334937690414


### 영화 평균 평점 기반 예측하기

**train set의 모든 영화에 대한 평균 평점 구하기**

**test set예측 시, train set의 영화 평균 평점 활용하기, 만약 없다면 random선택**

In [None]:


# Per-movie column means over the training rows. Only the 'rating' column is
# read by avg_rating_prediction; the averaged userId and timestamp columns
# carry no meaning.
train_movie_df = train_df.groupby('movieId').mean()

print(train_movie_df.shape)
print(train_movie_df.head())

(8938, 3)
             userId    rating     timestamp
movieId                                    
1        307.473373  3.893491  1.128439e+09
2        327.475610  3.396341  1.142893e+09
3        266.386364  3.454545  9.900434e+08
4        192.750000  2.250000  8.425133e+08
5        309.526316  3.039474  1.007415e+09


In [None]:
# Predict from a table of training averages; fall back to a random rating
# when the key was never seen during training.
def avg_rating_prediction(training_set, x):
    """Return the average rating stored for ``x``, or a random rating.

    Args:
        training_set: frame whose index holds the known keys and which has a
            'rating' column with their average rating.
        x: key to look up in ``training_set.index``.

    Returns:
        ``training_set.loc[x]['rating']`` when ``x`` is in the index,
        otherwise one value drawn with ``random.choice`` from the
        module-level ``ratings_range``.
    """
    if x not in training_set.index:
        return random.choice(ratings_range)
    return training_set.loc[x]['rating']

In [None]:
# Add the movie-average prediction to the test rows: each movieId is looked
# up in train_movie_df through avg_rating_prediction.

test_df['pred_rating_movies'] = test_df['movieId'].apply(lambda x: avg_rating_prediction(train_movie_df,x))
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movies
99731,610,3527,5.0,1479545223,3.0,3.604167
97583,606,1250,3.5,1171376891,1.0,4.180556
38197,262,213,5.0,840310907,0.5,3.75
11474,68,69406,3.0,1261622505,0.5,3.571429
34105,232,4728,3.0,1218166950,3.5,2.769231


In [None]:
# Score the movie-average baseline (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_movies'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.0569790927916989 1.028094885111145


### 사용자 평균 기반 예측하기

**train set의 모든 유저가 준 평균 평점**

**test set예측할 때, 유저가 train set에서 준 평균 평점을 활용**

**유저가 없을 경우 random 평점 적용**

In [None]:
# Per-user column means over the training rows. Only the 'rating' column is
# read by avg_rating_prediction; the averaged movieId and timestamp columns
# carry no meaning.
train_user_df = train_df.groupby('userId').mean()

print(train_user_df.shape)
print(train_user_df.head())

(610, 3)
             movieId    rating     timestamp
userId                                      
1        1891.168478  4.320652  9.649865e+08
2       70402.760000  3.940000  1.445715e+09
3        8394.733333  2.516667  1.306464e+09
4        1957.923077  3.631868  9.655941e+08
5         337.606061  3.636364  8.474351e+08


In [None]:
# Add the user-average prediction: each userId is looked up in train_user_df
# through avg_rating_prediction.
test_df['pred_rating_user'] = test_df['userId'].apply(lambda x: avg_rating_prediction(train_user_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movies,pred_rating_user
99731,610,3527,5.0,1479545223,3.0,3.604167,3.678709
97583,606,1250,3.5,1171376891,1.0,4.180556,3.649718
38197,262,213,5.0,840310907,0.5,3.75,2.925
11474,68,69406,3.0,1261622505,0.5,3.571429,3.229331
34105,232,4728,3.0,1218166950,3.5,2.769231,3.242268


In [None]:
# Score the user-average baseline (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_user'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

0.8905889036428333 0.9437101798978504


### Rule 기반 영화 평점 예측

**train set에 포함된 유저의 영화 평균 평점과 영화의 장르를 활용하여, 장르별 평균 평점 계산 -> test set의 영화 장르의 평균 평점으로 예측**

In [None]:
# Movie-by-user rating grid built from the training rows only
# (rows: movieId, columns: userId); cells without a rating become 0.
train_user_movie_matrix = train_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

In [None]:
# Display the training rating grid.
train_user_movie_matrix


In [None]:
# Movie metadata indexed by movieId, loaded under its own name so that the
# .loc lookup below selects by movie id rather than by row position. Later
# cells look movies up through `movies_df`.
movies_df = pd.read_csv('./movies.csv', index_col='movieId')

# One 0/1 indicator column per genre, split from the pipe-separated string.
genres_df = movies_df['genres'].str.get_dummies(sep='|')
print(genres_df.shape)
# Keep only the movies that occur in the training split.
genres_df = genres_df.loc[train_df.movieId.unique()]
print(genres_df.shape)
genres_df.head()

In [None]:
# Inspect the raw genre strings (pipe-separated, as split by get_dummies(sep='|')).
movies['genres']

In [None]:
# Average rating per movie over the training split.
# The pivot filled unrated cells with 0, so turn them back into NaN first;
# mean() skips NaN, which keeps unrated cells out of the average.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
# replace() already returns a new frame, so no explicit copy is needed.
train_movie_avg_ratings_df = train_user_movie_matrix.replace(0, np.nan).mean(axis=1)

train_movie_avg_ratings_df.head()

In [None]:
# Average rating per genre: for every genre, take the movies flagged with it
# in genres_df and average those movies' mean training ratings.
# A movie with several genres contributes to each of them.
genres_avg_ratings_df = pd.DataFrame(index=genres_df.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings_df.index:
    # Index labels of genres_df rows whose indicator for this genre is 1.
    genre_avg_rating = train_movie_avg_ratings_df.loc[genres_df[genres_df[genre].isin([1])].index].mean()
    # Write through one .loc[row, column] call on the frame created above;
    # a chained .loc[row][column] assignment may only update a temporary copy.
    genres_avg_ratings_df.loc[genre, 'avg_ratings'] = genre_avg_rating

genres_avg_ratings_df

In [None]:
def get_genre_avg_ratings(x):
    """Return the mean of the per-genre average ratings of movie ``x``.

    Reads the module-level ``movies_df`` (row ``x`` must have a
    pipe-separated 'genres' string) and ``genres_avg_ratings_df`` (indexed
    by genre name, with an 'avg_ratings' column).

    Args:
        x: label of the movie's row in ``movies_df``.

    Returns:
        Sum of the movie's genre averages divided by its number of genres.
    """
    genre_names = movies_df.loc[x]['genres'].split('|')
    total = sum(genres_avg_ratings_df.loc[name]['avg_ratings'] for name in genre_names)
    return total / len(genre_names)

In [None]:
# Register progress_apply on pandas objects so the call below shows a tqdm bar.
tqdm.pandas()
# Predict each test rating as the mean of its movie's genre averages.
# The helper is named get_genre_avg_ratings (plural); it is passed directly
# instead of being wrapped in a lambda.
test_df['pred_rating_genre'] = test_df['movieId'].progress_apply(get_genre_avg_ratings)

In [None]:
# Display the test rows together with the prediction columns added so far.
test_df

In [None]:
# Score the genre-average predictions (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_genre'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

## Rule 기반 영화 평점 예측하기

**user의 평균 영화 평점을 normalize해서 확인하기, 평점 측정 수 표준편차 활용 가능**

In [None]:
# Per-user rating statistics over the training split, computed in a single
# grouped aggregation: mean, standard deviation and number of ratings.
train_user_info_df = train_df.groupby('userId')['rating'].agg(
    avg_ratings='mean',
    std_ratings='std',
    count_ratings='count',
)

train_user_info_df

In [None]:
# Range and mean of the number of ratings per user.
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

# Mean-centred rating count divided by the count range, computed for the whole
# column at once.
train_user_info_df['weights'] = (train_user_info_df['count_ratings'] - avg_count) / (max_count - min_count)

In [None]:
# Display the per-user statistics, now including the 'weights' column.
train_user_info_df

In [None]:
from sklearn import preprocessing

# Min-max scale every column of train_user_info_df; the result is a plain
# ndarray without index or column labels.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)


In [None]:
# Re-attach the original column names and userId index to the scaled array.
df_normalized = pd.DataFrame(np_scaled, columns = train_user_info_df.columns, index = train_user_info_df.index)
df_normalized

In [None]:
# Stretch the min-max scaled user average by a factor of 5.
# NOTE(review): with the scaler's default (0, 1) range, the user with the
# lowest training average maps to 0, below the 0.5 minimum rating — confirm
# this is intended.
df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings'] * 5
df_normalized

In [None]:
# Predict each test rating with the user's rescaled training average.
# The .loc lookup raises KeyError for a userId that is absent from df_normalized.
test_df['pred_rating_normalized'] = test_df['userId'].apply(lambda x: df_normalized.loc[x]['normalized_avg_ratings'])
test_df

In [None]:
# Score the normalized user-average predictions (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_normalized'].values)
rmse = np.sqrt(mse)

print(mse, rmse)