## 정량적 평가 - RMSE
--------------------
* 평균 제곱근 편차
* 실제 값 vs 모델 예측값 차이로 나타냄
* Scale-dependent하다 - 예측 대상 값에 영향을 받는다
  - 스케일에 의존도가 있다

NDCG
----------
* 랭킹 추천에 많이 사용되는 평가 지표
* 기존 정보검색에서 많이 사용했었다
* top-N 랭킹 리스트 만들고, 더 관심있거나 관련성 높은 아이템 포함 여부 평가
* 순위에 가중치를 주고, 단순한 랭킹이 아닌 데이터의 성향을 반영하기 위한 평가 지표
* MAP(mean Average Precision), Top K Precision/Recall 등 평가 방법 보완
  - 추천 또는 정보 검색에서 특정 아이템에 biased 된 경우
  - 이미 유명하고 잘 알려진 인기있는 아이템 또는 한명의 사용자에 의해 만들어진 랭킹 등 문제

## OTHER
-------
1. Top-k
2. MAP (mean Average Precision)

In [3]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
%matplotlib inline


# Directory that holds ratings.csv, tags.csv and movies.csv.
path = 'data/ml-latest-small/'

# Directory and file name are passed to os.path.join as separate components so
# the join does not depend on `path` ending with a separator.
rating_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')
# movies.csv is indexed by movieId so a movie row can be looked up by its id.
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')

* 각 유저가 어떤 영화에 평점을 줬는지에 대한 sparse_matrix

In [4]:
# Movie x user rating table: one row per movieId, one column per userId,
# each cell holding the rating. Pairs without a rating come out of pivot()
# as NaN and are filled with 0.
user_movie_matrix = (
    rating_df
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

In [9]:
# Store the dense movie x user rating table as a SciPy CSR sparse matrix.
sparse_mat = csr_matrix(user_movie_matrix.to_numpy())

In [13]:
# Number of movies each user has rated.
# Unrated cells of user_movie_matrix were filled with 0, so the rated movies of
# a user are exactly the non-zero cells of that user's column. Counting them
# directly does not rely on 0 being the most frequent value in the column.
user_info_df = (user_movie_matrix != 0).sum(axis=0).to_frame(name='movies_rated')

In [19]:
# Hold out 20% of the rating rows as the test split; random_state pins the
# shuffle so the same split is produced on every run.
train_df , test_df = train_test_split(rating_df,test_size=0.2,random_state=1234)

In [20]:
# Number of distinct users present in the test split but absent from the training split.
len(set(test_df['userId']) - set(train_df['userId']))

0

# random으로 예측 해버리기

In [22]:
# The ten candidate rating values 0.5, 1.0, ..., 5.0 in half-point steps.
ratings_range = np.linspace(0.5, 5.0, num=10)

In [24]:
import random

# Random baseline: draw one value from ratings_range for every row of the test split.
pred_random = [random.choice(ratings_range) for _ in range(test_df.shape[0])]

In [27]:
# Attach the random baseline predictions as a new column; the list holds one
# value per test row and is assigned by position.
test_df['random_rating_pred'] = pred_random

In [28]:
# Mean squared error of the random baseline against the true test ratings.
mse = mean_squared_error(
    test_df['rating'].to_numpy(),
    test_df['random_rating_pred'].to_numpy(),
)

In [30]:
# RMSE is the square root of the MSE, which puts the error back on the rating scale.
rmse = np.sqrt(mse)

In [32]:
# An RMSE of about 1.9 means the random guesses miss the true rating by roughly
# 1.9 points in the root-mean-square sense. RMSE squares the errors before
# averaging, so it is not the plain mean absolute difference.
mse, rmse

(3.7081267354224514, 1.9256496917722215)

## 영화 평균 평점기반 예측
   - train 데이터에 무비가 없으면 랜덤으로 추천

In [34]:
# Per-movie mean of every column of the training split, indexed by movieId.
# Only the 'rating' column is read by avg_rating_prediction.
train_movie_df = train_df.groupby('movieId').mean()
train_movie_df.head(3)

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,307.473373,3.893491,1128439000.0
2,327.47561,3.396341,1142893000.0
3,266.386364,3.454545,990043400.0


In [35]:
def avg_rating_prediction(training_set, x):
    """Predict a rating from a table of per-key average ratings.

    Args:
        training_set: DataFrame indexed by the lookup key (for example
            movieId or userId) with a 'rating' column holding the value to
            return for that key.
        x: Key to look up in ``training_set.index``.

    Returns:
        The 'rating' value stored for ``x``. If ``x`` is not in the index, a
        value drawn at random from the module-level ``ratings_range``.
    """
    if x not in training_set.index:
        # Key never seen in the training data: fall back to a random rating.
        return random.choice(ratings_range)
    # A single (row, column) label lookup reads the cell directly instead of
    # materialising the whole row first.
    return training_set.loc[x, 'rating']

In [36]:
# Predict every test rating as the training-split mean rating of its movie.
# avg_rating_prediction falls back to a random rating for movies that are
# missing from train_movie_df.
test_df['pred_rating_movie'] = test_df['movieId'].map(
    lambda movie_id: avg_rating_prediction(train_movie_df, movie_id)
)

In [37]:
# MSE / RMSE of the movie-average predictions against the true test ratings.
avg_movie_mse = mean_squared_error(
    test_df['rating'].to_numpy(),
    test_df['pred_rating_movie'].to_numpy(),
)
avg_movie_rmse = np.sqrt(avg_movie_mse)

In [40]:
avg_movie_mse,avg_movie_rmse 
# People do not hand out ratings at random: there is a clear tendency,
# because the error is much lower than for the random-prediction baseline.

(1.0493928175041145, 1.0243987590309325)

In [41]:
# Per-user mean of every column of the training split, indexed by userId.
train_user_df = train_df.groupby('userId').mean()

train_user_df.head(3)

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1891.168478,4.320652,964986500.0
2,70402.76,3.94,1445715000.0
3,8394.733333,2.516667,1306464000.0


In [42]:
# Predict every test rating as the training-split mean rating of its user.
test_df['user_avg_pred'] = test_df['userId'].map(
    lambda user_id: avg_rating_prediction(train_user_df, user_id)
)

In [44]:
# MSE / RMSE of the user-average predictions against the true test ratings.
user_pred_mse = mean_squared_error(test_df['rating'], test_df['user_avg_pred'])
user_pred_rmse = np.sqrt(user_pred_mse)

In [48]:
user_pred_mse, user_pred_rmse
# The error dropped by roughly 0.1, i.e. about a 10% improvement (author's
# estimate; presumably relative to the movie-average baseline).
# NOTE(review): the original note "no information loss between train and test"
# presumably means every test user also appears in the training split - confirm.

(0.8905889036428333, 0.9437101798978504)

## Rule 기반 영화 평점 예측


In [49]:
# Movie x user rating table built from the training split only: rows are
# movieId, columns are userId, and missing ratings are filled with 0.
train_user_movie_matrix = (
    train_df
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

In [53]:
# One-hot encode the '|'-separated genre string of every movie, then keep only
# the movies that occur in the training split (in order of first appearance).
genres_df = (
    movies_df['genres']
    .str.get_dummies(sep='|')
    .loc[train_df['movieId'].unique()]
)
genres_df.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5943,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2571,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
8958,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2322,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2959,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0


In [89]:
# Average training rating per movie.
# The 0 placeholders (unrated cells) are turned back into NaN so that mean()
# skips them; replace() returns a new frame, so the source matrix is untouched.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
train_movie_avg_ratings_df = train_user_movie_matrix.replace(0, np.nan).mean(axis=1)
train_movie_avg_ratings_df.head()

movieId
1    3.893491
2    3.396341
3    3.454545
4    2.250000
5    3.039474
dtype: float64

In [60]:
# Empty result table: one row per genre column of genres_df and a single
# 'avg_ratings' column that starts out unfilled.
genres_avg_ratings_df = pd.DataFrame(
    index = genres_df.columns,columns = ['avg_ratings']
)

In [94]:
# For every genre, average the per-movie mean ratings of the movies that carry it.
for genre in genres_avg_ratings_df.index:
    genre_movie_ids = genres_df[genres_df[genre] == 1].index
    # Write through one .loc[row, column] call: chained `.loc[row][column] = ...`
    # may assign into a temporary copy and leave the table unchanged.
    genres_avg_ratings_df.loc[genre, 'avg_ratings'] = (
        train_movie_avg_ratings_df.loc[genre_movie_ids].mean()
    )

genres_avg_ratings_df.head()


Unnamed: 0,avg_ratings
(no genres listed),3.33642
Action,3.11085
Adventure,3.230721
Animation,3.492258
Children,3.101232


In [97]:
def get_genre_avg_ratings(x):
    """Predict a movie's rating as the mean of its genres' average ratings.

    Args:
        x: movieId, a label of ``movies_df.index``.

    Returns:
        The unweighted mean of ``genres_avg_ratings_df['avg_ratings']`` over
        the genres in the movie's '|'-separated 'genres' string.
    """
    genres_list = movies_df.loc[x, 'genres'].split('|')
    # Direct (row, column) lookups avoid building an intermediate row per genre.
    total = sum(
        genres_avg_ratings_df.loc[genre, 'avg_ratings'] for genre in genres_list
    )
    return total / len(genres_list)

In [98]:
# Register tqdm's progress_apply on pandas objects, then predict every test
# rating from the genre averages of its movie while showing a progress bar.
tqdm.pandas()
test_df['pred_rating_genre'] = test_df['movieId'].progress_apply(get_genre_avg_ratings)

100%|██████████| 20168/20168 [00:05<00:00, 3761.89it/s]


In [100]:
# MSE / RMSE of the genre-average predictions against the true test ratings.
genre_mse = mean_squared_error(test_df['rating'], test_df['pred_rating_genre'])
genre_rmse = np.sqrt(genre_mse)

In [102]:
genre_mse,genre_rmse
# Author's takeaway: including each user's individual characteristics matters.
# NOTE(review): the original comment compares this movie/genre-based prediction
# with the userId-based one but its wording is ambiguous - presumably it means
# the genre-average predictor scores worse than the per-user average; confirm.

(1.1251906030478547, 1.0607500191128232)

## Rule 기반 영화 평점 예측 2

In [105]:
# Per-user statistics of the training ratings: mean, standard deviation and
# number of ratings, computed in a single grouped aggregation.
train_user_info_df = train_df.groupby('userId')['rating'].agg(
    avg_rating='mean',
    std_rating='std',
    count_rating='count',
)
train_user_info_df.head()

Unnamed: 0_level_0,avg_rating,std_rating,count_rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.320652,0.8366,184
2,3.94,0.820569,25
3,2.516667,2.12734,30
4,3.631868,1.317823,182
5,3.636364,1.084498,33


In [106]:
# Smallest, largest and average number of training ratings per user.
min_count = train_user_info_df['count_rating'].min()
max_count = train_user_info_df['count_rating'].max()
avg_count = train_user_info_df['count_rating'].mean()

In [109]:
# Mean normalisation of each user's rating count: (count - mean) / (max - min).
# Users with more ratings than average get a positive weight, users with fewer
# get a negative one. The author intends this column as an indicator of how
# reliable a user's ratings are. Computed as one vectorised expression rather
# than a per-row apply.
train_user_info_df['weights'] = (
    (train_user_info_df['count_rating'] - avg_count) / (max_count - min_count)
)
train_user_info_df.head()

Unnamed: 0_level_0,avg_rating,std_rating,count_rating,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.320652,0.8366,184,0.023995
2,3.94,0.820569,25,-0.049718
3,2.516667,2.12734,30,-0.0474
4,3.631868,1.317823,182,0.023068
5,3.636364,1.084498,33,-0.04601


In [111]:
from sklearn import preprocessing

# Min-max scale every column of the per-user statistics table and wrap the
# resulting array back into a DataFrame with the same index and column labels.
# Author's note: the rating scale is already fixed, so this has little meaning.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(
    data=np_scaled,
    index=train_user_info_df.index,
    columns=train_user_info_df.columns,
)
df_normalized

Unnamed: 0_level_0,avg_rating,std_rating,count_rating,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.822227,0.393261,0.079740,0.079740
2,0.722617,0.385725,0.006027,0.006027
3,0.350156,1.000000,0.008345,0.008345
4,0.641984,0.619470,0.078813,0.078813
5,0.643161,0.509791,0.009736,0.009736
...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729
607,0.678762,0.449188,0.061660,0.061660
608,0.514806,0.503682,0.302735,0.302735
609,0.548824,0.213816,0.007881,0.007881


In [114]:
# Multiply the scaled per-user average by 5 (presumably to put it back on the
# 0-5 rating scale - confirm).
df_normalized['normalized_avg_ratings'] = df_normalized['avg_rating'] * 5

# Fetch the value for every test row's user with one label-based lookup instead
# of a per-row chained lookup. A userId missing from df_normalized still raises
# KeyError. to_numpy() drops the userId labels so the values are assigned to
# test_df by row position.
test_df['pred_rating_normalized'] = df_normalized.loc[
    test_df['userId'].to_numpy(), 'normalized_avg_ratings'
].to_numpy()
test_df

Unnamed: 0,userId,movieId,rating,timestamp,random_rating_pred,pred_rating_movie,user_avg_pred,pred_rating_genre,pred_rating_normalized
99731,610,3527,5.0,1479545223,2.5,3.604167,3.678709,3.138325,3.271208
97583,606,1250,3.5,1171376891,4.5,4.180556,3.649718,3.410377,3.233275
38197,262,213,5.0,840310907,3.0,3.750000,2.925000,3.429093,2.285047
11474,68,69406,3.0,1261622505,2.0,3.571429,3.229331,3.267870,2.683236
34105,232,4728,3.0,1218166950,1.5,2.769231,3.242268,3.181480,2.700164
...,...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,2.0,4.127907,3.666667,3.132440,3.255452
4897,31,780,4.0,850466616,0.5,3.470760,3.911765,3.161424,3.576141
8023,56,410,3.0,835799188,4.0,3.131148,3.837838,3.174323,3.479414
77467,483,2291,4.0,1415579167,3.5,3.734375,3.598940,3.341203,3.166837


In [115]:
# MSE / RMSE of the normalised user-average predictions against the true test ratings.
nomal_mse = mean_squared_error(test_df['rating'], test_df['pred_rating_normalized'])
nomal_rmse = np.sqrt(nomal_mse)

In [116]:
# Show the MSE and RMSE of the normalised user-average predictor.
nomal_mse,nomal_rmse

(1.120579096060227, 1.05857408624065)