In [21]:
# 드라이브에 접근할 수 있도록 아래 코드 입력
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# TF-IDF 실습

# Collaborative Filtering 실습

In [4]:
import os
import pandas as pd
import seaborn as sns
import scipy
import numpy as np
import random
from matplotlib import pyplot as plt
from datetime import datetime
from tqdm.auto import tqdm
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

## 유사도 계산 실습

- 사용하게될 `유사도`를 구현합니다.
- 모델을 구현할 때는 라이브러리를 사용하거나 모델 패키지 내부에 구현되어 있는 경우가 있지만, 개념을 잡기 위해 구현해봅니다.
- 항목
    1. Consine
    2. Jaccard
    3. Pearson correlation

### Consine

In [9]:
import math

def square_rooted(x):
    return math.sqrt(sum([a * a for a in x]))

def cosine(x, y):
    numerator = np.inner(x, y)
    denominator = square_rooted(x) *  square_rooted(y)
    return round(numerator / denominator, 3)    # 소수 4째자리 반올림


In [15]:
a = [3, 45, 7, 2]
b = [2, 54, 13, 15]
print('Consine 1 :',cosine( a, b))

a = [3, 45, 7, 2, 3]
b = [2, 54, 13, 15, 17]
print('Consine 2 :',cosine(a, b))

Consine 1 : 0.972
Consine 2 : 0.949


### Jaccard

In [13]:
def jaccard(x, y):
    intersection_cardinality = len(set(x).intersection(set(y)))
    union_cardinality = len(set(x).union(set(y)))
    return round(intersection_cardinality / union_cardinality, 3)  

In [16]:
a = [0,1,2,5,6]
b = [2,3,5,7,9]
print('Jaccard 1:', jaccard(a, b))

a = [0,1,2,5,6,10]
b = [2,3,5,7,9]
print('Jaccard 2:', jaccard(a, b))

Jaccard 1: 0.25
Jaccard 2: 0.222


### Pearson

In [17]:
def pearson_correlation(x, y):
    mean_x = sum(x) / len(x)
    mean_y = sum(y) / len(y)

    deviation_x = [i - mean_x for i in x]
    deviation_y = [i - mean_y for i in y]

    numerator = np.inner(deviation_x, deviation_y)
    denominator = square_rooted(deviation_x) * square_rooted(deviation_y)

    return round(numerator / denominator, 3)

In [20]:
a = [3, 45, 7, 2]
b = [2, 54, 13, 15]
print('Pearson 1 :', pearson_correlation(np.array(a), np.array(b)))

a = [3, 45, 7, 2, 3]
b = [2, 54, 13, 15, 10]
print('Pearson 2 :', pearson_correlation(a, b))

Pearson 1 : 0.968
Pearson 2 : 0.969


# Memory-based CF 구현

## Sparse Matrix 만들기

In [24]:
path = '/content/drive/MyDrive/Colab_Notebook/learning_spoons_recommendation_system/data/ml-latest-small/'

In [25]:
ratings_df = pd.read_csv(path + 'ratings.csv', encoding='utf-8')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


 - 평점`rating`을 value로 두고, 영화`movieId`를 행, 유저`userId`를 열로 하여 `Sparse Matrix`를 만들 수 있습니다.

In [28]:
user_id_list = sorted(list(set(ratings_df['userId'].values)))
movie_id_list = sorted(list(set(ratings_df['movieId'].values)))

print('유저 수 :', len(user_id_list), '영화 수 :', len(movie_id_list))

유저 수 : 610 영화 수 : 9724


In [29]:
user_item_matrix = ratings_df.pivot_table('rating', 'userId', 'movieId')
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,,4.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,4.0,4.0,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,2.5,,,,3.5,,4.0,4.0,2.0,,,,,,,,,3.5,4.5,,,4.0,,3.5,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
607,4.0,,,,,,,,,,3.0,,,,,,,,,,,,,,3.0,,,,,,,,3.0,4.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,,,,,,4.5,,,2.0,,3.5,,,2.0,,,,,,,3.0,3.5,3.5,,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
609,3.0,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


- 모르는 데이터에 대해선 `0`으로 채워줍니다.

In [38]:
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,2.5,0.0,0.0,0.0,3.5,0.0,4.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,4.5,0.0,0.0,4.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,2.0,0.0,3.5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.5,3.5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- `train` 데이터와 `test` 데이터를 나눠줍니다

In [35]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=10)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


## User-based CF

### Similarity Matrix

In [60]:
from sklearn.metrics.pairwise import cosine_similarity

# 유저끼리의 유사도를 구해줍니다. -> 행렬의 차원은 유저 수 x 유저 수

user_similarity = cosine_similarity(user_item_matrix, user_item_matrix)
user_similarity_df = pd.DataFrame(data=user_similarity, columns=user_item_matrix.index, index=user_item_matrix.index)
user_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,0.132499,0.016458,0.092971,0.113238,0.160689,0.169858,0.264358,0.214868,0.325376,0.160969,0.153162,0.050691,0.106669,0.155193,0.099261,0.105791,0.238933,0.202866,0.145251,0.099781,0.164454,0.146211,0.151945,0.083544,0.074508,0.071673,0.144736,0.138588,0.329782,0.094755,...,0.090224,0.131072,0.253480,0.093450,0.062668,0.051803,0.317319,0.000000,0.141129,0.249326,0.057399,0.048914,0.052175,0.100406,0.114076,0.123280,0.122782,0.183922,0.118112,0.324766,0.136809,0.143934,0.174413,0.141960,0.110558,0.123713,0.312843,0.011280,0.282412,0.291272,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,0.044419,0.000000,0.043918,0.016901,0.119778,0.093728,0.103755,0.166253,0.012571,0.014137,0.090880,0.144635,0.013597,0.129902,0.226008,0.000000,0.000000,0.058145,0.104059,0.149324,0.000000,0.017807,0.032428,0.043299,0.000000,0.054012,0.028500,0.023147,0.000000,0.017446,...,0.000000,0.032675,0.167959,0.035531,0.000000,0.000000,0.014870,0.000000,0.000000,0.061953,0.136703,0.219315,0.043620,0.000000,0.127551,0.089562,0.000000,0.055900,0.026429,0.056348,0.036147,0.030684,0.062033,0.013125,0.000000,0.104568,0.011986,0.048508,0.098000,0.023248,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,0.000000,0.000000,0.000000,0.003064,0.017251,0.032299,0.009813,0.028241,0.019142,0.008756,0.004017,0.003070,0.003081,0.002592,0.005100,0.000000,0.002332,0.029771,0.005974,0.000000,0.003507,0.015494,0.007156,0.000736,0.000000,0.000000,0.000000,0.004196,0.003639,0.002530,...,0.020899,0.008056,0.000000,0.000000,0.000000,0.029830,0.039894,0.000000,0.005749,0.033121,0.008561,0.000000,0.000000,0.000000,0.000000,0.000986,0.015793,0.003923,0.004791,0.027418,0.000000,0.000000,0.006460,0.001983,0.000000,0.025873,0.028970,0.000000,0.039539,0.013143,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,0.054767,0.049945,0.076949,0.048989,0.071551,0.164761,0.145058,0.123217,0.206053,0.113755,0.053014,0.043403,0.093971,0.071690,0.032862,0.061832,0.128954,0.135593,0.061308,0.058140,0.126182,0.200597,0.167247,0.045571,0.050148,0.113807,0.061031,0.090125,0.240976,0.053436,...,0.018553,0.182857,0.125627,0.064559,0.061035,0.005932,0.169440,0.000000,0.098777,0.148584,0.043871,0.047619,0.048474,0.041246,0.088289,0.064095,0.138766,0.049146,0.064485,0.219298,0.116111,0.068196,0.217116,0.082614,0.091974,0.107908,0.275436,0.016054,0.201610,0.211921,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.129080,0.016614,0.005020,0.128659,1.000000,0.300349,0.108342,0.429075,0.000000,0.030611,0.183805,0.058860,0.017157,0.221711,0.110152,0.082171,0.162633,0.121313,0.098758,0.096474,0.058264,0.033074,0.066889,0.096215,0.040705,0.294282,0.100491,0.121894,0.068876,0.065534,0.068585,0.233932,0.282323,0.029953,0.311472,0.022065,0.303078,0.377773,0.152956,0.321077,...,0.020016,0.124806,0.129338,0.341347,0.049306,0.000000,0.122199,0.000000,0.182382,0.117199,0.057538,0.000000,0.124645,0.348215,0.029293,0.072713,0.154315,0.377256,0.230961,0.152971,0.000000,0.359595,0.171864,0.137990,0.073238,0.096181,0.116071,0.000000,0.098599,0.137053,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,0.072988,0.075012,0.078030,0.100258,0.176102,0.187991,0.228150,0.244718,0.197557,0.133759,0.154664,0.146306,0.180398,0.120349,0.073508,0.057984,0.116688,0.239141,0.109923,0.083593,0.076391,0.113624,0.183288,0.126329,0.050278,0.094634,0.057503,0.110744,0.150151,0.096332,...,0.066803,0.201933,0.172313,0.050543,0.056985,0.024948,0.161706,0.029230,0.113237,0.303959,0.152900,0.076863,0.056115,0.073394,0.102095,0.120445,0.181949,0.100667,0.107110,0.313587,0.104047,0.076050,0.201965,0.141106,0.089641,0.186620,0.247790,0.060730,0.307964,0.310161,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
607,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,0.269857,0.032357,0.041894,0.163128,0.172278,0.108235,0.232015,0.191474,0.267631,0.076034,0.126051,0.032362,0.060483,0.133275,0.068520,0.172314,0.145752,0.177644,0.104298,0.128971,0.146604,0.152270,0.178559,0.119460,0.054149,0.083398,0.175458,0.163489,0.250775,0.126054,...,0.157236,0.172598,0.230264,0.174995,0.063632,0.005566,0.277245,0.000000,0.142133,0.222980,0.089457,0.042290,0.018075,0.172968,0.037530,0.137029,0.156100,0.195432,0.172363,0.303766,0.096840,0.179560,0.159920,0.230269,0.039137,0.141012,0.261206,0.002461,0.229975,0.219444,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
608,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,0.165329,0.045540,0.150508,0.139580,0.239751,0.182810,0.226019,0.328848,0.357684,0.255559,0.241128,0.194950,0.157319,0.170871,0.077472,0.115403,0.143279,0.341066,0.108109,0.097244,0.124598,0.157475,0.182282,0.245362,0.079864,0.088450,0.137988,0.159527,0.249292,0.138185,...,0.090180,0.162648,0.366207,0.133860,0.084302,0.059132,0.205068,0.000000,0.158486,0.463412,0.178818,0.057913,0.099307,0.164377,0.182266,0.193141,0.135310,0.160833,0.113565,0.397702,0.180687,0.221534,0.256632,0.210040,0.048688,0.273697,0.247656,0.048990,0.427623,0.373028,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
609,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,0.243111,0.000000,0.000000,0.290085,0.092756,0.056137,0.178866,0.107490,0.101163,0.000000,0.066256,0.085359,0.026316,0.102989,0.024313,0.392579,0.065996,0.090873,0.081750,0.081547,0.074893,0.108999,0.186822,0.093544,0.213665,0.036608,0.369436,0.320054,0.072277,0.224892,...,0.033209,0.128006,0.133284,0.381254,0.049083,0.000000,0.126357,0.020983,0.105622,0.125743,0.068017,0.000000,0.062826,0.390869,0.029161,0.090871,0.078900,0.418791,0.255039,0.110791,0.031238,0.379788,0.110493,0.114996,0.000000,0.054495,0.092068,0.000000,0.102966,0.104708,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


### Average Rating

- `user_id` : 15에 대해서 test 데이터의 평점을 예측해봅니다.

In [42]:
user_id = 15
user_test_df = test_df[test_df.userId == user_id]
user_test_df

Unnamed: 0,userId,movieId,rating,timestamp
1557,15,122904,2.0,1510571949
1484,15,3535,3.5,1510572486
1561,15,134853,4.5,1510572481
1500,15,5445,4.0,1510571793
1553,15,115713,2.0,1510572009
1497,15,4886,3.5,1510577956
1547,15,109487,4.0,1510571878
1504,15,5989,5.0,1510571938
1467,15,2011,5.0,1510572060
1558,15,122922,2.0,1510572670


In [52]:
result = []

for _, row in user_test_df.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if movie_id in user_item_matrix.columns:
        # 영화에 평점을 매긴 다른 유저들을 가져옵니다.
        user_movie_matrix = user_item_matrix[user_item_matrix[movie_id] > 0][[movie_id]] # [[]] : n * 1 행렬 -> 1 * n 행렬
        
        # 다른 유저들의 해당 영화의 평점 평균을 구합니다.
        numerator = user_movie_matrix[movie_id].sum()
        
        # 유저들의 숫자를 구합니다.
        denominator = len(user_movie_matrix)

        predicted_rating = numerator / denominator

        result.append([int(user_id), int(movie_id), rating, predicted_rating])

result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
result_df

Unnamed: 0,userId,movieId,rating,predictedRating
0,15,122904,2.0,3.833333
1,15,3535,3.5,3.788136
2,15,134853,4.5,3.813953
3,15,5445,4.0,3.6375
4,15,115713,2.0,3.910714
5,15,4886,3.5,3.871212
6,15,109487,4.0,3.993151
7,15,5989,5.0,3.921739
8,15,2011,5.0,3.505747
9,15,122922,2.0,3.704545


In [53]:
# train data의 전체 평균 평점
global_average = train_df['rating'].mean()

In [54]:
# 전체 test 데이터에서 대해서 average rating

result = []

for _, row in tqdm(test_df.iterrows()):
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
        user_movie_matrix = user_item_matrix[user_item_matrix[movie_id] > 0][[movie_id]]
        numerator = user_movie_matrix[movie_id].sum()
        denominator = len(user_movie_matrix)
        predicted_rating = numerator / denominator

        result.append([user_id, movie_id, rating, predicted_rating])

    else:
        result.append([user_id, movie_id, rating, global_average])

result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
result_df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.894495
1,384.0,2763.0,3.0,3.645833
2,52.0,58559.0,5.0,4.238255
3,600.0,719.0,2.5,2.842857
4,318.0,117364.0,4.0,4.000000
...,...,...,...,...
20163,20.0,5015.0,4.0,3.305556
20164,177.0,6787.0,3.0,4.018519
20165,103.0,969.0,4.0,3.970588
20166,42.0,2875.0,3.0,2.950000


- 성능을 구해봅니다.

In [56]:
# rmse
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae
mae = mean_absolute_error(y_true=result_df['rating'].values, y_pred=result_df['predictedRating'].values)

print('rmse :', rmse, 'mae :', mae)

rmse : 0.8804104492124903 mae : 0.669246345562672


### Weighted Average Rating

In [58]:
# 전체 test 데이터에 대해서 Similarity 활용한 Weighted Average Rating

result = []

for _, row in tqdm(test_df.iterrows()):
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
        # 영화에 평점을 매긴 다른 유저들을 가져옵니다
        movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
        user_ids = movie_ratings.index

        # 예측하려는 유저와 해당 영화 평점을 매긴 다른 유저들간의 유사도를 구합니다
        similarities = user_similarity_df[user_ids].loc[user_id]

        # 위에서 구한 유사도를 가중치로 사용하여 다른 유저들의 평점의 평균을 구합니다
        numerator = np.inner(movie_ratings.values, similarities)
        denominator = similarities.sum()

        if denominator == 0:
            continue
        
        predicted_rating = numerator / denominator

        result.append([user_id, movie_id, rating, predicted_rating])
    else:
        result.append([user_id, movie_id, rating, global_average])

weighted_average_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
weighted_average_result_df
    


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.861305
1,384.0,2763.0,3.0,3.579130
2,52.0,58559.0,5.0,4.284072
3,600.0,719.0,2.5,2.653606
4,318.0,117364.0,4.0,4.000000
...,...,...,...,...
20163,20.0,5015.0,4.0,3.447736
20164,177.0,6787.0,3.0,3.871911
20165,103.0,969.0,4.0,3.871269
20166,42.0,2875.0,3.0,2.930864


In [59]:
# 성능 구하기

# rmse 구하기
mse = mean_squared_error(y_true = weighted_average_result_df['rating'].values,
                         y_pred = weighted_average_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = weighted_average_result_df['rating'].values,
                          y_pred = weighted_average_result_df['predictedRating'].values)

print("rmse :", rmse, "mae :", mae)

rmse : 0.7606241166199128 mae : 0.5683615432279351


### k-Nearest Neighborhood CF(user-based)

In [62]:
# 전체 test 데이터에서 대해서 유사도가 높은 유저 k명에 대해서만 weighted average rating

k = 20

result = []

for _, row in tqdm(test_df.iterrows()):
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if movie_id in user_item_matrix.columns and user_id in user_item_matrix.index:
        # 영화에 평점을 매긴 다른 유저들을 가져옵니다.
        movie_ratings = user_item_matrix[user_item_matrix[movie_id] > 0][movie_id]
        user_ids = movie_ratings.index

        # 예측하려는 유저와 해당 영화에 평점을 매긴 다른 유저들 가운데 유사도가 높은 k명의 유저만을 사용합니다.
        candidate_similarities = user_similarity_df[user_ids].loc[user_id].sort_values(ascending=False)[:k]
        candidate_movie_ratings = movie_ratings[candidate_similarities.index]

        # 유사도가 높은 k명의 유저의 유사도와 평점을 사용하여 예측 유저의 평점을 예측합니다.
        numerator = np.inner(candidate_movie_ratings.values, candidate_similarities)
        denominator = candidate_similarities.sum()

        if denominator == 0:
            result.append([user_id, movie_id, rating, global_average])
            continue

        predicted_rating = numerator / denominator
        result.append([user_id, movie_id, rating, predicted_rating])
    else:
        result.append([user_id, movie_id, rating, global_average])

k_weighted_average_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
k_weighted_average_result_df



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,3.339909
1,384.0,2763.0,3.0,3.566854
2,52.0,58559.0,5.0,4.552965
3,600.0,719.0,2.5,2.488838
4,318.0,117364.0,4.0,4.000000
...,...,...,...,...
20163,20.0,5015.0,4.0,3.447736
20164,177.0,6787.0,3.0,3.805651
20165,103.0,969.0,4.0,3.799549
20166,42.0,2875.0,3.0,2.930864


In [63]:
# 성능 구하기

# rmse 구하기
mse = mean_squared_error(y_true = k_weighted_average_result_df['rating'].values,
                         y_pred = k_weighted_average_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = k_weighted_average_result_df['rating'].values,
                          y_pred = k_weighted_average_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.7086499608895889
mae 0.5319914777470475


## Item-based CF

- 아이템끼리 `유사도`를 사용하여 평점을 예측합니다.
- `TF-IDF`와 아이템 벡터 생성 방법은 다르지만 결과적으로 같은 아이템을 벡터로 표현한다는 점에서 유사합니다.

In [75]:
# train 데이터를 가지고 아이템-유저 매트릭스를 생성합니다
item_user_matrix = train_df.pivot_table('rating', 'movieId', 'userId').fillna(0)
item_user_matrix

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,4.5,3.5,4.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0,0.0,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,2.5,4.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
from sklearn.metrics.pairwise import cosine_similarity
# 아이템끼리의 유사도를 구합니다.

item_similarity = cosine_similarity(item_user_matrix, item_user_matrix)
item_similarity_df = pd.DataFrame(item_similarity, columns=item_user_matrix.index, index=item_user_matrix.index)
item_similarity_df

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185029,185031,185033,185135,185435,185473,185585,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1.000000,0.338970,0.315604,0.030125,0.261765,0.295128,0.235605,0.093550,0.170233,0.280456,0.211667,0.180384,0.175283,0.168406,0.113473,0.231483,0.320978,0.159384,0.328983,0.133588,0.261711,0.225783,0.128264,0.122684,0.213169,0.144268,0.088120,0.083628,0.122811,0.05456,0.158106,0.405606,0.391379,0.255490,0.115899,0.300504,0.0,0.124922,0.057084,0.101177,...,0.055246,0.000000,0.0,0.045164,0.077159,0.0,0.000000,0.0,0.053576,0.095953,0.089876,0.028935,0.028935,0.108452,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.028935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.338970,1.000000,0.249558,0.094187,0.238732,0.208502,0.224759,0.120127,0.010906,0.274381,0.264332,0.149847,0.165212,0.025273,0.237822,0.215922,0.143026,0.143215,0.458861,0.107509,0.229252,0.170068,0.082083,0.175883,0.106965,0.127935,0.149667,0.101958,0.132202,0.00000,0.270751,0.312757,0.361233,0.154186,0.114429,0.327841,0.0,0.118605,0.070865,0.144261,...,0.086363,0.120618,0.0,0.150172,0.120618,0.0,0.067566,0.0,0.000000,0.194998,0.192153,0.000000,0.000000,0.094187,0.105541,0.0,0.0,0.0,0.0,0.000000,0.105541,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.315604,0.249558,1.000000,0.000000,0.341006,0.240170,0.307270,0.269336,0.273639,0.194403,0.107019,0.227969,0.101901,0.122183,0.075891,0.233875,0.160637,0.196988,0.246462,0.111602,0.270801,0.127762,0.128627,0.265934,0.164826,0.190308,0.162824,0.058620,0.134153,0.00000,0.201098,0.281323,0.190108,0.248642,0.127663,0.216146,0.0,0.120628,0.144456,0.108590,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.086058,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.030125,0.094187,0.000000,1.000000,0.181493,0.051524,0.251309,0.162301,0.000000,0.104297,0.111642,0.000000,0.000000,0.130892,0.195122,0.128758,0.178754,0.000000,0.017846,0.000000,0.085256,0.053113,0.000000,0.119792,0.136365,0.000000,0.138832,0.064792,0.000000,0.00000,0.294802,0.037946,0.127786,0.085711,0.000000,0.000000,0.0,0.137138,0.000000,0.216041,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.261765,0.238732,0.341006,0.181493,1.000000,0.250941,0.473463,0.117417,0.303564,0.180550,0.113795,0.188123,0.055000,0.243498,0.120995,0.110893,0.218332,0.226823,0.208416,0.044712,0.163268,0.163306,0.094580,0.203505,0.166037,0.169880,0.272617,0.000000,0.055952,0.00000,0.259778,0.261037,0.227506,0.235631,0.160779,0.232939,0.0,0.163878,0.033717,0.291658,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.046449,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.752577,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.752577,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.752577,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.752577,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Weighted Average Rating

In [72]:
# 유저가 본 영화들을 활용해 예측합니다.

result = []

for _, row in tqdm(test_df.iterrows()):
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:
        # 해당 유저가 봤던 다른 아이템 리스트를 가져옵니다.
        item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
        movie_ids = item_ratings.index

        # 예측 아이템과 유저가 봤던 다른 아이템 간의 유사도를 구합니다.
        item_similarities = item_similarity_df[movie_ids].loc[movie_id]

        # 다른 아이템들과 예측 아이템과의 유사도를 가중치로 사용하여 평점을 예측합니다.
        numerator = np.inner(item_ratings, item_similarities)
        denominator = item_similarities.sum()

        if denominator == 0:
            result.append([user_id, movie_id, rating, global_average])
            continue
        
        predicted_rating = numerator / denominator

        result.append([user_id, movie_id, rating, predicted_rating])
    else:
        result.append([user_id, movie_id, rating, global_average])
item_based_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
item_based_result_df




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,4.079030
1,384.0,2763.0,3.0,2.963573
2,52.0,58559.0,5.0,4.571206
3,600.0,719.0,2.5,2.959931
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,3.563257
20164,177.0,6787.0,3.0,3.388352
20165,103.0,969.0,4.0,4.082881
20166,42.0,2875.0,3.0,3.678894


In [73]:
# 성능 구하기

# rmse 구하기
mse = mean_squared_error(y_true = item_based_result_df['rating'].values,
                         y_pred = item_based_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = item_based_result_df['rating'].values,
                          y_pred = item_based_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.9255275896882756
mae 0.7132160614874838


### k-Nearest Neighborhood CF

In [78]:
k = 20

result = []

for _, row in tqdm(test_df.iterrows()):
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    if user_id in item_user_matrix.columns and movie_id in item_user_matrix.index:
        # 해당 유저가 봤던 다른 아이템 리스트를 가져옵니다.
        item_ratings = item_user_matrix[item_user_matrix[user_id] > 0][user_id]
        movie_ids = item_ratings.index

        # 예측 아이템과 유저가 봤던 다른 아이템 간의 유사도를 구하고, 유사도가 가장 높은 아이템 k개를 선택합니다.
        candidate_similarities = item_similarity_df[movie_ids].loc[movie_id].sort_values(ascending=False)[:k]
        candidate_item_ratings = item_ratings[candidate_similarities.index]

        # 선택된 k개의 아이템의 평점과 예측 아이템과의 유사도를 가중치로 사용하여 평점을 예측합니다.
        numerator = np.inner(candidate_item_ratings.values, candidate_similarities)
        denominator = candidate_item_ratings.sum()

        if denominator == 0:
            result.append([user_id, movie_id, rating, global_average])
            continue
        
        predicted_ratings = numerator / denominator

        result.append([user_id, movie_id, rating, predicted_ratings])
    else:
        result.append([user_id, movie_id, rating, global_average])

knn_item_based_result_df = pd.DataFrame(result, columns=['userId', 'movieId', 'rating', 'predictedRating'])
knn_item_based_result_df

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,userId,movieId,rating,predictedRating
0,239.0,924.0,4.0,0.425763
1,384.0,2763.0,3.0,0.243161
2,52.0,58559.0,5.0,0.453530
3,600.0,719.0,2.5,0.261348
4,318.0,117364.0,4.0,3.503954
...,...,...,...,...
20163,20.0,5015.0,4.0,0.316005
20164,177.0,6787.0,3.0,0.265946
20165,103.0,969.0,4.0,0.273742
20166,42.0,2875.0,3.0,0.296416


In [79]:
# 성능 구하기

# rmse 구하기
mse = mean_squared_error(y_true = item_based_result_df['rating'].values,
                         y_pred = item_based_result_df['predictedRating'].values)
rmse = np.sqrt(mse)

# mae 구하기
mae = mean_absolute_error(y_true = item_based_result_df['rating'].values,
                          y_pred = item_based_result_df['predictedRating'].values)

print("rmse", rmse)
print("mae", mae)

rmse 0.9255275896882756
mae 0.7132160614874838
