# Collaborative Filtering

Memory-Based Algorithm
- Item based (더 많이 사용됨) <-- dot product 없이 유사도를 기반으로 주로 활용됨
- User based
  - dot product 버전 구현


Model-Based Algorithm
- Latent Factor 협업 필터링 방법 (Matrix Factorization)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 구글 드라이브 연결

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/lecture/"
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['movies.csv',
 'ratings.csv',
 'tags.csv',
 'links.csv',
 'movies_refined.csv',
 'ratings_refined.csv',
 '무비렌즈 데이터 전처리.ipynb',
 '협업 필터링 - User-based.ipynb',
 'movie_user.csv',
 'movie_user.gsheet',
 '제목 없는 문서.gdoc',
 '협업 필터링 - Item-based.ipynb',
 '협업 필터링 - User-based with dot production.ipynb']

In [None]:
import pandas as pd
import numpy as np

# 데이터 로드

In [None]:
ratings = pd.read_csv(path + "ratings_refined.csv", usecols=['userId', 'movieId', 'rating'])
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100784,610,166534,4.0
100785,610,168248,5.0
100786,610,168250,5.0
100787,610,168252,5.0


In [None]:
movies = pd.read_csv(path + "movies_refined.csv", usecols=['movieId', 'title'])
movies

Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II
...,...,...
9698,193581,Black Butler: Book of the Atlantic
9699,193583,No Game No Life: Zero
9700,193585,Flint
9701,193587,Bungo Stray Dogs: Dead Apple


In [None]:
df = pd.merge(ratings, movies, on='movieId', how='left')
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
100784,610,166534,4.0,Split
100785,610,168248,5.0,John Wick: Chapter Two
100786,610,168250,5.0,Get Out
100787,610,168252,5.0,Logan


null 값 체크

In [None]:
df.columns[df.isna().any()].tolist()

['title']

영화명 결측치 체크

In [None]:
df[df['title'].isnull()]

Unnamed: 0,userId,movieId,rating,title
80567,509,26958,3.5,


# User-based CF + Dot Product

## 유저 유사도 행렬 준비

In [None]:
user_movie = df.pivot_table(values='rating', index='userId', columns='title')
user_movie

title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Expected shape: 610 users x 9413 movie titles.
# Each user is treated as a 9413-dimensional rating vector.
user_movie.shape

(610, 9413)

## 결측치 처리

null값이 있으면 cosine similarity 함수가 안돌아감

하지만, null값을 0으로 치환하고 계산할경우 결과가 달라짐

(마치 해당 영화를 보고 0점을 준것으로 계산)

In [None]:
user_movie_tmp = user_movie.copy().fillna(0)

## 유사도 행렬 계산

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity_matrix = cosine_similarity(user_movie_tmp)
user_similarity_matrix.shape

(610, 610)

## 데이터 프레임화

In [None]:
# User-user similarity computed from each user's movie ratings.
# Label both axes with the userIds from the pivot table's index.
user_ids = user_movie.index

user_similarity = pd.DataFrame(user_similarity_matrix,
                                index=user_ids, columns=user_ids)
print(user_similarity.shape)
user_similarity.head()

(610, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027433,0.060048,0.195461,0.129788,0.128855,0.156882,0.13772,0.064616,0.016968,...,0.081929,0.165357,0.222414,0.071056,0.155099,0.167947,0.270867,0.294153,0.094085,0.150053
2,0.027433,1.0,0.0,0.003726,0.016614,0.025333,0.027713,0.027257,0.0,0.067445,...,0.205005,0.016866,0.01202,0.0,0.0,0.028564,0.012948,0.046926,0.027565,0.102624
3,0.060048,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005106,0.004892,0.025041,0.0,0.010738,0.013055,0.019247,0.021234,0.0,0.032181
4,0.195461,0.003726,0.002251,1.0,0.128659,0.092718,0.115652,0.062969,0.011361,0.031163,...,0.086928,0.134839,0.308497,0.052985,0.08493,0.203157,0.131746,0.150605,0.032198,0.107891
5,0.129788,0.016614,0.00502,0.128659,1.0,0.300349,0.108842,0.429075,0.0,0.030611,...,0.068831,0.418747,0.110364,0.258773,0.149366,0.107667,0.152866,0.136211,0.261232,0.060909


In [None]:
# Higher values mean more similar users.
# The top entry is expected to be the user themself, so position 0 is skipped;
# the [1:10] slice keeps the next 9 users (not 10).
sample_user = 1
user_similarity[sample_user].sort_values(ascending=False)[1:10]

userId
266    0.359369
313    0.353714
57     0.346927
368    0.346557
91     0.337481
288    0.333936
469    0.332700
39     0.331592
452    0.331223
Name: 1, dtype: float64

## Dot Product 미니예제

임의의 사용자 x  
x가 아직 안본 임의의 영화 m에 대해


- m을 본 사용자 집합 U (3명이 봤다면 u1, u2, u3)
- U의 m에 대한 평점집합 R (u1이 2점, u2 4점, u3이 5점 줬다 가정)


  


유저 유사도 활용

- m을 본 평점의 평균은 (2+4+5) / 3 이지만,


x와 유사한 정도로 평점 가중치 처리!
  - x에 대해 u1의 유사도0.5
  - x에 대해 u2의 유사도0.7
  - x에 대해 u3의 유사도0.8
- (2x0.5 + 4x0.7 + 5x0.8) / (0.5+0.7+0.8)


모든 사용자 집합 X에 대해  
위 연산 처리하면  

모든 사용자 별 아직 안본 영화에 대한 예측 평점 모두 계산됨




최종 추천 영화

아직 안본 영화의 예측 평점이 높은 순으로 추천해주는 방식

In [None]:
# Arbitrary movie used to walk through the example
title = 'Black Robe'

In [None]:
# Users who rated this movie
idx = df[df['title'] == title].index

watched_user = df.loc[idx, 'userId'].tolist()
watched_user

[28, 140, 414]

In [None]:
# Similarity subset:
# rows = users who rated the movie, columns = all users
sub_sim_mat = user_similarity.loc[watched_user]
sub_sim_mat

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28,0.207437,0.058558,0.029983,0.136556,0.12276,0.103157,0.242525,0.107797,0.072655,0.101741,...,0.18414,0.168319,0.190332,0.073656,0.10315,0.243591,0.178906,0.342838,0.091519,0.307594
140,0.199282,0.041029,0.007271,0.147038,0.1452,0.198456,0.254412,0.20431,0.092567,0.100951,...,0.151127,0.225118,0.222802,0.131818,0.133473,0.246238,0.251251,0.322686,0.118942,0.181847
414,0.285745,0.089679,0.029848,0.219965,0.120821,0.195184,0.212785,0.130303,0.096816,0.144694,...,0.215508,0.195503,0.326366,0.125768,0.184004,0.348081,0.231278,0.463101,0.086262,0.394322


In [None]:
# Transpose to (all 610 users) x (the 3 users who rated this movie)
# and convert to a plain NumPy array.
sub_sim_mat = sub_sim_mat.T.to_numpy()
print(sub_sim_mat.shape)
sub_sim_mat

(610, 3)


array([[0.2074366 , 0.19928216, 0.28574522],
       [0.05855824, 0.04102873, 0.08967948],
       [0.0299826 , 0.00727126, 0.02984835],
       ...,
       [0.3428378 , 0.32268559, 0.46310068],
       [0.09151894, 0.1189419 , 0.08626191],
       [0.30759397, 0.18184665, 0.39432192]])

전체 유저에 대한 영화본3명의 유사도 총합 (정규화용)

In [None]:
# Per-user (row-wise) sum of similarities to the users who rated the movie, plus 1.
# NOTE(review): the weighted-average formula in the mini example has no "+ 1";
# presumably it guards against division by zero, but it also shrinks every
# prediction — confirm this is intended.
sim_N = np.sum(sub_sim_mat, axis=1) + 1
print(sim_N.shape)

(610,)


이 영화 본 3명이 작성한 평점

In [None]:
df.loc[idx]

Unnamed: 0,userId,movieId,rating,title
4453,28,4688,2.5,Black Robe
21460,140,4688,3.5,Black Robe
63735,414,4688,4.0,Black Robe


In [None]:
# Ratings given to this movie by the users who rated it
watched_user_y = df.loc[idx, 'rating']

# Reshape into a column vector so it can be used in the matrix product
watched_user_y = np.array(watched_user_y.tolist()).reshape(-1, 1)
print(watched_user_y.shape)
watched_user_y[:10]

(3, 1)


array([[2.5],
       [3.5],
       [4. ]])

모든 유저에 대한 평점 예측

A = 유사도 행렬 (모든 유저 x 영화 본 유저)

B = 평점 행렬 (영화본 유저들의 이 영화 평점 x 1)

N = A의 행(유저)별 합 + 1 (모든 유저 x sum(영화 본 유저와의 유사도), 코드에서는 분모에 1을 더함)


예측 = (A x B) / N

In [None]:
# (610 x 3) @ (3 x 1), flattened to (610,), then divided element-wise by sim_N (610,).
# sub_sim_mat holds the similarity values.
pred_y = np.matmul(sub_sim_mat, watched_user_y).flatten() / sim_N
print(pred_y.shape)

(610,)


예측결과 데이터프레임화

In [None]:
all_users = list(user_similarity.index)
len(all_users)

610

In [None]:
title_list = [title] * len(all_users)
len(title_list)

610

In [None]:
cur_pred = pd.DataFrame(zip(title_list, all_users, pred_y),
                        columns=['title', 'userId', 'pred_rating'])
cur_pred

Unnamed: 0,title,userId,pred_rating
0,Black Robe,1,1.393861
1,Black Robe,2,0.545474
2,Black Robe,3,0.205978
3,Black Robe,4,1.154516
4,Black Robe,5,0.934909
...,...,...,...
605,Black Robe,606,1.557820
606,Black Robe,607,1.355306
607,Black Robe,608,1.803464
608,Black Robe,609,0.763572


## 평점 예측

In [None]:
try:
    # tqdm.auto selects the notebook progress bar when one can be rendered
    # and falls back to the console bar otherwise.
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


def modeling(similarity_matrix, data, smoothing=1):
    """Predict a rating for every (user, title) pair with user-based CF.

    For each title, the prediction for user ``x`` is the similarity-weighted
    sum of the ratings given by the users who rated that title, divided by
    the sum of those similarities plus ``smoothing``.

    Args:
        similarity_matrix: Square DataFrame of user-user similarities whose
            index and columns are userIds.
        data: DataFrame with at least ``userId``, ``title`` and ``rating``
            columns, one row per rating. Rows whose title is missing are
            ignored.
        smoothing: Constant added to the denominator. The default of 1
            keeps the denominator away from zero but shrinks every
            prediction; 0 gives the plain weighted average (and NaN/inf
            where a user's similarities sum to zero).

    Returns:
        DataFrame with columns ``title``, ``userId`` and ``pred_rating``,
        with titles in sorted order and one row per user for each title.
        The per-title row index (0..n_users-1) is not reset.

    Raises:
        KeyError: If a userId in ``data`` is missing from
            ``similarity_matrix``.
    """
    columns = ['title', 'userId', 'pred_rating']
    user_ids = similarity_matrix.index.to_numpy()

    # One pass over the data instead of a full boolean scan per title.
    # groupby sorts the keys and drops missing titles, which would otherwise
    # break sorting of mixed str/NaN values.
    grouped = data.groupby('title', sort=True)

    per_title = []
    for title, rows in tqdm(grouped, total=grouped.ngroups):
        # Similarities: (all users) x (users who rated this title)
        watched_user = rows['userId'].tolist()
        sub_sim_mat = similarity_matrix.loc[watched_user].T.to_numpy()
        sim_N = sub_sim_mat.sum(axis=1) + smoothing

        # Ratings as a column vector for the matrix product
        watched_user_y = rows['rating'].to_numpy(dtype=float).reshape(-1, 1)
        pred_y = np.matmul(sub_sim_mat, watched_user_y).flatten() / sim_N

        per_title.append(pd.DataFrame({'title': title,
                                       'userId': user_ids,
                                       'pred_rating': pred_y},
                                      columns=columns))

    if not per_title:
        return pd.DataFrame(columns=columns)

    # Concatenate once; growing a DataFrame inside the loop is quadratic.
    return pd.concat(per_title, axis=0)

In [None]:
df_pred_all = modeling(user_similarity, df)
df_pred_all

## 예측 정확도 평가

테스트용 샘플 데이터 추출

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20,000 ratings as an evaluation sample, stratified by userId;
# the other part of the split is discarded.
# NOTE(review): df_pred_all is built from the full df, so these sampled
# ratings were also inputs to the predictions being scored — confirm this
# is intended.
_, test_data = train_test_split(df[['userId', 'title', 'rating']],
                test_size=20000, random_state=1234, stratify=df['userId'])
test_data

Unnamed: 0,userId,title,rating
72039,464,Escape from L.A.,3.0
90528,590,Carlito's Way,4.0
20360,135,"Nightmare Before Christmas, The",4.0
42826,288,"Naked Gun 2 1/2: The Smell of Fear, The",4.0
37923,258,Guardians of the Galaxy 2,3.5
...,...,...,...
94958,599,Thor: Ragnarok,3.0
63609,414,Revenge of the Nerds II: Nerds in Paradise,2.0
68727,448,"Wild Bunch, The",5.0
38371,263,Gosford Park,4.5


예측 평점 조인

In [None]:
test_data = pd.merge(test_data, df_pred_all, on=['userId', 'title'], how='left')
test_data

MAE, MSE, RMSE

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Actual vs. predicted ratings for the sampled test rows
# NOTE(review): pred_rating comes from a left merge, so unmatched rows would
# be NaN — confirm there are none before scoring.
true_y = np.array(test_data['rating'])
pred_y = np.array(test_data['pred_rating'])

mae = mean_absolute_error(y_true=true_y, y_pred=pred_y)
mse = mean_squared_error(y_true=true_y, y_pred=pred_y)
# RMSE is the square root of the MSE
rmse = np.sqrt(mse)

print(f"MAE  : {str(round(mae, 2))}")
print(f"MSE  : {str(round(mse, 2))}")
print(f"RMSE : {str(round(rmse, 2))}")

Coverage

예측 평점이 4.0점을 초과하는 경우 영화를 봤다고 가정 (혹은 추천했다고 가정) — 아래 코드는 `pred_y > 4.0` 기준을 사용

In [None]:
# Number of "recommended" predictions: entries of pred_y strictly above 4.0.
# NOTE(review): this counts rows of pred_y (one per test rating), not distinct
# titles, and get_precision is later called with threshold 3 — confirm which
# threshold and which unit (rows vs. titles) are intended.
n_recommends = sum(1 * (pred_y > 4.0))
n_recommends

In [None]:
n_movies = df['title'].nunique()

In [None]:
# Coverage
n_recommends / n_movies

Precision

In [None]:
def get_precision(true_y, pred_y, threshold):
    """Return the precision of thresholded rating predictions.

    A rating counts as positive when it is ``>= threshold``. Precision is
    the share of predicted positives that are also actual positives:
    ``TP / (TP + FP)``.

    Args:
        true_y: Array-like of actual ratings.
        pred_y: Array-like of predicted ratings, aligned with ``true_y``.
            NaN predictions compare as not positive.
        threshold: Rating at or above which an item counts as positive.

    Returns:
        Precision as a float in [0, 1]; 0.0 when nothing is predicted
        positive, so the ratio is never a division by zero.
    """
    actual_positive = np.asarray(true_y) >= threshold
    predicted_positive = np.asarray(pred_y) >= threshold

    n_predicted = int(np.count_nonzero(predicted_positive))
    if n_predicted == 0:
        return 0.0

    true_positive = int(np.count_nonzero(actual_positive & predicted_positive))

    # Divide by predicted positives; dividing by actual positives is recall.
    return true_positive / n_predicted

In [None]:
get_precision(true_y, pred_y, 3)

## 평점 예측 결과 저장

In [None]:
df_pred_all.to_csv(path + "user_based_cf_prediction.csv", index=False)

# 사용자 유사도 기반 추천

In [None]:
# 샘플 사용자
user_id = 1

In [None]:
user_mov = df[df['userId'] == user_id]
user_mov

In [None]:
user_mov_pred = df_pred_all[df_pred_all['userId'] == user_id]
user_mov_pred

In [None]:
user_mov = pd.merge(user_mov, user_mov_pred, on=['userId', 'title'], how='right')
user_mov

In [None]:
# Movies the user has not rated yet: after the right merge, rows that come
# only from the predictions have no movieId.
movie_candidate = user_mov[user_mov['movieId'].isnull()]
movie_candidate

In [None]:
# Top 10 candidates by predicted rating
movie_candidate.sort_values(by='pred_rating', ascending=False)[:10]

In [None]:
# 사용자가 안본 영화 중 예측 평점 높은 10개 추천
def get_recomendation(user_id, top_n=10, ratings=None, predictions=None):
    """Recommend the unseen titles with the highest predicted rating.

    Args:
        user_id: userId to recommend for.
        top_n: Maximum number of titles to return (default 10).
        ratings: DataFrame of observed ratings with ``userId`` and ``title``
            columns. Defaults to the notebook-level ``df``.
        predictions: DataFrame with ``userId``, ``title`` and
            ``pred_rating`` columns. Defaults to the notebook-level
            ``df_pred_all``.

    Returns:
        List of titles the user has not rated, ordered by descending
        predicted rating; empty if the user has no predictions.
    """
    if ratings is None:
        ratings = df
    if predictions is None:
        predictions = df_pred_all

    # Titles the user has already rated
    seen_titles = set(ratings.loc[ratings['userId'] == user_id, 'title'].dropna())

    # Filter the user's predictions directly instead of merging and using a
    # null movieId as the "unseen" marker.
    user_pred = predictions[predictions['userId'] == user_id]
    unseen = user_pred[~user_pred['title'].isin(seen_titles)]

    best = unseen.sort_values(by='pred_rating', ascending=False).head(top_n)
    return best['title'].tolist()

In [None]:
get_recomendation(user_id)