# Collaborative Filtering

Memory-Based Algorithm
- Item based (더 많이 함) <-- dot production없이 유사도를 기반으로 주로 활용됨
- User based


Model-Based Algorithm
- Latent Factor 협업 필터링 방법 (Matrix Factorization)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# 구글 드라이브 연결

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/lecture"
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['movies.csv',
 'ratings.csv',
 'tags.csv',
 'links.csv',
 'movie_user.csv',
 'movie_user.gsheet',
 '제목 없는 문서.gdoc',
 '협업 필터링 - Item-based.ipynb',
 'movies_refined.csv',
 'ratings_refined.csv',
 '무비렌즈 데이터 전처리.ipynb',
 '협업 필터링 - User-based.ipynb',
 '협업 필터링 - User-based with dot production.ipynb',
 'Untitled0.ipynb',
 '협업_필터링_Item_based_ipynb의_사본.ipynb']

In [None]:
import pandas as pd
import numpy as np


# 데이터 로드

In [None]:
ratings = pd.read_csv("/content/drive/MyDrive/lecture/ratings_refined.csv", usecols=['userId', 'movieId', 'rating'])
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100784,610,166534,4.0
100785,610,168248,5.0
100786,610,168250,5.0
100787,610,168252,5.0


In [None]:
movies = pd.read_csv( "/content/drive/MyDrive/lecture/movies_refined.csv", usecols=['movieId', 'title'])
movies

Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II
...,...,...
9698,193581,Black Butler: Book of the Atlantic
9699,193583,No Game No Life: Zero
9700,193585,Flint
9701,193587,Bungo Stray Dogs: Dead Apple


In [None]:
df = pd.merge(ratings, movies, on='movieId', how='left')
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
100784,610,166534,4.0,Split
100785,610,168248,5.0,John Wick: Chapter Two
100786,610,168250,5.0,Get Out
100787,610,168252,5.0,Logan


null 값 체크

In [None]:
df.columns[df.isna().any()].tolist()

[]

영화명 결측치 체크

In [None]:
df[df['title'].isnull()]

Unnamed: 0,userId,movieId,rating,title


# Item-based CF

## 영화 유사도 행렬 준비

In [None]:
movie_user = df.pivot_table(values='rating', index='title', columns='userId')
movie_user.to_csv('movie_user.csv')
movie_user


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation,,,,,,,,,,,...,,,,,,,,,,
'Round Midnight,,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
'Til There Was You,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx,,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union,,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos!,4.0,,,,,,,,,,...,,,,,,,,,,


In [None]:
# 9413 x 610 행렬
# 영화를 610차원의 벡터로 보려는 것
movie_user.shape

(9413, 610)

## 결측치 처리

null값이 있으면 cosine similarity 함수가 안돌아감

하지만, null값을 0으로 치환하고 계산할경우 결과가 달라짐

(마치 해당 영화를 보고 0점을 준것으로 계산)

In [None]:
#movie_user_tmp = movie_user.copy().fillna(0)
from sklearn.metrics.pairwise import pairwise_distances

# 사용자 평균 평점 계산
user_means = movie_user.mean(axis=1)

# 결측치를 사용자 평균 평점으로 대체
movie_user_tmp = movie_user.copy()
for col in movie_user.columns:
    movie_user_tmp[col].fillna(user_means, inplace=True)

# 피어슨-베르세르 유사도 계산
pearson_baseline_similarity_matrix = 1 - pairwise_distances(movie_user_tmp, metric='correlation')

# 결측치를 채워넣을 때 사용되는 코드
movie_user_tmp = movie_user.apply(lambda col: col.combine_first(user_means))
from sklearn.metrics.pairwise import pairwise_distances

# 결측치를 0으로 대체
movie_user_tmp = movie_user.fillna(0)

# MSD 계산
msd_similarity_matrix = 1 / (1 + pairwise_distances(movie_user_tmp.T, metric='sqeuclidean'))

# 결측치를 채워넣을 때 사용되는 코드
movie_user_tmp = movie_user.apply(lambda col: col.combine_first(movie_user_tmp.mean(axis=1)))


In [None]:
movie_user_tmp

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,...,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,4.000000
'Hellboy': The Seeds of Creation,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,...,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557,0.006557
'Round Midnight,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,...,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475,0.011475
'Salem's Lot,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,...,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197,0.008197
'Til There Was You,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,...,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115,0.013115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0.139344,0.139344,0.139344,0.139344,0.139344,0.139344,0.139344,0.139344,0.139344,0.139344,...,0.139344,0.139344,5.000000,0.139344,0.139344,0.139344,0.139344,4.500000,0.139344,0.139344
xXx,0.109016,0.109016,0.109016,0.109016,0.109016,0.109016,0.109016,0.109016,1.000000,0.109016,...,0.109016,0.109016,0.109016,0.109016,0.109016,0.109016,0.109016,3.500000,0.109016,2.000000
xXx: State of the Union,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,...,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,0.016393,1.500000
¡Three Amigos!,4.000000,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,...,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607,0.133607


In [None]:
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist


# 사용자 간의 MSD 유사도 계산
msd_similarity_matrix = 1 / (1 + cdist(movie_user_tmp, movie_user_tmp, metric='euclidean')**2)

# 결과 확인
print(msd_similarity_matrix.shape)  # 행렬의 크기 확인


(9413, 9413)


## 유사도 행렬 계산

In [None]:
msd_similarity_matrix

array([[1.        , 0.0303996 , 0.02419474, ..., 0.03853976, 0.0033108 ,
        0.055591  ],
       [0.0303996 , 1.        , 0.0742454 , ..., 0.02645739, 0.0033108 ,
        0.055591  ],
       [0.02419474, 0.0742454 , 1.        , ..., 0.02171234, 0.00324434,
        0.0377174 ],
       ...,
       [0.03853976, 0.02645739, 0.02171234, ..., 1.        , 0.00330626,
        0.0433535 ],
       [0.0033108 , 0.0033108 , 0.00324434, ..., 0.00330626, 1.        ,
        0.00345558],
       [0.055591  , 0.055591  , 0.0377174 , ..., 0.0433535 , 0.00345558,
        1.        ]])

## 데이터 프레임화

In [None]:
movie_titles = movie_user.index
movie_titles

Index([''71', ''Hellboy': The Seeds of Creation', ''Round Midnight',
       ''Salem's Lot', ''Til There Was You', ''Tis the Season for Love',
       ''burbs, The', ''night Mother', '(500) Days of Summer',
       '*batteries not included',
       ...
       'Zulu', '[REC]', '[REC]²', '[REC]³ 3 Génesis',
       'anohana: The Flower We Saw That Day - The Movie', 'eXistenZ', 'xXx',
       'xXx: State of the Union', '¡Three Amigos!',
       'À nous la liberté (Freedom for Us)'],
      dtype='object', name='title', length=9413)

In [None]:
# 영화-영화 유저관람 내역에 대한 유사도
movie_similarity = pd.DataFrame(msd_similarity_matrix,
                                index=movie_titles, columns=movie_titles)
print(movie_similarity.shape)
movie_similarity

(9413, 9413)


title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.000000,0.030400,0.024195,0.023883,0.019679,0.052027,0.004639,0.038576,0.001583,0.010640,...,0.011483,0.008151,0.032807,0.091181,0.038576,0.002724,0.004716,0.038540,0.003311,0.055591
'Hellboy': The Seeds of Creation,0.030400,1.000000,0.074245,0.023883,0.019679,0.052027,0.004639,0.038576,0.001520,0.010640,...,0.011483,0.006485,0.017155,0.028695,0.038576,0.002724,0.004402,0.026457,0.003311,0.055591
'Round Midnight,0.024195,0.074245,1.000000,0.019890,0.016910,0.036061,0.005032,0.029079,0.001511,0.009817,...,0.010512,0.006180,0.015033,0.023134,0.029079,0.002679,0.004278,0.021712,0.003244,0.037717
'Salem's Lot,0.023883,0.023883,0.019890,1.000000,0.100345,0.035435,0.004463,0.028647,0.001503,0.009730,...,0.010424,0.006138,0.014881,0.022828,0.028647,0.002664,0.004246,0.021407,0.003223,0.037046
'Til There Was You,0.019679,0.019679,0.016910,0.100345,1.000000,0.026853,0.004319,0.022788,0.001493,0.008996,...,0.009571,0.005847,0.013169,0.018978,0.022788,0.002618,0.004121,0.018023,0.003155,0.027756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0.002724,0.002724,0.002679,0.002664,0.002618,0.002813,0.002443,0.002960,0.001253,0.002683,...,0.002579,0.002145,0.002584,0.002721,0.002770,1.000000,0.002393,0.002722,0.002083,0.002820
xXx,0.004716,0.004402,0.004278,0.004246,0.004121,0.004651,0.002889,0.004529,0.001708,0.003671,...,0.004451,0.004263,0.004589,0.005017,0.004529,0.002393,1.000000,0.005132,0.002463,0.004671
xXx: State of the Union,0.038540,0.026457,0.021712,0.021407,0.018023,0.041204,0.004596,0.032380,0.001611,0.010228,...,0.010965,0.008519,0.019065,0.032509,0.032380,0.002722,0.005132,1.000000,0.003306,0.043353
¡Three Amigos!,0.003311,0.003311,0.003244,0.003223,0.003155,0.003445,0.003491,0.003589,0.001444,0.003570,...,0.003119,0.002487,0.003104,0.003306,0.003380,0.002083,0.002463,0.003306,1.000000,0.003456


In [None]:
# 영화-영화 유저관람 내역에 대한 유사도
movie_similarity = pd.DataFrame(msd_similarity_matrix,
                                index=movie_titles, columns=movie_titles)
print(movie_similarity.shape)
movie_similarity

(9413, 9413)


title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.000000,0.030400,0.024195,0.023883,0.019679,0.052027,0.004639,0.038576,0.001583,0.010640,...,0.011483,0.008151,0.032807,0.091181,0.038576,0.002724,0.004716,0.038540,0.003311,0.055591
'Hellboy': The Seeds of Creation,0.030400,1.000000,0.074245,0.023883,0.019679,0.052027,0.004639,0.038576,0.001520,0.010640,...,0.011483,0.006485,0.017155,0.028695,0.038576,0.002724,0.004402,0.026457,0.003311,0.055591
'Round Midnight,0.024195,0.074245,1.000000,0.019890,0.016910,0.036061,0.005032,0.029079,0.001511,0.009817,...,0.010512,0.006180,0.015033,0.023134,0.029079,0.002679,0.004278,0.021712,0.003244,0.037717
'Salem's Lot,0.023883,0.023883,0.019890,1.000000,0.100345,0.035435,0.004463,0.028647,0.001503,0.009730,...,0.010424,0.006138,0.014881,0.022828,0.028647,0.002664,0.004246,0.021407,0.003223,0.037046
'Til There Was You,0.019679,0.019679,0.016910,0.100345,1.000000,0.026853,0.004319,0.022788,0.001493,0.008996,...,0.009571,0.005847,0.013169,0.018978,0.022788,0.002618,0.004121,0.018023,0.003155,0.027756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0.002724,0.002724,0.002679,0.002664,0.002618,0.002813,0.002443,0.002960,0.001253,0.002683,...,0.002579,0.002145,0.002584,0.002721,0.002770,1.000000,0.002393,0.002722,0.002083,0.002820
xXx,0.004716,0.004402,0.004278,0.004246,0.004121,0.004651,0.002889,0.004529,0.001708,0.003671,...,0.004451,0.004263,0.004589,0.005017,0.004529,0.002393,1.000000,0.005132,0.002463,0.004671
xXx: State of the Union,0.038540,0.026457,0.021712,0.021407,0.018023,0.041204,0.004596,0.032380,0.001611,0.010228,...,0.010965,0.008519,0.019065,0.032509,0.032380,0.002722,0.005132,1.000000,0.003306,0.043353
¡Three Amigos!,0.003311,0.003311,0.003244,0.003223,0.003155,0.003445,0.003491,0.003589,0.001444,0.003570,...,0.003119,0.002487,0.003104,0.003306,0.003380,0.002083,0.002463,0.003306,1.000000,0.003456


# 영화 유사도 기반 추천

In [None]:
# 샘플 사용자
user = 1

# 샘플 사용자가 본 영화 중 임의로 한 개 선택
movie_title = df.loc[df['userId'] == user, 'title'][0]
movie_title

'Toy Story'

In [None]:
# 사용자가 본 영화와 유사도가 높은 영화 10개 추천 함수
def get_recomendation(title):
    return movie_similarity[title].sort_values(ascending=False)[1:11]#0번은 자기자신임

In [None]:
get_recomendation(movie_title)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score

# ... (Previous code to read data and create movie_user dataframe)

# 결측치를 0으로 대체
movie_user_tmp = movie_user.fillna(0)

# Jaccard 유사도 계산
jaccard_similarity_matrix = pd.DataFrame(1 - cdist(movie_user_tmp, movie_user_tmp, metric='jaccard'),
                                         index=movie_user.index, columns=movie_user.index)

# 결과 확인
print(jaccard_similarity_matrix.shape)  # 행렬의 크기 확인
jaccard_similarity_matrix

(9413, 9413)


title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.125,0.00,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.0
'Hellboy': The Seeds of Creation,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000,0.00,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.0
'Round Midnight,0.0,0.0,1.0,0.0,0.0,0.0,0.055556,0.000000,0.000000,0.0,...,0.000000,0.000,0.00,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.0
'Salem's Lot,0.0,0.0,0.0,1.0,0.5,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000,0.00,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.0
'Til There Was You,0.0,0.0,0.0,0.5,1.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000,0.00,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.000000,0.033333,0.0,...,0.000000,0.000,0.00,0.000000,0.0,1.000000,0.000000,0.00,0.046512,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.051282,0.000000,0.000000,0.0,...,0.038462,0.000,0.04,0.041667,0.0,0.000000,1.000000,0.04,0.021277,0.0
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000,0.00,0.000000,0.0,0.000000,0.040000,1.00,0.000000,0.0
¡Three Amigos!,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.038462,0.032258,0.0,...,0.034483,0.000,0.00,0.000000,0.0,0.046512,0.021277,0.00,1.000000,0.0
