# Collaborative Filtering

Memory-Based Algorithm
- Item based (더 많이 함) <-- dot product 없이 유사도를 기반으로 주로 활용됨
- User based
  - 우선 dot product 없는 버전 구현 (Item based와 방식 동일)


Model-Based Algorithm
- Latent Factor 협업 필터링 방법 (Matrix Factorization)

# 구글 드라이브 연결

In [3]:
# Mount Google Drive into the Colab filesystem so the lecture files can be read.
import os
from google.colab import drive
drive.mount('/content/drive')

# Folder that holds the lecture data; list it to confirm the CSV files are present.
path = "/content/drive/MyDrive/lecture"
os.listdir(path)

Mounted at /content/drive


['movies.csv',
 'ratings.csv',
 'tags.csv',
 'links.csv',
 'movie_user.csv',
 'movie_user.gsheet',
 '제목 없는 문서.gdoc',
 '협업 필터링 - Item-based.ipynb',
 '협업 필터링 - User-based with dot production.ipynb',
 'movies_refined.csv',
 'ratings_refined.csv',
 '무비렌즈 데이터 전처리.ipynb',
 '협업 필터링 - User-based.ipynb']

In [4]:
import pandas as pd
import numpy as np

# 데이터 로드

In [6]:
# Load the ratings, keeping only the three columns named in `usecols`.
ratings = pd.read_csv("/content/drive/MyDrive/lecture/ratings_refined.csv", usecols=['userId', 'movieId', 'rating'])
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100784,610,166534,4.0
100785,610,168248,5.0
100786,610,168250,5.0
100787,610,168252,5.0


In [7]:
# Load the movie catalogue, keeping only the id and the title.
movies = pd.read_csv("/content/drive/MyDrive/lecture/movies_refined.csv", usecols=['movieId', 'title'])
movies

Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II
...,...,...
9698,193581,Black Butler: Book of the Atlantic
9699,193583,No Game No Life: Zero
9700,193585,Flint
9701,193587,Bungo Stray Dogs: Dead Apple


In [8]:
# Attach the title to every rating. A left join keeps all rating rows; a rating
# whose movieId has no match in `movies` would get a missing title.
df = pd.merge(ratings, movies, on='movieId', how='left')
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
100784,610,166534,4.0,Split
100785,610,168248,5.0,John Wick: Chapter Two
100786,610,168250,5.0,Get Out
100787,610,168252,5.0,Logan


null 값 체크

In [9]:
# Names of the columns that contain at least one missing value.
df.columns[df.isna().any()].tolist()

[]

영화명 결측치 체크

In [10]:
# Rows with a missing title, e.g. ratings the left merge could not match to a movie.
df[df['title'].isnull()]

Unnamed: 0,userId,movieId,rating,title


# User-based CF

## 유저 유사도 행렬 준비

In [11]:
# Build the user x title rating matrix: one row per user, one column per title,
# NaN where the user has no rating for that title.
# NOTE: pivot_table aggregates with the mean by default, so if several rows share
# the same (userId, title) pair their ratings are averaged into one cell.
user_movie = df.pivot_table(values='rating', index='userId', columns='title')
user_movie

title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# 610 x 9413 matrix
# Each user is treated as a 9413-dimensional vector
user_movie.shape

(610, 9413)

## 결측치 처리

null 값이 있으면 cosine similarity 함수가 동작하지 않음

하지만 null 값을 0으로 치환하고 계산할 경우 결과가 달라짐

(마치 해당 영화를 보고 0점을 준 것으로 계산됨)

In [13]:
# NOTE(review): the drive was already mounted in the first code cell, so this
# call only reports "already mounted"; it looks redundant.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Replace missing ratings with 0 so cosine similarity can be computed.
# fillna() already returns a new DataFrame and leaves `user_movie` untouched,
# so no explicit copy is needed beforehand.
user_movie_tmp = user_movie.fillna(0)

## 유사도 행렬 계산

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the zero-filled
# rating matrix; the result has one row and one column per user.
user_similarity_matrix = cosine_similarity(user_movie_tmp)
user_similarity_matrix.shape

(610, 610)

In [20]:
# Inspect the raw similarity array.
user_similarity_matrix

array([[1.        , 0.02743256, 0.06004793, ..., 0.29415306, 0.09408534,
        0.1500534 ],
       [0.02743256, 1.        , 0.        , ..., 0.04692625, 0.0275654 ,
        0.10262448],
       [0.06004793, 0.        , 1.        , ..., 0.02123375, 0.        ,
        0.03218075],
       ...,
       [0.29415306, 0.04692625, 0.02123375, ..., 1.        , 0.12260061,
        0.32839128],
       [0.09408534, 0.0275654 , 0.        , ..., 0.12260061, 1.        ,
        0.05332821],
       [0.1500534 , 0.10262448, 0.03218075, ..., 0.32839128, 0.05332821,
        1.        ]])

## 데이터 프레임화

In [22]:
# The pivot table's row labels (userId), used to label the similarity matrix.
user_ids = user_movie.index
user_ids

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            601, 602, 603, 604, 605, 606, 607, 608, 609, 610],
           dtype='int64', name='userId', length=610)

In [None]:
# User-user similarity: wrap the similarity array in a DataFrame whose rows and
# columns are both labelled by userId, so similarities can be looked up by id.
user_similarity = pd.DataFrame(user_similarity_matrix,
                                index=user_ids, columns=user_ids)
print(user_similarity.shape)
user_similarity.head()

# 사용자 유사도 기반 추천

In [None]:
# Sample user
user_id = 1

In [None]:
# The 10 most similar users: sort by similarity (descending) and skip the first
# entry, which is presumably the user themself (self-similarity of 1.0).
user_similarity[user_id].sort_values(ascending=False)[1:11]

In [None]:
# Rank all users by similarity to `user_id`, skip the top entry (presumably the
# user themself), keep the next ten, and turn the userId index into a column.
similar_users = (
    user_similarity[user_id]
    .sort_values(ascending=False)
    .iloc[1:11]
    .reset_index()
)
similar_users

In [None]:
# First title listed in `df` for user 266.
# NOTE(review): 266 is hard-coded; presumably one of the similar users found above.
df.loc[df['userId'] == 266, 'title'].tolist()[0]

In [None]:
# For each similar user take the first title listed for them in `df`,
# then show the distinct titles.
titles = [
    df.loc[df['userId'] == neighbor_id, 'title'].iloc[0]
    for neighbor_id in similar_users['userId']
]
list(set(titles))

In [None]:
# Recommend the movies watched most by the users most similar to the given user.
def get_recomendation(user_id, n_neighbors=10, top_n=10):
    """Recommend unseen movies that the most similar users watched the most.

    Reads the notebook-level ``user_similarity`` (user x user similarity
    DataFrame labelled by userId) and ``df`` (one row per rating, with
    ``userId`` and ``title`` columns).

    Args:
        user_id: Label of the target user in ``user_similarity``.
        n_neighbors: Number of most-similar users to consult.
        top_n: Maximum number of titles to return; ``None`` returns all.

    Returns:
        A list of titles ordered by how many of the consulted neighbours
        rated them (most first, ties broken alphabetically). Titles the
        target user has already rated are left out.

    Raises:
        KeyError: If ``user_id`` is not a label of ``user_similarity``.
    """
    # Remove the user by label rather than by position: another user can tie
    # at similarity 1.0, so "skip the first row" is not a reliable exclusion.
    neighbor_scores = user_similarity[user_id].drop(labels=user_id)
    neighbor_ids = neighbor_scores.nlargest(n_neighbors).index

    # Titles the target user already rated are not useful recommendations.
    seen_titles = set(df.loc[df['userId'] == user_id, 'title'])

    neighbor_rows = df[df['userId'].isin(neighbor_ids)]
    candidates = neighbor_rows[~neighbor_rows['title'].isin(seen_titles)]

    # Count how many distinct neighbours rated each title, then rank with a
    # deterministic tie-break so repeated calls give the same order.
    ranked = (
        candidates.groupby('title')['userId']
        .nunique()
        .reset_index(name='watchers')
        .sort_values(['watchers', 'title'], ascending=[False, True])
    )
    if top_n is not None:
        ranked = ranked.head(top_n)

    return ranked['title'].tolist()

In [None]:
# Show the recommendations for the sample user.
get_recomendation(user_id)