# Collaborative filtering

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, accuracy
from sklearn.model_selection import cross_validate, train_test_split

## Load data files

In [2]:
ratings_df = pd.read_csv('data/ratings.csv')

In [3]:
# Drop the timestamp column. errors='ignore' keeps this cell re-runnable:
# without it, running the cell a second time raises KeyError because the
# column has already been removed from ratings_df.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')

In [4]:
ratings_df.shape

(26024289, 3)

In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [6]:
links_df = pd.read_csv('data/links.csv')

In [7]:
# Drop the imdbId column, leaving movieId and tmdbId.
# errors='ignore' keeps this cell re-runnable: without it, a second run
# raises KeyError because imdbId has already been removed from links_df.
links_df = links_df.drop(columns='imdbId', errors='ignore')
links_df.head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [8]:
movies_df = pd.read_csv('data/tmdb_5000_movies.csv')

## TMDB 데이터셋 기반으로 영화 ID 리스트 생성

In [9]:
# Build a movieId -> title lookup for movies present in the TMDB file:
#   1. index the TMDB titles by their TMDB id,
#   2. attach the movieId that links_df maps to that TMDB id,
#   3. drop rows with missing values (e.g. no movieId found for the TMDB id),
#   4. cast movieId to int (the join leaves it as float) and make it the index.
movieId_list = (
    movies_df[['id', 'title']]
    .set_index('id')
    .join(links_df.set_index('tmdbId')[['movieId']])
    .dropna()
    .assign(movieId=lambda joined: joined['movieId'].map(int))
    .set_index('movieId')
)

In [10]:
movieId_list.shape

(4599, 1)

In [11]:
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## TMDB 데이터셋에 존재하지 않는 영화에 대한 평점 삭제

In [12]:
# Keep only the ratings whose movieId appears in movieId_list, and pin the
# column order to (userId, movieId, rating).
ratings_df = ratings_df.loc[
    lambda ratings: ratings['movieId'].isin(movieId_list.index),
    ['userId', 'movieId', 'rating'],
]

In [13]:
ratings_df.shape

(17977131, 3)

## 적은 수의 평점을 받은 영화 삭제

In [14]:
# Number of ratings per movie, as a one-column ('count') frame indexed by movieId.
df_movie_summary = (
    ratings_df
    .groupby('movieId')['rating']
    .count()
    .to_frame('count')
)
print('The number of tmdb movies in rating.csv dataset : ', len(df_movie_summary))

The number of tmdb movies in rating.csv dataset :  4597


## 최소 5177개의 평점 기록이 있는 영화만 쓰도록 함.

In [15]:
# Minimum number of ratings a movie needs to be kept: the 80th percentile of
# the per-movie rating counts, rounded to a whole number.
movie_benchmark = int(
    round(df_movie_summary['count'].quantile(q=0.8), ndigits=0)
)
movie_benchmark

5177

In [16]:
# movieIds whose rating count falls below the benchmark; these get removed later.
drop_movie_list = df_movie_summary.index[df_movie_summary['count'].lt(movie_benchmark)]
print("Dropped movie count : ", len(drop_movie_list))
print("Target movie count : ", len(df_movie_summary) - len(drop_movie_list))

Dropped movie count :  3677
Target movie count :  920


## 적은 수의 평점을 남긴 유저 삭제를 위한 기준점 설정

In [17]:
# Number of ratings per user, as a one-column ('count') frame indexed by userId.
df_user_summary = (
    ratings_df
    .groupby('userId')['rating']
    .count()
    .to_frame('count')
)
df_user_summary.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,23
2,14
3,7
4,50
5,17


## 280개 이상의 평점을 남긴 유저(평점 수 기준 상위 5%)의 데이터만을 사용하도록 한다.

In [18]:
# Minimum number of ratings a user needs to be kept: the 95th percentile of
# the per-user rating counts, rounded to a whole number.
user_benchmark = int(
    round(df_user_summary['count'].quantile(q=0.95), ndigits=0)
)
# userIds with fewer ratings than the benchmark; these get removed later.
drop_user_list = df_user_summary.index[df_user_summary['count'].lt(user_benchmark)]
user_benchmark

280

## 앞서 설정한 기준점을 바탕으로 평점 데이터셋 가공

In [19]:
# Remove every rating that belongs to a dropped movie or to a dropped user.
# A single combined mask selects the same rows as filtering twice in sequence.
ratings_df = ratings_df[
    ~(
        ratings_df['movieId'].isin(drop_movie_list)
        | ratings_df['userId'].isin(drop_user_list)
    )
]

## 평점 데이터셋 가공 후의 모습

In [20]:
ratings_df.shape

(4488072, 3)

In [21]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
1710,24,1,4.0
1713,24,16,3.0
1714,24,17,3.0
1715,24,21,4.0
1716,24,25,3.0


## 데이터 전처리 후 ratings dataframe에 남은 영화 종류의 총 수

In [22]:
# num of tmdb movies after data clean
ratings_df['movieId'].nunique()

920

In [23]:
# Data Example
print(ratings_df.iloc[::500000, :])

          userId  movieId  rating
1710          24        1     4.0
2868550    29887     2599     4.5
5806126    59698    91500     2.0
8740001    90150     2948     3.0
11699629  121323     3006     5.0
14591950  151662     6874     4.0
17420662  180783    36529     3.5
20290065  210853    68954     3.5
23179335  240551     8533     3.0


In [24]:
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## userId = 231이 과거에 평점 5점을 준 영화들

In [25]:
# Titles of the movies that user 231 rated 5: select the matching ratings,
# index them by movieId, and look the titles up in movieId_list.
df_1 = (
    ratings_df
    .loc[(ratings_df['userId'] == 231) & (ratings_df['rating'] == 5)]
    .set_index('movieId')
    .join(movieId_list)['title']
)
df_1

movieId
110                                             Braveheart
260                                              Star Wars
318                               The Shawshank Redemption
527                                       Schindler's List
1196                               The Empire Strikes Back
1198                               Raiders of the Lost Ark
1259                                           Stand by Me
2028                                   Saving Private Ryan
3578                                             Gladiator
4993     The Lord of the Rings: The Fellowship of the Ring
33166                                                Crash
60684                                             Watchmen
Name: title, dtype: object

## ratings dataframe을 이용하여 trainset 생성

In [26]:
# Reader() defaults to rating_scale=(1, 5), but this dataset contains
# half-star ratings (e.g. 4.5, 3.5), so the scale is taken from the data
# itself. Surprise clips predictions to the Reader's scale, so a wrong scale
# distorts the estimates.
reader = Reader(
    rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max())
)

# SVD initialises its factors randomly; a fixed seed makes a re-run of the
# notebook produce the same model and therefore the same recommendations.
svd = SVD(random_state=42)

# Candidate movies to score for a user: every entry of movieId_list that
# survived the rating-count filter. .copy() makes `result` an independent
# frame so later column assignments do not hit a filtered view
# (SettingWithCopyWarning).
result = movieId_list.reset_index()
result = result[~result['movieId'].isin(drop_movie_list)].copy()

# Load the (userId, movieId, rating) triples into Surprise and train on all
# of them -- no hold-out split is made here.
data_set = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
train_data_set = data_set.build_full_trainset()

## 모델 학습

In [27]:
# Fit the SVD model on the full trainset; the %time magic reports how long the fit takes.
%time svd.fit(train_data_set)

Wall time: 4min 27s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f2a57ed0f0>

## 예측 리스트에서 유저가 이미 평점을 남긴 영화를 제거한 뒤, 예측 평점이 가장 높은 상위 10개 영화를 반환

In [28]:
def user_rating_prediction(userId, result):
    """Return the top-10 predicted movies that ``userId`` has not rated yet.

    Uses the notebook-level ``svd`` model, ``ratings_df`` and ``movieId_list``.

    Parameters
    ----------
    userId
        User id as it appears in ``ratings_df['userId']``.
    result : pandas.DataFrame
        Candidate movies; must contain ``movieId`` and ``title`` columns.
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        The candidate columns without ``movieId`` plus ``Estimate_Score``
        (the model's estimated rating), sorted by ``Estimate_Score``
        descending and limited to at most 10 rows.
    """
    # Work on a copy. Binding the argument directly (`rating_prediction = result`)
    # aliases the caller's frame, so every call would write an Estimate_Score
    # column into the shared candidate table.
    rating_prediction = result.copy()
    rating_prediction['Estimate_Score'] = rating_prediction['movieId'].apply(
        lambda movie_id: svd.predict(userId, movie_id).est
    )

    # Titles of everything this user has already rated; matching is done on
    # title, so any candidate sharing a title with a rated movie is excluded.
    seen_movie_ids = ratings_df.loc[ratings_df['userId'] == userId, 'movieId']
    seen_titles = movieId_list.loc[movieId_list.index.isin(seen_movie_ids), 'title']

    new_to_user = rating_prediction[~rating_prediction['title'].isin(seen_titles.values)]
    new_to_user = new_to_user.sort_values('Estimate_Score', ascending=False)
    return new_to_user.drop(columns='movieId').head(10)

## movietitle에 해당하는 영화에 평점 5점을 남긴 유저의 아이디 목록을 반환

In [29]:
def title2userlist(movietitle):
    """Return the ids of all users who rated the given movie 5.0.

    Uses the notebook-level ``movieId_list`` and ``ratings_df``.

    Parameters
    ----------
    movietitle : str
        Exact title as stored in ``movieId_list['title']``. If several
        entries share the title, the first one is used.

    Returns
    -------
    numpy.ndarray
        ``userId`` values of the matching 5.0 ratings (may be empty).

    Raises
    ------
    IndexError
        If no entry of ``movieId_list`` has this title.
    """
    matching_ids = movieId_list.index[(movieId_list['title'] == movietitle).values]
    if len(matching_ids) == 0:
        # Same exception type as the original `.tolist()[0]`, but with a
        # message that names the missing title.
        raise IndexError('no movie titled {!r} in movieId_list'.format(movietitle))
    movie_id = matching_ids[0]

    # One combined mask on ratings_df. Chaining two boolean selections
    # (`df[mask_a][mask_b]`) applies a full-length mask to an already filtered
    # frame and triggers pandas' "Boolean Series key will be reindexed" warning.
    gave_five_stars = (ratings_df['movieId'] == movie_id) & (ratings_df['rating'] == 5.0)
    return ratings_df.loc[gave_five_stars, 'userId'].values

## 위에 정의된 함수들을 이용하여, 주어진 영화에 평점 5점을 준 유저 최대 100명 각각에 대한 추천 영화 목록(유저당 10편)을 반환

In [30]:
def get_recommendation_by_title(movietitle, set_count=100):
    """Collect top-10 recommendation lists for users who gave ``movietitle`` 5 stars.

    Users come from ``title2userlist(movietitle)`` and are processed in that
    order. For each one, ``user_rating_prediction`` is run against the
    notebook-level ``result`` frame; the user's titles are kept only when
    exactly 10 were returned. Collection stops once ``set_count`` lists have
    been gathered or the users run out.

    Returns a list of arrays of titles, one array per kept user.
    """
    collected = []
    exhausted = object()
    fans = iter(title2userlist(movietitle))

    while len(collected) != set_count:
        fan_id = next(fans, exhausted)
        if fan_id is exhausted:
            break
        top_titles = user_rating_prediction(fan_id, result)['title'].values
        if len(top_titles) != 10:
            # Fewer than 10 unseen candidates for this user: skip them.
            continue
        collected.append(top_titles)

    return collected

## Alien 영화를 좋아하는 유저 100명이 좋아할 만한 영화의 목록을 출력

In [31]:
final_result = get_recommendation_by_title('Alien')
final_result

  This is separate from the ipykernel package so we can avoid doing imports until


[array(['The Princess Bride', 'The Dark Knight', 'Braveheart',
        "National Lampoon's Vacation", 'Guardians of the Galaxy',
        'Toy Story 3', 'How to Train Your Dragon', 'Iron Man',
        'Batman Begins', 'Up'], dtype=object),
 array(['Seven Samurai', 'The Lives of Others', 'Spirited Away',
        'Being John Malkovich', 'Eternal Sunshine of the Spotless Mind',
        'Lawrence of Arabia', 'Das Boot', 'No Country for Old Men',
        'The Usual Suspects', 'City of God'], dtype=object),
 array(['Psycho', '12 Angry Men', 'City of God', 'Interstellar',
        'Saving Private Ryan', 'There Will Be Blood',
        "National Lampoon's Vacation", 'The Lives of Others', 'Ex Machina',
        'Mad Max: Fury Road'], dtype=object),
 array(['The Lord of the Rings: The Fellowship of the Ring', 'Casablanca',
        'The Lord of the Rings: The Return of the King',
        'The Lord of the Rings: The Two Towers', 'Seven Samurai',
        'Spirited Away', 'The Lives of Others',
       