# Collaborative filtering

## import library

In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, accuracy
from sklearn.model_selection import cross_validate, train_test_split

## import files

In [2]:
input_path = 'Downloads/input/'

In [3]:
ratings_df = pd.read_csv(input_path + 'ratings.csv')
ratings_df = ratings_df.drop(columns = 'timestamp')

In [4]:
print(ratings_df.shape)
ratings_df.head()

(26024289, 3)


Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [5]:
links_df = pd.read_csv(input_path + 'links.csv')
links_df = links_df.drop(columns = 'imdbId')

In [6]:
print(links_df.shape)
links_df.head()

(45843, 2)


Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [7]:
movies_df = pd.read_csv(input_path + 'tmdb_5000_movies.csv')

In [8]:
print(movies_df.shape)
movies_df.head()

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


## TMDB 데이터셋 기반으로 영화 ID 리스트 생성

In [9]:
movieId_list = movies_df[['id', 'title']].set_index('id').join(links_df.set_index('tmdbId')[['movieId']]).dropna()
movieId_list['movieId'] = movieId_list['movieId'].map(int)
movieId_list = movieId_list.set_index('movieId')

In [10]:
movieId_list.shape

(4599, 1)

In [11]:
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## TMDB 데이터셋에 존재하지 않는 영화에 대한 평점 삭제

In [12]:
ratings_df = ratings_df[ratings_df['movieId'].isin(movieId_list.index)]

In [13]:
ratings_df.shape

(17977131, 3)

## 적은 수의 평점을 받은 영화 삭제
- ### 최소 5177개의 평점 기록이 있는 영화만 쓰도록 함.

In [14]:
df_movie_summary = ratings_df.groupby('movieId')['rating'].agg(['count'])
movie_benchmark = int(round(df_movie_summary['count'].quantile(0.75),0))
movie_benchmark

3774

## 적은 수의 평점을 남긴 유저 삭제
- ### 최소 280개 이상의 리뷰를 남긴 유저의 데이터만을 사용하도록 함.

In [15]:
df_user_summary = ratings_df.groupby('userId')['rating'].agg(['count'])
user_benchmark = int(round(df_user_summary['count'].quantile(0.75),0))
user_benchmark

67

## 앞서 설정한 기준점을 바탕으로 평점 데이터셋 축소

In [16]:
orig_ratings_df = ratings_df.copy()

In [17]:
ratings_df = orig_ratings_df.copy()

In [18]:
original_movie_list = ratings_df['movieId'].unique()
original_user_count = ratings_df['userId'].nunique()

while True:
    df_movie_summary = ratings_df.groupby('movieId')['rating'].agg(['count'])
    drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index
    
    if len(drop_movie_list) == 0:
        break
    else:
        ratings_df = ratings_df[~ratings_df['movieId'].isin(drop_movie_list)]
    
    df_user_summary = ratings_df.groupby('userId')['rating'].agg(['count'])
    drop_user_list = df_user_summary[df_user_summary['count'] < user_benchmark].index
    
    if len(drop_user_list) == 0:
        break
    else:
        ratings_df = ratings_df[~ratings_df['userId'].isin(drop_user_list)]

drop_movie_list = list(set(original_movie_list) - set(ratings_df['userId'].unique()))

print("Dropped movie count : ", len(drop_movie_list))
print("Dropped user count : ", (original_user_count - ratings_df['userId'].nunique()))

Dropped movie count :  3567
Dropped user count :  208245


## 축소된 평점 데이터셋에서 한 userId가 남긴 평점 개수를 측정

In [19]:
ratings_df['count'] = ratings_df.groupby('userId')['rating'].transform('count')

## 평점 데이터셋 가공 후의 모습

In [20]:
print(ratings_df.shape)
ratings_df.head()

(10406807, 4)


Unnamed: 0,userId,movieId,rating,count
317,9,1,4.5,70
318,9,150,4.0,70
320,9,497,4.0,70
321,9,594,3.0,70
322,9,750,4.0,70


## 데이터 전처리 후 ratings dataframe에 남은 영화 종류의 총 수

In [21]:
# num of tmdb movies after data clean
ratings_df['movieId'].nunique()

985

In [22]:
# Data Example
print(ratings_df.iloc[::500000, :])

          userId  movieId  rating  count
317            9        1     4.5     70
1243663    12770    56174     4.0     81
2487283    25813     2359     4.0    307
3744890    39045     2161     4.5    319
4998745    51419     3114     4.0    125
6254815    64523     4034     1.5    303
7513976    77547     1644     3.0    146
8774176    90506     7147     1.0    181
10033527  103502      648     3.5    174
11296264  117194      858     5.0    129
12543779  130151     8784     5.0    226
13778426  143161      296     4.0     69
15016889  156186     1393     3.5    272
16248864  168830     1198     4.0    208
17488724  181477     1201     5.0     71
18734725  194606      527     4.5    141
19979490  207742      541     5.0    364
21225520  220483     5995     4.5    102
22481799  233410    48774     5.0     89
23741351  246455     3671     4.0    372
24996406  260013     1608     3.0    381


In [23]:
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## userId = 231이 과거에 평점 5점을 준 영화들

In [24]:
df_1 = ratings_df[(ratings_df['userId'] == 231) & (ratings_df['rating'] == 5)]
df_1 = df_1.set_index('movieId')
df_1 = df_1.join(movieId_list)['title']
df_1

movieId
110                                             Braveheart
260                                              Star Wars
318                               The Shawshank Redemption
527                                       Schindler's List
1196                               The Empire Strikes Back
1198                               Raiders of the Lost Ark
1259                                           Stand by Me
2028                                   Saving Private Ryan
3578                                             Gladiator
4993     The Lord of the Rings: The Fellowship of the Ring
33166                                                Crash
60684                                             Watchmen
Name: title, dtype: object

## ratings dataframe을 이용하여 trainset 생성

In [25]:
reader = Reader()
svd = SVD()

result = movieId_list.copy()
result = result.reset_index()
result = result[~result['movieId'].isin(drop_movie_list)]

# getting full dataset
data_set = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

train_data_set = data_set.build_full_trainset()
#trainset, testset = train_test_split(data_set, test_size=.25)

## 모델 학습

In [26]:
#model.fit(X_train, y_train).score(X_train, y_train)
%time svd.fit(train_data_set)

Wall time: 15min 48s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23e19e58128>

## 예측 리스트에서 유저가 이미 본 영화에 대한 정보를 제거한 뒤 평점 예측 결과를 출력

In [27]:
def user_rating_prediction(userId, target_movie_title, result):
    rating_prediction = result[]
    #print("length of result : ", len(result['movieId']))
    rating_prediction['Estimate_Score'] = rating_prediction['movieId'].apply(lambda x : svd.predict(userId, x).est)
    user_already_seen = ratings_df[ratings_df['userId'] == userId]
    user_already_seen = user_already_seen.set_index('movieId')
    user_already_seen = user_already_seen.join(movieId_list)['title']
    new_to_user = rating_prediction[~rating_prediction['title'].isin(user_already_seen.values)]
    new_to_user = new_to_user.sort_values('Estimate_Score', ascending=False)
    new_to_user = new_to_user.drop(columns='movieId')
    return new_to_user.head(10)

## movietitle에 해당하는 영화에 평점 5점을 남긴 유저의 아이디 목록을 출력

In [28]:
def title2userlist(movietitle):
    movie_id = movieId_list.index[movieId_list['title'] == movietitle].tolist()[0]
    user_id_list = ratings_df[ratings_df['movieId'] == movie_id][ratings_df['rating'] == 5.0].sort_values('count', ascending=False)['userId'].values
    return user_id_list

## 위에 정의된 함수들을 통해 유저 100명에 해당하는 영화의 목록을 출력

In [29]:
def get_recommendation_by_title(movietitle, set_count=100):
    user_id_list = title2userlist(movietitle)
    prediction_results = list()
    for user_id in user_id_list:
        if len(prediction_results) == set_count:
            break
        temp_list = user_rating_prediction(user_id, movietitle, result)['title'].values
        if len(temp_list) == 10:
            prediction_results.append(temp_list)
    return prediction_results

## Alien 영화를 좋아하는 유저 100명이 좋아할 만한 영화의 목록을 출력

In [30]:
final_result = get_recommendation_by_title('Alien')
final_result

  This is separate from the ipykernel package so we can avoid doing imports until


[array(['Her', 'Inside Out', 'Gravity',
        'Captain America: The Winter Soldier', 'The Wolf of Wall Street',
        'Life of Pi', 'X-Men: Days of Future Past', 'The Avengers',
        'Frozen', 'Trust'], dtype=object),
 array(['Underclassman', 'My Soul to Take', 'The Girl on the Train',
        'The Joneses', 'Yogi Bear', "Mo' Better Blues", 'Julia',
        'The City of Your Final Destination', 'The Virginity Hit',
        'How Green Was My Valley'], dtype=object),
 array(['I Love You, Beth Cooper', 'Wild Target', 'The Joneses',
        'Yogi Bear', "Mo' Better Blues", 'Julia',
        'The City of Your Final Destination', 'The Virginity Hit',
        'How Green Was My Valley', 'Bathing Beauty'], dtype=object),
 array(['The Color Purple', 'Lone Star', 'The American President',
        'Bathing Beauty', 'Perrier’s Bounty', 'The Girl on the Train',
        'The Joneses', 'Yogi Bear', "Mo' Better Blues", 'Julia'],
       dtype=object),
 array(['Billy Elliot', "Recess: School's Out"