# Movie Recommendation by Pearson's R correlations

## import library

In [34]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, accuracy
from sklearn.model_selection import cross_validate, train_test_split

## import files

In [35]:
# Load the raw user/movie ratings table from the local data directory.
ratings_df = pd.read_csv('data/ratings.csv')

In [36]:
# The timestamp column is not needed for the rating analysis that follows.
# errors='ignore' keeps this cell safe to re-run after the column has
# already been removed (a plain drop would raise KeyError the second time).
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')

In [39]:
# Sanity check: (rows, columns) of the ratings table at this point.
ratings_df.shape

(26024289, 3)

In [40]:
# Preview the first rows to confirm the remaining columns.
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [41]:
# Load the id-mapping table that links each movieId to its tmdbId.
links_df = pd.read_csv('data/links.csv')

In [42]:
# Only the movieId <-> tmdbId mapping is used; errors='ignore' keeps the
# cell re-runnable once imdbId has already been dropped.
links_df = links_df.drop(columns='imdbId', errors='ignore')
links_df.head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [43]:
# Load the TMDB movie metadata; its 'id' and 'title' columns are used to
# build the movie id list.
movies_df = pd.read_csv('data/tmdb_5000_movies.csv')

## TMDB 데이터셋 기반으로 영화 ID 리스트 생성

In [44]:
# Attach the ratings-side movieId to every TMDB title: left-join on the TMDB
# id, discard rows with any missing value, then re-index by movieId.
tmdb_titles = movies_df[['id', 'title']].set_index('id')
tmdb_to_movie = links_df.set_index('tmdbId')[['movieId']]
movieId_list = tmdb_titles.join(tmdb_to_movie).dropna()
# Unmatched rows turn the joined ids into floats, so cast back to integers.
movieId_list = movieId_list.assign(
    movieId=movieId_list['movieId'].astype('int64')
).set_index('movieId')

In [45]:
# Number of TMDB titles that could be matched to a movieId.
movieId_list.shape

(4599, 1)

In [46]:
# Preview the movieId -> title lookup table.
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## TMDB 데이터셋에 존재하지 않는 영화에 대한 평점 삭제

In [48]:
# Keep only ratings whose movie appears in the TMDB lookup table, and pin
# the column order in the same step.
in_tmdb = ratings_df['movieId'].isin(movieId_list.index)
ratings_df = ratings_df.loc[in_tmdb, ['userId', 'movieId', 'rating']]

In [49]:
# Row count after removing ratings for movies missing from the TMDB list.
ratings_df.shape

(17977131, 3)

## 적은 수의 평점을 받은 영화 삭제

In [50]:
# Number of ratings each movie received, kept as a one-column ('count') frame.
ratings_per_movie = ratings_df.groupby('movieId')['rating'].count()
df_movie_summary = ratings_per_movie.to_frame('count')
print('The number of tmdb movies in rating.csv dataset : ', df_movie_summary.shape[0])

The number of tmdb movies in rating.csv dataset :  4597


## 최소 5177개의 평점 기록이 있는 영화만 쓰도록 함.

In [51]:
# Popularity cut-off: the 80th percentile of the per-movie rating counts,
# rounded to a whole number.
count_p80 = df_movie_summary['count'].quantile(0.8)
movie_benchmark = int(round(count_p80, 0))
movie_benchmark

5177

In [52]:
# Movies rated fewer times than the cut-off are marked for removal.
below_cutoff = df_movie_summary['count'].lt(movie_benchmark)
drop_movie_list = df_movie_summary.loc[below_cutoff].index
n_dropped = len(drop_movie_list)
print("Dropped movie count : ", n_dropped)
print("Target movie count : ", len(df_movie_summary) - n_dropped)

Dropped movie count :  3677
Target movie count :  920


## 적은 수의 평점을 남긴 유저 삭제를 위한 기준점 설정

In [53]:
# Number of ratings each user submitted, kept as a one-column ('count') frame.
ratings_per_user = ratings_df.groupby('userId')['rating'].count()
df_user_summary = ratings_per_user.to_frame('count')
df_user_summary.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,23
2,14
3,7
4,50
5,17


## 280개 이상의 리뷰를 남긴 유저의 데이터만을 사용하도록 한다.

In [54]:
# Activity cut-off: the 95th percentile of the per-user rating counts,
# rounded to a whole number. Users below it are marked for removal.
count_p95 = df_user_summary['count'].quantile(0.95)
user_benchmark = int(round(count_p95, 0))
too_few_ratings = df_user_summary['count'].lt(user_benchmark)
drop_user_list = df_user_summary.loc[too_few_ratings].index
user_benchmark

280

## 앞서 설정한 기준점을 바탕으로 평점 데이터셋 가공

In [55]:
# Apply both filters in one pass: keep a rating only when neither its movie
# nor its user is on a drop list.
keep_movie = ~ratings_df['movieId'].isin(drop_movie_list)
keep_user = ~ratings_df['userId'].isin(drop_user_list)
ratings_df = ratings_df[keep_movie & keep_user]

## 평점 데이터셋 가공 후의 모습

In [56]:
# Row count after the movie and user filters.
ratings_df.shape

(4488072, 3)

In [57]:
# Preview the filtered ratings; the original row labels are retained.
ratings_df.head()

Unnamed: 0,userId,movieId,rating
1710,24,1,4.0
1713,24,16,3.0
1714,24,17,3.0
1715,24,21,4.0
1716,24,25,3.0


## 데이터 전처리 후 ratings dataframe에 남은 영화 종류의 총 수

In [58]:
# Number of distinct TMDB movies still present after data cleaning.
ratings_df['movieId'].nunique()

920

In [61]:
# Data example: print every 500,000th row as a spread-out sample.
print(ratings_df.iloc[::500000, :])

          userId  movieId  rating
1710          24        1     4.0
2868550    29887     2599     4.5
5806126    59698    91500     2.0
8740001    90150     2948     3.0
11699629  121323     3006     5.0
14591950  151662     6874     4.0
17420662  180783    36529     3.5
20290065  210853    68954     3.5
23179335  240551     8533     3.0


In [62]:
# Re-check the movieId -> title lookup before joining titles onto ratings.
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## userId = 231이 과거에 평점 5점을 준 영화들

In [67]:
# Titles that user 231 rated 5: filter the ratings, then look the titles up
# by movieId.
is_target_user = ratings_df['userId'] == 231
is_top_rating = ratings_df['rating'] == 5
df_1 = (
    ratings_df[is_target_user & is_top_rating]
    .set_index('movieId')
    .join(movieId_list)['title']
)
print(df_1)

movieId
110                                             Braveheart
260                                              Star Wars
318                               The Shawshank Redemption
527                                       Schindler's List
1196                               The Empire Strikes Back
1198                               Raiders of the Lost Ark
1259                                           Stand by Me
2028                                   Saving Private Ryan
3578                                             Gladiator
4993     The Lord of the Rings: The Fellowship of the Ring
33166                                                Crash
60684                                             Watchmen
Name: title, dtype: object


## ratings dataframe을 이용하여 trainset 생성

In [64]:
# Reader() defaults to a (1, 5) rating scale, but the data contains half-star
# ratings, so take the scale from the data itself. Surprise uses the scale to
# clip predicted ratings.
rating_scale = (ratings_df['rating'].min(), ratings_df['rating'].max())
reader = Reader(rating_scale=rating_scale)
svd = SVD()

# Candidate titles to score: every TMDB-matched movie that was not dropped by
# the popularity filter. reset_index() already returns a new frame, so
# movieId_list itself is left untouched.
user_1 = movieId_list.reset_index()
user_1 = user_1[~user_1['movieId'].isin(drop_movie_list)]

# Train on every remaining rating (no hold-out split).
data_set = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
train_data_set = data_set.build_full_trainset()
# NOTE: the train_test_split imported at the top of the file is sklearn's. A
# hold-out split of a Surprise Dataset needs
# surprise.model_selection.train_test_split instead.

## 모델 학습

In [65]:
# Fit the SVD model on the full trainset; the %time magic reports how long it took.
%time svd.fit(train_data_set)

Wall time: 4min 22s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20a009f99e8>

## userId = 231의 평점을 예측

In [66]:
# Estimate user 231's rating for every candidate movie, then list the ten
# highest estimates.
# NOTE(review): candidates are not filtered for movies user 231 has already
# rated, so previously rated titles can appear in this list.
user_1['Estimate_Score'] = [
    svd.predict(231, movie_id).est for movie_id in user_1['movieId']
]
user_1 = user_1.drop(columns='movieId').sort_values('Estimate_Score', ascending=False)
print(user_1.head(10))

                                                 title  Estimate_Score
54   The Lord of the Rings: The Fellowship of the Ring        4.615967
260                                         The Matrix        4.592412
56       The Lord of the Rings: The Return of the King        4.555520
55               The Lord of the Rings: The Two Towers        4.533065
1                                            Star Wars        4.499314
601                            The Empire Strikes Back        4.458673
127                           The Shawshank Redemption        4.377738
602                                 Return of the Jedi        4.366863
304                                             Aliens        4.341575
161                                              Alien        4.305118
