# Movie Recommendation by Pearson's R Correlation

## import library

In [1]:
import pandas as pd

## import files

In [2]:
# User ratings table; its preview in this notebook shows the columns
# userId, movieId, rating and timestamp.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
ratings_df = pd.read_csv('Downloads/input/ratings.csv')

In [3]:
# ID mapping table; its tmdbId and movieId columns are used below to connect
# TMDB movie ids to the movieId values found in the ratings.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
links_df = pd.read_csv('Downloads/input/links.csv')

In [4]:
# TMDB movie metadata; the id and title columns are the ones used below.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
movies_df = pd.read_csv('Downloads/input/tmdb_5000_movies.csv')

## TMDB 데이터셋 기반으로 영화 ID 리스트 생성

In [5]:
# Build a movieId -> title lookup restricted to movies present in the TMDB file.
tmdb_titles = movies_df[['id', 'title']].set_index('id')
tmdb_to_movie_id = links_df.set_index('tmdbId')[['movieId']]

# Left-join on the TMDB id, then discard rows with a missing title or movieId.
movieId_list = (
    tmdb_titles
    .join(tmdb_to_movie_id)
    .dropna()
    # The join leaves movieId as float (because of NaN); make it integer again.
    .assign(movieId=lambda joined: joined['movieId'].astype('int64'))
    .set_index('movieId')
)

In [6]:
movieId_list.shape

(4599, 1)

In [7]:
movieId_list.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
18,Four Rooms
260,Star Wars
6377,Finding Nemo
356,Forrest Gump
2858,American Beauty


## 평점 데이터셋의 초기 모습

In [8]:
ratings_df.shape

(26024289, 4)

In [9]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


## TMDB 데이터셋에 존재하지 않는 영화에 대한 평점 삭제

In [10]:
# Keep only ratings whose movie appears in the TMDB-based lookup, and drop
# every column except userId / movieId / rating.
is_tmdb_movie = ratings_df['movieId'].isin(movieId_list.index)
ratings_df = ratings_df.loc[is_tmdb_movie, ['userId', 'movieId', 'rating']]

In [11]:
ratings_df.shape

(17977131, 3)

In [12]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
6,1,2762,4.5


## 적은 수의 평점을 받은 영화 삭제

In [13]:
# Number of ratings each movie received, as a one-column ('count') frame.
ratings_by_movie = ratings_df.groupby('movieId')['rating']
df_movie_summary = ratings_by_movie.count().to_frame('count')
df_movie_summary.head()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,66008
10,32534
11,19475
14,6807
15,3125


In [14]:
# Threshold = median (0.5 quantile) of per-movie rating counts, rounded to an int.
median_ratings_per_movie = df_movie_summary['count'].quantile(0.5)
movie_benchmark = int(round(median_ratings_per_movie, 0))
movie_benchmark

903

In [15]:
# movieIds whose rating count falls strictly below the threshold.
is_rarely_rated = df_movie_summary['count'].lt(movie_benchmark)
drop_movie_list = df_movie_summary.index[is_rarely_rated]
drop_movie_list

Int64Index([   254,    294,    387,    476,    626,    664,    715,    803,
               869,    885,
            ...
            170319, 170445, 171737, 171963, 172347, 172853, 173857, 174533,
            174591, 175115],
           dtype='int64', name='movieId', length=2298)

## 적은 수의 평점을 남긴 유저 삭제

In [16]:
# Number of ratings each user submitted, as a one-column ('count') frame.
ratings_by_user = ratings_df.groupby('userId')['rating']
df_user_summary = ratings_by_user.count().to_frame('count')
df_user_summary.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,23
2,14
3,7
4,50
5,17


In [17]:
# Threshold = median (0.5 quantile) of per-user rating counts, rounded to an int.
median_ratings_per_user = df_user_summary['count'].quantile(0.5)
user_benchmark = int(round(median_ratings_per_user, 0))
user_benchmark

23

In [18]:
# userIds whose rating count falls strictly below the threshold.
is_low_activity = df_user_summary['count'].lt(user_benchmark)
drop_user_list = df_user_summary.index[is_low_activity]
drop_user_list

Int64Index([     2,      3,      5,      6,     10,     13,     14,     17,
                18,     19,
            ...
            270876, 270878, 270880, 270881, 270882, 270883, 270888, 270889,
            270890, 270895],
           dtype='int64', name='userId', length=134043)

In [19]:
# Keep a rating only if neither its movie nor its user is on a drop list.
movie_is_kept = ~ratings_df['movieId'].isin(drop_movie_list)
user_is_kept = ~ratings_df['userId'].isin(drop_user_list)
ratings_df = ratings_df[movie_is_kept & user_is_kept]

## 평점 데이터셋 가공 후의 모습

In [20]:
ratings_df.shape

(15964404, 3)

In [21]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
6,1,2762,4.5


In [22]:
# Reshape to a userId x movieId matrix of ratings (NaN where a user did not rate a movie).
ratings_df_p = ratings_df.pivot_table(
    index='userId',
    columns='movieId',
    values='rating',
)

In [23]:
ratings_df_p.shape

(135041, 2299)

In [24]:
ratings_df_p.head()

movieId,1,10,11,14,15,16,17,18,19,20,...,142488,143385,143859,146656,148626,149406,152077,158872,159093,160438
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,4.0,,,,,,,,,,...,,,,,,,,,,
9,4.5,,,,,,,,,,...,,,,,,,,,,


## 영화 추천 함수

In [25]:
def recommend(movie_title):
    """Print the 10 movies whose ratings correlate most with ``movie_title``.

    The title is looked up in the module-level ``movieId_list`` to obtain its
    movieId. That movie's column in the module-level ``ratings_df_p`` (the
    userId x movieId pivot of ratings) is then correlated (Pearson's r, via
    ``DataFrame.corrwith``) against every column, and the ten most positively
    correlated titles are printed.

    Parameters
    ----------
    movie_title : str
        Exact title as stored in ``movieId_list['title']``. If several rows
        share the title, the first one is used.

    Raises
    ------
    IndexError
        If no row of ``movieId_list`` has this title.
    KeyError
        If the movie has no column in ``ratings_df_p``.
    """
    matching_ids = movieId_list.index[movieId_list['title'] == movie_title]
    if len(matching_ids) == 0:
        raise IndexError(
            "no movie titled {!r} found in movieId_list".format(movie_title)
        )
    i = int(matching_ids[0])
    if i not in ratings_df_p.columns:
        raise KeyError(
            "movie {!r} (movieId {}) has no column in ratings_df_p".format(movie_title, i)
        )

    print("For movie ({})".format(movie_title))
    print("- Top 10 movies recommended based on Pearsons'R correlation - ")

    target = ratings_df_p[i]
    similar_to_target = ratings_df_p.corrwith(target)

    corr_target = similar_to_target.to_frame(name='PearsonR').dropna()
    # Remove the queried movie by its id rather than by position: another movie
    # can tie with it at r == 1.0 and sort first, and the target itself may be
    # missing already if its own correlation came out NaN.
    corr_target = corr_target.drop(index=i, errors='ignore')
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    corr_target = corr_target.join(movieId_list)
    print(corr_target.head(10).to_string(index=False))

## 영화 추천 예시

In [26]:
recommend('Avatar')

For movie (Avatar)
- Top 10 movies recommended based on Pearsons'R correlation - 
PearsonR                                      title
                                                   
0.479821             Transformers: Dark of the Moon
0.477683                  What Planet Are You From?
0.471082                               Transformers
0.453935     Fantastic 4: Rise of the Silver Surfer
0.448464                              Green Lantern
0.445034        Transformers: Revenge of the Fallen
0.434122                      X-Men: The Last Stand
0.432529                                   Dinosaur
0.429435  The Hobbit: The Battle of the Five Armies
0.423874                   X-Men Origins: Wolverine
