In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Python용 데이터분석 라이브러리
# Excel과 같이, 행과 열로 구성된 데이터 객체를 다룸
 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links.csv


# Metadata 정제하기

In [2]:
# pandas로 데이터 읽어오기
meta = pd.read_csv("/kaggle/input/the-movies-dataset/movies_metadata.csv")

# 사용하고 싶은 데이터만 뽑기 (metadata에서 필요한 열(column)만 추려내기)
meta = meta[['id', 'original_title', 'original_language', 'genres']]
print(meta)

           id               original_title original_language  \
0         862                    Toy Story                en   
1        8844                      Jumanji                en   
2       15602             Grumpier Old Men                en   
3       31357            Waiting to Exhale                en   
4       11862  Father of the Bride Part II                en   
...       ...                          ...               ...   
45461  439050                      رگ خواب                fa   
45462  111109          Siglo ng Pagluluwal                tl   
45463   67758                     Betrayal                en   
45464  227506          Satana likuyushchiy                en   
45465  461257                     Queerama                en   

                                                  genres  
0      [{'id': 16, 'name': 'Animation'}, {'id': 35, '...  
1      [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...  
2      [{'id': 10749, 'name': 'Romance'}, {'id': 35, .

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# rename - column name 바꾸기(바꿀 것만 넣어주기) (열(column) 이름 변경)
# columns 파라미터를 딕셔너리 형태로 넣어주기
meta = meta.rename(columns={'id':'movieId',
                            'original_title': 'title',
                            'original_language': 'language'})

print(meta)

      movieId                        title language  \
0         862                    Toy Story       en   
1        8844                      Jumanji       en   
2       15602             Grumpier Old Men       en   
3       31357            Waiting to Exhale       en   
4       11862  Father of the Bride Part II       en   
...       ...                          ...      ...   
45461  439050                      رگ خواب       fa   
45462  111109          Siglo ng Pagluluwal       tl   
45463   67758                     Betrayal       en   
45464  227506          Satana likuyushchiy       en   
45465  461257                     Queerama       en   

                                                  genres  
0      [{'id': 16, 'name': 'Animation'}, {'id': 35, '...  
1      [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...  
2      [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...  
3      [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...  
4                         [{'id': 35, 'name'

In [4]:
# meta['language'] == 'en' 인 것들만 추출
# loc - column 조건에 맞는 모든 row 들 출력 
meta = meta.loc[meta['language'] == 'en', :]
meta

Unnamed: 0,movieId,title,language,genres
0,862,Toy Story,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,en,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...,...,...
45459,222848,Caged Heat 3000,en,"[{'id': 878, 'name': 'Science Fiction'}]"
45460,30840,Robin Hood,en,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name..."
45463,67758,Betrayal,en,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam..."
45464,227506,Satana likuyushchiy,en,[]


In [5]:
# language 가 en 인 row를 출력하는데 그 중 column이 titler과 language 인 것만 출력
meta.loc[meta['language'] == 'en', ["title", "language"]]

Unnamed: 0,title,language
0,Toy Story,en
1,Jumanji,en
2,Grumpier Old Men,en
3,Waiting to Exhale,en
4,Father of the Bride Part II,en
...,...,...
45459,Caged Heat 3000,en
45460,Robin Hood,en
45463,Betrayal,en
45464,Satana likuyushchiy,en


In [6]:
# to_numeric 데이터 타입을 숫자(int)로 변경
meta.movieId = pd.to_numeric(meta.movieId)
meta.movieId

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45459    222848
45460     30840
45463     67758
45464    227506
45465    461257
Name: movieId, Length: 32269, dtype: int64

In [7]:
# genre의 정보가 많아 필요한 것만 사용하기 위해 수정
print(meta.genres)

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
                               ...                        
45459             [{'id': 878, 'name': 'Science Fiction'}]
45460    [{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...
45463    [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
45464                                                   []
45465                                                   []
Name: genres, Length: 32269, dtype: object


In [10]:
# eval -> 파이썬에서 사용가능한 데이터타입으로 변환시켜줌
txt = meta.genres[0]

myset = set()
for d in eval(txt):
    myset.add(d['name'])
    
print(myset)

{'Animation', 'Comedy', 'Family'}


In [11]:
# apply('함수') -> 함수를 실행하여 리턴되는 값을 넣어줌

# genre 정제하기: json string을 python set으로 변환하는 함수 구현
def str_to_set(genre):
    myset = set()
    for d in eval(genre):
        myset.add(d['name'])
    return myset

# genres를 사용하려는 값으로만 덮어씀 -> meta.genres의 모든 값에 str_to_set 적용
meta.genres = meta.genres.apply(str_to_set)

meta

Unnamed: 0,movieId,title,language,genres
0,862,Toy Story,en,"{Animation, Comedy, Family}"
1,8844,Jumanji,en,"{Adventure, Fantasy, Family}"
2,15602,Grumpier Old Men,en,"{Comedy, Romance}"
3,31357,Waiting to Exhale,en,"{Drama, Comedy, Romance}"
4,11862,Father of the Bride Part II,en,{Comedy}
...,...,...,...,...
45459,222848,Caged Heat 3000,en,{Science Fiction}
45460,30840,Robin Hood,en,"{Drama, Action, Romance}"
45463,67758,Betrayal,en,"{Drama, Action, Thriller}"
45464,227506,Satana likuyushchiy,en,{}


# Keywords 정제하기

In [12]:
keywords = pd.read_csv( '/kaggle/input/the-movies-dataset/keywords.csv' ) 

# 아래 둘은 같은 기능
# keywords['keywords']
# keywords.keywords

# keywords.keywords 에서도 필요한 정보만 뽑아오기
keywords['keywords'] = keywords['keywords'].apply(str_to_set)
keywords = keywords.rename(columns={'id':'movieId'})

keywords

Unnamed: 0,movieId,keywords
0,862,"{friends, rivalry, jealousy, boy next door, fr..."
1,8844,"{new home, recluse, disappearance, giant insec..."
2,15602,"{duringcreditsstinger, old men, fishing, best ..."
3,31357,"{single mother, interracial relationship, base..."
4,11862,"{daughter, contraception, gynecologist, mother..."
...,...,...
46414,439050,{tragic love}
46415,111109,"{artist, pinoy, play}"
46416,67758,{}
46417,227506,{}


# Keywords와 Metadata 합치기

In [13]:
# merge 하기 ( 조건 : 같은 movieId를 갖는 데이터, 값이 있는것만(inner))
# on 기준으로 how 방식으로
meta = pd.merge(meta, keywords, on='movieId', how='inner')
meta

Unnamed: 0,movieId,title,language,genres,keywords
0,862,Toy Story,en,"{Animation, Comedy, Family}","{friends, rivalry, jealousy, boy next door, fr..."
1,8844,Jumanji,en,"{Adventure, Fantasy, Family}","{new home, recluse, disappearance, giant insec..."
2,15602,Grumpier Old Men,en,"{Comedy, Romance}","{duringcreditsstinger, old men, fishing, best ..."
3,31357,Waiting to Exhale,en,"{Drama, Comedy, Romance}","{single mother, interracial relationship, base..."
4,11862,Father of the Bride Part II,en,{Comedy},"{daughter, contraception, gynecologist, mother..."
...,...,...,...,...,...
32847,222848,Caged Heat 3000,en,{Science Fiction},{}
32848,30840,Robin Hood,en,"{Drama, Action, Romance}",{}
32849,67758,Betrayal,en,"{Drama, Action, Thriller}",{}
32850,227506,Satana likuyushchiy,en,{},{}


# Jaccard Similarity 

In [14]:
# iloc -> 인덱스 번호로 위치에 해당하는거 출력
dk = meta.loc[meta.title == 'The Dark Knight'].iloc[0]
dkr = meta.loc[meta.title == 'The Dark Knight Rises'].iloc[0]

print(dk)
print(dkr)

# concat 합치기 , axis = 1이면 같은 컬럼을 묶어서 테이블 형태로
pd.concat([dk, dkr], axis=1).T

movieId                                                   155
title                                         The Dark Knight
language                                                   en
genres                       {Drama, Action, Crime, Thriller}
keywords    {batman, chaos, joker, criminal mastermind, or...
Name: 10278, dtype: object
movieId                                                 49026
title                                   The Dark Knight Rises
language                                                   en
genres                       {Crime, Action, Drama, Thriller}
keywords    {batman, villainess, catwoman, crime fighter, ...
Name: 14315, dtype: object


Unnamed: 0,movieId,title,language,genres,keywords
10278,155,The Dark Knight,en,"{Drama, Action, Crime, Thriller}","{batman, chaos, joker, criminal mastermind, or..."
14315,49026,The Dark Knight Rises,en,"{Crime, Action, Drama, Thriller}","{batman, villainess, catwoman, crime fighter, ..."


In [15]:
# Jaccard 유사도 함수 구현 및 실행

def jaccard_similarity(s1, s2):
    
    # 분모가 0이면 안되니까 조건을 걸어 체크
    # |은 합집합을 의미
    if len(s1|s2) == 0:
        return 0
    
    return len(s1&s2)/len(s1|s2) # 합집합 크기 분의 교집합 크기 


In [16]:
print(dk.genres)
print(dk.keywords)
print(dk.genres | dk.keywords) # 장르와 키워드를 합침

{'Drama', 'Action', 'Crime', 'Thriller'}
{'batman', 'chaos', 'joker', 'criminal mastermind', 'organized crime', 'sadism', 'crime fighter', 'dc comics', 'vigilante', 'based on comic', 'gotham city', 'district attorney', 'superhero', 'tragic hero', 'super powers', 'scarecrow', 'secret identity', 'super villain', 'imax'}
{'batman', 'Action', 'chaos', 'joker', 'district attorney', 'criminal mastermind', 'superhero', 'tragic hero', 'Thriller', 'organized crime', 'super powers', 'sadism', 'crime fighter', 'scarecrow', 'dc comics', 'vigilante', 'secret identity', 'super villain', 'Drama', 'Crime', 'imax', 'based on comic', 'gotham city'}


In [17]:
# 두 영화의 장르와 키워드를 합친 후 자카드 유사도를 구해보자
jaccard_similarity(dk.genres | dk.keywords, dkr.genres | dkr.keywords)

0.37142857142857144

# Rating 데이터 읽고 정제하기

In [18]:
# 평점 데이터 가져오기
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv' ) 
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [19]:
# 테이블 합치기 위해 일단 데이터 타입 확인
ratings.movieId

0           31
1         1029
2         1061
3         1129
4         1172
          ... 
99999     6268
100000    6269
100001    6365
100002    6385
100003    6565
Name: movieId, Length: 100004, dtype: int64

In [20]:
# metadata로부터 movieId, title 정보 가져와 movieId 기준으로 inner 조건으로 합치기
ratings = pd.merge(ratings, meta[['movieId', 'title']], on='movieId', how='inner')
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1371,2.5,1260759135,Rocky III
1,4,1371,4.0,949810302,Rocky III
2,7,1371,3.0,851869160,Rocky III
3,19,1371,4.0,855193404,Rocky III
4,21,1371,3.0,853852263,Rocky III
...,...,...,...,...,...
33295,652,127728,5.0,1439586990,8:46
33296,652,129009,4.0,1442690827,Love Is a Ball
33297,653,2103,3.0,948161066,Solaris
33298,659,167,4.0,836137550,K-PAX


In [21]:
# pivot table 만들기
# 어떤거 기준으로 할지 괄호 안에 적어주기
matrix = ratings.pivot_table(index= 'userId', columns='title', values='rating')
matrix

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 + 1,12 Angry Men,1408,...,Young and Innocent,Zaat,Zabriskie Point,Zapped Again!,Zardoz,Zodiac,eXistenZ,xXx,¡Three Amigos!,Мой сводный брат Франкенштейн
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,3.5,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,3.5,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,,,,,,,,,,,...,,,,,,,,,,


# Pearson Correlation Coefficient

In [22]:
def pearson_similarity(u1, u2):
    # mean()->비어있지 않은 값들의 평균을 구해줌
    # 평균값을 빼서 축을 옮겨주는 역할
    u1_c = u1 - u1.mean() 
    u2_c = u2 - u2.mean()
    
    
    denom = np.sqrt(np.sum(u1_c ** 2) * np.sum(u2_c ** 2))
    
    if denom != 0:
        return np.sum(u1_c * u2_c)/denom
    else:
        return 0
    

# 두 영화의 유사도 계산하기
dk_rating = matrix['The Dark Knight']
pn_rating = matrix['Prom Night']

pearson_similarity(dk_rating, pn_rating)

0.773565934694095

# 비슷한 영화 추천 기능 구현

In [23]:
def find_similar_movies(input_title, matrix, n, alpha): 
    # input_meta: 입력된 영화의 metadata
    input_meta = meta.loc[meta['title'] == input_title].iloc[0] 
    # input_set: 입력된 영화의 genres와 keyword의 합집합
    input_set = input_meta.genres | input_meta.keywords
    
    # result: 모든 영화마다 유사도를 계산하여 저장하기 위함
    result = []
    
    for this_title in matrix.columns: 
        
        # 입력된 영화는 유사도 계산을 하지 않고 패스
        if this_title == input_title:
            continue
            
        # this_meta: 유사도를 계산하려는 영화의 metadata
        this_meta = meta.loc[ meta[ 'title'] == this_title].iloc[0] 
        # this_set: 이 영화의 genres와 keywords의 합집합
        this_set = this_meta.genres | this_meta.keywords
        
        # pearson: 입력 영화와 이번 영화의 pearson 유사도 결과
        pearson = pearson_similarity(matrix[this_title], matrix[input_title])
        # jaccard: 입력 영화와 이번 영화의 jaccard 유사도 결과
        jaccard = jaccard_similarity(this_set, input_set)
        
        # score: pearson 점수와 jaccard 점수의 가중치 합
        score = alpha * pearson + (1 - alpha) * jaccard 
        
        # result에 계산 결과 추가
        result.append((this_title, pearson, jaccard, score))
        
    # 모든 영화에 대해 유사도 계산이 끝나면 result를 정렬
    result.sort(key=lambda r: r[3], reverse=True)
    
    # 상위 n개를 return
    return result[:n]

In [24]:
# The Dark Knight 와 비슷한 영화 10개 추천 
# 가중치 0.3 -> pearson보다 jaccard에 좀 더 가중치있게
result = find_similar_movies('The Dark Knight', matrix, 10, 0.3)
pd.DataFrame(result, columns = ['title', 'pearson', 'jaccard', 'score'])

Unnamed: 0,title,pearson,jaccard,score
0,Wild Wild West,0.773566,0.032258,0.25465
1,Prom Night,0.773566,0.022222,0.247625
2,Batman Begins,0.005015,0.292683,0.206383
3,Yamakasi - Les samouraïs des temps modernes,0.377145,0.125,0.200643
4,Blue Thunder,0.326617,0.133333,0.191318
5,Midnight in the Garden of Good and Evil,0.373841,0.111111,0.18993
6,Topaz,0.377145,0.103448,0.185557
7,Big Bad Mama,0.344649,0.107143,0.178395
8,Sneakers,0.41583,0.068966,0.173025
9,The Enforcer,0.326617,0.103448,0.170399
