## Import Module

In [1]:
import os
import os.path as path
import gc
import re
import math
import json

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Setting

In [5]:
pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', 100)

In [6]:
# 디렉토리 기본 경로 지정
DIR_PATH = path.join('.', 'kaggle-the-movies-dataset')
DIR_SAVE_PATH = path.join('.', 'kaggle-the-movies-dataset', 'output')

print(DIR_PATH)
print(DIR_SAVE_PATH)

.\kaggle-the-movies-dataset
.\kaggle-the-movies-dataset\output


### 데이터 전처리

In [7]:
# movie 경로
path_movie = path.join(DIR_PATH, 'movies_metadata.csv')
print(path_movie)

.\kaggle-the-movies-dataset\movies_metadata.csv


In [8]:
# 데이터 로드 후 확인
movie_data = pd.read_csv(path_movie, low_memory=False)
# 일단 언어가 영어인 영화 자료만 활용
movie_data = movie_data.loc[movie_data['original_language'] == 'en']
print(movie_data.shape)
movie_data.head()

(32269, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
movie_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [10]:
# 오로지 장르와 키워드 기반 유사도를 측정할 계획이므로 여기선 장르 정보만 추출하면 됨
movie_data = movie_data[['id', 'original_language', 'title', 'genres']]
movie_data.head()

Unnamed: 0,id,original_language,title,genres
0,862,en,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,en,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,en,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,en,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,en,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [11]:
# keywords 경로
path_keyword = path.join(DIR_PATH, 'keywords.csv')
print(path_keyword)

.\kaggle-the-movies-dataset\keywords.csv


In [12]:
# 데이터 로드 후 확인
movie_keyword = pd.read_csv(path_keyword, low_memory=False)
print(movie_keyword.shape)
movie_keyword.head()

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [13]:
# 장르 데이터와 키워드 데이터를 merge
# id를 기준으로 두 df를 결합
movie_data = movie_data.astype({'id':'int'})
movie_keyword = movie_keyword.astype({'id':'int'})
print(movie_data['id'].dtypes, movie_keyword['id'].dtypes)

movie_data = pd.merge(movie_data, movie_keyword, on='id')
print(movie_data.shape)
movie_data.head()

int32 int32
(32852, 5)


Unnamed: 0,id,original_language,title,genres,keywords
0,862,en,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,en,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,en,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,en,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,en,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


(+추가)

id에 중복값이 존재할수 있다는 사실을 확인했다...

고유값처럼 보여도 이상치가 있는지 없는지 잘 확인해보도록 하자

In [14]:
movie_data['id'].value_counts()

159849    6
11115     4
152795    4
15028     4
4912      4
         ..
14447     1
114390    1
34283     1
13394     1
461257    1
Name: id, Length: 32251, dtype: int64

In [15]:
movie_data.drop_duplicates(['id'], keep='first', inplace=True, ignore_index=True)
movie_data['id'].value_counts()

862       1
110123    1
83429     1
127668    1
126757    1
         ..
45838     1
13793     1
15026     1
13056     1
461257    1
Name: id, Length: 32251, dtype: int64

In [16]:
movie_data.loc[movie_data['id']==23305]

Unnamed: 0,id,original_language,title,genres,keywords
6992,23305,en,The Warrior,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",[]


### 장르와 키워드 전처리

장르와 키워드의 구조를 확인 하면 list 내부에 dict으로 구성되어 있는 것을 확인할 수 있다.

이런 경우를 해결하기 위해서 ast의 literal_eval을 사용

> **ast란?**
> 
> 파이썬 추상 구문 문법의 트리를 처리하는 데 도움을 주는 모듈.  
> => 텍스트로 된 파이썬 추상 구문을 파이썬 객체로 바꿔준다.

In [17]:
# literal_eval로 장르를 파이썬 객체로 변환 후 id를 제거, name정보만 추출
movie_data['genres'] = movie_data['genres'].apply(literal_eval)
movie_data['genres'] = movie_data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : ", ".join(x))

In [18]:
# literal_eval로 키워드를 파이썬 객체로 변환 후 id를 제거, name정보만 추출
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : ", ".join(x))

In [19]:
movie_data.head()

Unnamed: 0,id,original_language,title,genres,keywords
0,862,en,Toy Story,"Animation, Comedy, Family","jealousy, toy, boy, friendship, friends, rival..."
1,8844,en,Jumanji,"Adventure, Fantasy, Family","board game, disappearance, based on children's..."
2,15602,en,Grumpier Old Men,"Romance, Comedy","fishing, best friend, duringcreditsstinger, ol..."
3,31357,en,Waiting to Exhale,"Comedy, Drama, Romance","based on novel, interracial relationship, sing..."
4,11862,en,Father of the Bride Part II,Comedy,"baby, midlife crisis, confidence, aging, daugh..."


### TF-IDF 벡터화

전처리한 데이터를 TF-IDF 방법을 이용해 벡터로 변환

장르와 키워드를 구분없이 하나로 합친 뒤 tfidf vector로 제작

In [20]:
tfidf_vector = TfidfVectorizer()
tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres'] + ", " + movie_data['keywords']).toarray()
tfidf_matrix_feature = tfidf_vector.get_feature_names_out()

In [21]:
tfidf_matrix.shape

(32251, 11437)

In [22]:
tfidf_matrix = pd.DataFrame(tfidf_matrix, columns=tfidf_matrix_feature, index = movie_data.title)
print(tfidf_matrix.shape)
tfidf_matrix.head()

(32251, 11437)


Unnamed: 0_level_0,077,10,11,13,1500s,15th,16th,17th,1812,18th,1900s,1910s,1917,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,1992,1995,19th,1st,2000,2001,2002,2079,20th,21st,230,25th,2nd,360,3d,500,51,60s,66,70s,80,95,aaron,abandoned,abandonment,abba,abbess,abc,abdication,...,youtube,youtuber,yucatec,yugo,yugoslavia,yukon,yun,yuppie,zagreb,zaire,zanzibar,zealand,zealot,zealous,zebra,zeit,zeitgeist,zen,zeppelin,zero,zeus,ziegfeld,zimbabwe,zinnia,zionism,zip,zither,zodiac,zombie,zombification,zone,zoo,zookeeper,zoom,zoophilia,zorro,zulu,zurich,øverste,żółty,βάφτηκε,γη,κόκκινο,το,χώμα,миньоны,卧底肥妈,绝地奶霸,自然界大事件,超级妈妈
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
print(tfidf_matrix.dtypes)

tfidf_matrix = tfidf_matrix.astype(np.float16)
print(tfidf_matrix.dtypes)

077        float64
10         float64
11         float64
13         float64
1500s      float64
            ...   
миньоны    float64
卧底肥妈       float64
绝地奶霸       float64
自然界大事件     float64
超级妈妈       float64
Length: 11437, dtype: object
077        float16
10         float16
11         float16
13         float16
1500s      float16
            ...   
миньоны    float16
卧底肥妈       float16
绝地奶霸       float16
自然界大事件     float16
超级妈妈       float16
Length: 11437, dtype: object


### 유사도 계산

만들어진 tf-idf vector를 코사인 유사도를 활용해서 유사도 값을 계산

영화 개수(n)만큼 n x n의 matirx 형태가 나오게 된다.

In [24]:
%%time
cosine_sim = cosine_similarity(tfidf_matrix)

CPU times: total: 15min 2s
Wall time: 1min 46s


In [25]:
print(cosine_sim.shape)
print(cosine_sim.dtype)

cosine_sim = cosine_sim.astype(np.float16)
print(cosine_sim.dtype)

cosine_sim

(32251, 32251)
float64
float16


array([[1.    , 0.0415, 0.0087, ..., 0.    , 0.    , 0.    ],
       [0.0415, 1.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.0087, 0.    , 1.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 1.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ]],
      dtype=float16)

In [26]:
# 제목 검색 뿐만 아니라 id로도 검색할 수 있도록 행과 열 중 하나를 title, 다른 하나를 id로 지정

cosine_sim_df = pd.DataFrame(cosine_sim, index = movie_data.id, columns = movie_data.title, dtype=np.float16)
print(cosine_sim_df.shape)
cosine_sim_df.head()

(32251, 32251)


title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,The American President,Dracula: Dead and Loving It,Balto,Nixon,Cutthroat Island,Casino,Sense and Sensibility,Four Rooms,Ace Ventura: When Nature Calls,Money Train,Get Shorty,Copycat,Assassins,Powder,Leaving Las Vegas,Othello,Now and Then,Persuasion,Dangerous Minds,Twelve Monkeys,Babe,Carrington,Dead Man Walking,Across the Sea of Time,It Takes Two,Clueless,"Cry, the Beloved Country",Richard III,Dead Presidents,Restoration,Mortal Kombat,To Die For,How To Make An American Quilt,Se7en,Pocahontas,When Night Is Falling,The Usual Suspects,Guardian Angel,Mighty Aphrodite,The Big Green,...,Take Me,A Death in the Gunj,Black Sun,The Incredible Jessica James,It Stains the Sands Red,Opus II,Opus III,Bloodletting,Can't Buy My Love,Hopeless Romantic,The Sparrow's Fluttering,Savages,Swing,Pro Lyuboff,Kuka,Dead Birds,The Hunters,"Whiffles, Cubic Artist",Cop and a Half: New Recruit,Dyketactics,Arabian Nights,The Fortunes and Misfortunes of Moll Flanders,An American Vampire Story,The Sublet,Fit to Kill,TechnoCalyps,Starquest II,Rivers of Sand,Altar of Fire,The Wonders of Aladdin,Phobos. Fear Kills,The Final Storm,In a Heartbeat,Jungle Woman,To Be Fat Like Me,Cadet Kelly,The Scheming Gambler's Paradise,The Hilarious Posters,The Devilish Tenant,Pooh's Heffalump Halloween Movie,Deep Hearts,The Morning After,House of Horrors,Shadow of the Blair Witch,The Burkittsville 7,Caged Heat 3000,Robin Hood,Betrayal,Satan Triumphant,Queerama
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
862,1.0,0.041504,0.008698,0.006927,0.005585,0.0,0.006454,0.059235,0.0,0.0,0.037933,0.014008,0.027191,0.0,0.0,0.0,0.026794,0.027878,0.008606,0.054291,0.005302,0.0,0.035736,0.073364,0.0,0.154297,0.054626,0.0,0.0,0.0,0.014992,0.0,0.006416,0.051514,0.033875,0.006371,0.0,0.0,0.0,0.055511,0.034943,0.033386,0.011421,0.0,0.041473,0.0,0.0,0.0,0.004833,0.054077,...,0.037567,0.0,0.0,0.041016,0.010063,0.126343,0.0,0.0,0.0,0.0,0.0,0.051056,0.0,0.0,0.0,0.035767,0.0,0.066833,0.035461,0.0,0.063599,0.036407,0.038849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080078,0.0,0.018723,0.017563,0.0,0.011032,0.012192,0.051758,0.0,0.051056,0.028244,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8844,0.041504,1.0,0.0,0.065063,0.0,0.0,0.0,0.165771,0.028366,0.011482,0.042725,0.0,0.030426,0.0,0.024475,0.0,0.043976,0.031403,0.0289,0.067505,0.049774,0.0,0.013527,0.014381,0.0,0.0,0.01918,0.0,0.0,0.0,0.034119,0.0,0.008911,0.144165,0.033112,0.0,0.0,0.0,0.0,0.0,0.130859,0.065063,0.015854,0.0,0.041504,0.0,0.0,0.0,0.0,0.093567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049591,0.0,0.0,0.034668,0.0,0.205444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038055,0.043274,0.025986,0.0,0.0,0.04837,0.053467,0.028717,0.0,0.0,0.039185,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15602,0.008698,0.0,1.0,0.035828,0.010872,0.0,0.033356,0.0,0.0,0.0,0.036865,0.027283,0.0,0.0,0.0,0.0,0.015091,0.010208,0.016754,0.010971,0.010323,0.0,0.0,0.0,0.012657,0.0,0.011314,0.157715,0.0,0.0,0.008644,0.024719,0.0,0.0,0.051819,0.032959,0.0,0.0,0.0,0.021637,0.0,0.012215,0.015472,0.0,0.008339,0.042908,0.0,0.0,0.024994,0.01828,...,0.07312,0.0,0.0,0.212036,0.019608,0.0,0.0,0.0,0.043518,0.070801,0.139893,0.099426,0.106812,0.139893,0.139893,0.0,0.0,0.130127,0.020432,0.0,0.0,0.188232,0.075623,0.0,0.023376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05954,0.0,0.0,0.03421,0.0,0.021484,0.023743,0.0,0.0,0.099426,0.0,0.0,0.0,0.0,0.106812,0.0,0.0,0.0
31357,0.006927,0.065063,0.035828,1.0,0.093445,0.003817,0.06366,0.027527,0.0,0.0,0.037292,0.021729,0.0,0.007092,0.0,0.008804,0.105225,0.008133,0.062744,0.041077,0.102051,0.006474,0.0,0.022461,0.014435,0.010895,0.015442,0.02594,0.007538,0.0,0.011795,0.028198,0.002985,0.023956,0.04129,0.033295,0.016922,0.011642,0.005661,0.024673,0.056335,0.016678,0.017654,0.0,0.051514,0.04892,0.006626,0.03772,0.065552,0.014565,...,0.058258,0.0,0.0,0.168945,0.021194,0.0,0.0,0.0,0.034668,0.056427,0.159546,0.135742,0.121826,0.159546,0.159546,0.0,0.0,0.103699,0.016281,0.009209,0.029572,0.190308,0.060272,0.027954,0.018631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047455,0.010933,0.008705,0.027252,0.0,0.01712,0.018921,0.0,0.0,0.135742,0.0,0.0,0.0,0.0,0.121826,0.03772,0.0,0.0
11862,0.005585,0.0,0.010872,0.093445,1.0,0.0,0.037933,0.0,0.0,0.0,0.008919,0.017517,0.0,0.0,0.0,0.0,0.0,0.006554,0.050568,0.033112,0.00663,0.0,0.0,0.015175,0.0,0.0,0.145874,0.0,0.0,0.0,0.005547,0.0,0.0,0.0,0.012535,0.007965,0.0,0.0,0.0,0.0,0.0,0.007843,0.0,0.0,0.011971,0.0,0.0,0.0,0.042847,0.011742,...,0.046967,0.0,0.0,0.05127,0.012581,0.0,0.0,0.0,0.0,0.0,0.0,0.063843,0.0,0.0,0.0,0.0,0.0,0.083557,0.013115,0.0,0.0,0.045532,0.048553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014404,0.0,0.0,0.021957,0.0,0.013786,0.015244,0.0,0.0,0.063843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
cosine_sim_df.tail()

title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,The American President,Dracula: Dead and Loving It,Balto,Nixon,Cutthroat Island,Casino,Sense and Sensibility,Four Rooms,Ace Ventura: When Nature Calls,Money Train,Get Shorty,Copycat,Assassins,Powder,Leaving Las Vegas,Othello,Now and Then,Persuasion,Dangerous Minds,Twelve Monkeys,Babe,Carrington,Dead Man Walking,Across the Sea of Time,It Takes Two,Clueless,"Cry, the Beloved Country",Richard III,Dead Presidents,Restoration,Mortal Kombat,To Die For,How To Make An American Quilt,Se7en,Pocahontas,When Night Is Falling,The Usual Suspects,Guardian Angel,Mighty Aphrodite,The Big Green,...,Take Me,A Death in the Gunj,Black Sun,The Incredible Jessica James,It Stains the Sands Red,Opus II,Opus III,Bloodletting,Can't Buy My Love,Hopeless Romantic,The Sparrow's Fluttering,Savages,Swing,Pro Lyuboff,Kuka,Dead Birds,The Hunters,"Whiffles, Cubic Artist",Cop and a Half: New Recruit,Dyketactics,Arabian Nights,The Fortunes and Misfortunes of Moll Flanders,An American Vampire Story,The Sublet,Fit to Kill,TechnoCalyps,Starquest II,Rivers of Sand,Altar of Fire,The Wonders of Aladdin,Phobos. Fear Kills,The Final Storm,In a Heartbeat,Jungle Woman,To Be Fat Like Me,Cadet Kelly,The Scheming Gambler's Paradise,The Hilarious Posters,The Devilish Tenant,Pooh's Heffalump Halloween Movie,Deep Hearts,The Morning After,House of Horrors,Shadow of the Blair Witch,The Burkittsville 7,Caged Heat 3000,Robin Hood,Betrayal,Satan Triumphant,Queerama
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
222848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110535,0.0,0.0,0.0,0.0,0.0,0.157471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328857,0.448242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30840,0.0,0.0,0.106812,0.121826,0.0,0.061676,0.079285,0.445068,0.106323,0.04306,0.125366,0.0,0.0,0.033936,0.091736,0.042114,0.107117,0.0,0.0,0.070923,0.0,0.030975,0.05069,0.01738,0.069031,0.052124,0.030762,0.124023,0.036041,0.0,0.023483,0.134888,0.014282,0.114624,0.123169,0.112,0.080933,0.055695,0.091431,0.118042,0.067566,0.033234,0.084412,0.0,0.045471,0.234009,0.031708,0.609375,0.059387,0.118225,...,0.0,0.0,0.0,0.503906,0.026642,0.0,0.0,0.0,0.165894,0.269775,0.763184,0.270264,1.0,0.763184,0.763184,0.0,0.0,0.0,0.13208,0.044037,0.141479,0.640137,0.0,0.133789,0.18042,0.0,0.0,0.0,0.0,0.0,0.0,0.297852,0.141479,0.052277,0.041656,0.0,0.0,0.0,0.0,0.0,0.0,0.270264,0.0,0.0,0.0,0.0,1.0,0.609375,0.0,0.0
67758,0.0,0.0,0.0,0.03772,0.0,0.101196,0.0,0.45752,0.201782,0.081726,0.038818,0.0,0.0,0.034882,0.094299,0.043274,0.050964,0.0,0.0,0.072937,0.058136,0.095947,0.096252,0.053833,0.021362,0.053558,0.031616,0.038391,0.037048,0.051239,0.024139,0.041779,0.014679,0.117859,0.0,0.034668,0.083191,0.057251,0.093994,0.03653,0.069458,0.102905,0.026138,0.042145,0.014076,0.072449,0.098267,1.0,0.0,0.121521,...,0.0,0.0,0.0,0.0,0.02739,0.0,0.0,0.0,0.0,0.0,0.236328,0.277832,0.609375,0.236328,0.236328,0.0,0.0,0.0,0.135742,0.045258,0.145386,0.19812,0.0,0.414307,0.17334,0.0,0.145874,0.0,0.0,0.0,0.397461,0.56543,0.0,0.053741,0.042816,0.0,0.0,0.0,0.0,0.0,0.0,0.277832,0.071655,0.0,0.0,0.0,0.609375,1.0,0.0,0.0
227506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
461257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
print(cosine_sim_df.dtypes)
cosine_sim_df.info(memory_usage='deep')

title
Toy Story                      float16
Jumanji                        float16
Grumpier Old Men               float16
Waiting to Exhale              float16
Father of the Bride Part II    float16
                                ...   
Caged Heat 3000                float16
Robin Hood                     float16
Betrayal                       float16
Satan Triumphant               float16
Queerama                       float16
Length: 32251, dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 32251 entries, 862 to 461257
Columns: 32251 entries, Toy Story to Queerama
dtypes: float16(32251)
memory usage: 1.9 GB


### Content Based Recommend

Content Based Recommend 결과를 뽑아내기 위한 메소드를 제작

target title(조회할 영화 제목)에 따라 코사인 유사도를 구한 matrix에서 유사도를 가져옴

- 유사도 데이터 중 가장 유사도 값이 큰 데이터를 가져옴
- 가져올 때 top k개를 가져옴
- 해당 추천 값 출력

(+추가)

기존의 genre_recommendations 메소드는 영화 제목을 기준으로 검색을 시도하는 메소드로 사소한 오류가 있음을 확인했다.

바로 중복된 제목을 가진 경우 메소드 수행시 인덱스가 전체 색인 범위를 넘어갈 수 있다는 것인데, 다음 케이스를 보자.

In [29]:
print(cosine_sim_df.loc[:, 'Robin Hood'].values.shape)
cosine_sim_df.loc[:, 'Robin Hood']

(32251, 4)


title,Robin Hood,Robin Hood,Robin Hood,Robin Hood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
862,0.041260,0.000000,0.000000,0.000000
8844,0.014297,0.000000,0.016708,0.000000
15602,0.000000,0.106812,0.000000,0.106812
31357,0.000000,0.121826,0.000000,0.121826
11862,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...
222848,0.000000,0.000000,0.000000,0.000000
30840,0.000000,1.000000,0.062622,1.000000
67758,0.000000,0.609375,0.064392,0.609375
227506,0.000000,0.000000,0.000000,0.000000


영화의 제목이 'Robin Hood'와 같은 예시를 보면 같은 이름을 가진 영화가 4개나 나오는 것을 볼 수 있다.

이럴 경우 shape도 (32852, 4)와 같은 2차원 배열로 나오기 때문에 reshape(1, -1)로 1차원 매트릭스로 변환할 때 최대 인덱스를 초과하는 길이의 1차원 배열이 생성되며, 인덱스를 초과하는 값을 반환했을 때 에러가 발생하게 된다...

해당 문제를 해결하기 위하여 아이디 기반 검색(search by id)을 도입하고, title의 경우 별도의 title_idx 파라미터를 추가해 선택한 idx의 행에 해당하는 영화 추천 정보만 출력하도록 수정한다.

In [30]:
# 영화 제목 기반 추천 알고리즘
# title_index를 추가해서 중복된 이름을 가진 경우 title_idx에 해당하는 행만 출력하도록 수정

def genre_recommendations_by_title(target_title, matrix, items, k=10, title_index=0):
    # 영화 타이틀을 검색했을 때 1개만 있는 경우와 아닌 경우를 구분
    if matrix.loc[:, target_title].values.ndim == 1:
        recom_idx = matrix.loc[:, target_title].values.reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
    else:
        recom_idx = matrix.loc[:, target_title].values[:, title_index].reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
    
    recom_id = items.iloc[recom_idx, :].id.values
    recom_title = items.iloc[recom_idx, :].title.values
    recom_genre = items.iloc[recom_idx, :].genres.values
    target_id_list = np.full(len(range(k)), items[items.title == target_title].id.values[title_index])
    target_title_list = np.full(len(range(k)), target_title)
    target_genre_list = np.full(len(range(k)), items[items.title == target_title].genres.values[title_index])
    
    d = {
        'target_id': target_id_list,
        'target_title': target_title_list,
        'target_genre': target_genre_list,
        'recom_id'    : recom_id,
        'recom_title' : recom_title,
        'recom_genre' : recom_genre
    }
    
    return pd.DataFrame(d)

In [31]:
genre_recommendations_by_title('The Dark Knight Rises', cosine_sim_df, movie_data)

Unnamed: 0,target_id,target_title,target_genre,recom_id,recom_title,recom_genre
0,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",155,The Dark Knight,"Drama, Action, Crime, Thriller"
1,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",33843,The Burglar,"Crime, Drama"
2,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",272,Batman Begins,"Action, Crime, Drama"
3,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",415,Batman & Robin,"Action, Crime, Fantasy"
4,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",268,Batman,"Fantasy, Action"
5,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",200654,Raffles,"Adventure, Comedy, Crime, Drama, History, Roma..."
6,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",44004,Hero at Large,"Action, Comedy, Drama"
7,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",76420,DC Showcase: Catwoman,"Action, Adventure, Animation, Science Fiction"
8,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",401650,DC Super Hero Girls: Hero of the Year,Animation
9,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",364,Batman Returns,"Action, Fantasy"


In [32]:
genre_recommendations_by_title('Robin Hood', cosine_sim_df, movie_data)

Unnamed: 0,target_id,target_title,target_genre,recom_id,recom_title,recom_genre
0,11886,Robin Hood,"Animation, Family",115972,Sword of Sherwood Forest,"Fantasy, Adventure"
1,11886,Robin Hood,"Animation, Family",3171,Bambi Meets Godzilla,"Animation, Comedy"
2,11886,Robin Hood,"Animation, Family",113175,Robin of Locksley,"Adventure, Drama"
3,11886,Robin Hood,"Animation, Family",10808,Dr. Dolittle 2,"Comedy, Family, Romance, Fantasy"
4,11886,Robin Hood,"Animation, Family",10010,Brother Bear 2,"Adventure, Animation, Family"
5,11886,Robin Hood,"Animation, Family",20662,Robin Hood,"Action, Adventure"
6,11886,Robin Hood,"Animation, Family",14175,Valiant,"Animation, Family, Adventure"
7,11886,Robin Hood,"Animation, Family",33539,Once Upon a Forest,"Animation, Family, Adventure"
8,11886,Robin Hood,"Animation, Family",33371,The Little Bear Movie,"Animation, Family"
9,11886,Robin Hood,"Animation, Family",339669,Blinky Bill the Movie,"Family, Animation"


In [33]:
genre_recommendations_by_title('Robin Hood', cosine_sim_df, movie_data, title_index=1)

Unnamed: 0,target_id,target_title,target_genre,recom_id,recom_title,recom_genre
0,71066,Robin Hood,"Drama, Action, Romance",30840,Robin Hood,"Drama, Action, Romance"
1,71066,Robin Hood,"Drama, Action, Romance",61966,Swing,"Romance, Drama, Action"
2,71066,Robin Hood,"Drama, Action, Romance",71066,Robin Hood,"Drama, Action, Romance"
3,71066,Robin Hood,"Drama, Action, Romance",37737,Kites,"Drama, Action, Romance"
4,71066,Robin Hood,"Drama, Action, Romance",62643,The Charge,"Action, Drama, Romance"
5,71066,Robin Hood,"Drama, Action, Romance",41945,The Rainbow,"Action, Drama, Romance"
6,71066,Robin Hood,"Drama, Action, Romance",211275,Kachche Dhaage,"Action, Drama, Romance"
7,71066,Robin Hood,"Drama, Action, Romance",166255,Jungle,"Romance, Action, Drama"
8,71066,Robin Hood,"Drama, Action, Romance",335205,Sorry I Love You,"Romance, Action, Drama"
9,71066,Robin Hood,"Drama, Action, Romance",76094,The Trail of the Lonesome Pine,"Action, Romance"


In [34]:
# 영화 id 기반 추천 알고리즘
def genre_recommendations_by_id(target_id, matrix, items, k=10):
    recom_idx = matrix.loc[target_id, :].values.reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
    recom_id = items.iloc[recom_idx, :].id.values
    recom_title = items.iloc[recom_idx, :].title.values
    recom_genre = items.iloc[recom_idx, :].genres.values
    target_id_list = np.full(len(range(k)), target_id)
    target_title_list = np.full(len(range(k)), items[items.id == target_id].title.values)
    target_genre_list = np.full(len(range(k)), items[items.id == target_id].genres.values)
    
    d = {
        'target_id': target_id_list,
        'target_title': target_title_list,
        'target_genre': target_genre_list,
        'recom_id'    : recom_id,
        'recom_title' : recom_title,
        'recom_genre' : recom_genre
    }
    
    return pd.DataFrame(d)

In [35]:
genre_recommendations_by_id(49026, cosine_sim_df, movie_data)

Unnamed: 0,target_id,target_title,target_genre,recom_id,recom_title,recom_genre
0,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",155,The Dark Knight,"Drama, Action, Crime, Thriller"
1,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",33843,The Burglar,"Crime, Drama"
2,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",272,Batman Begins,"Action, Crime, Drama"
3,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",415,Batman & Robin,"Action, Crime, Fantasy"
4,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",268,Batman,"Fantasy, Action"
5,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",200654,Raffles,"Adventure, Comedy, Crime, Drama, History, Roma..."
6,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",44004,Hero at Large,"Action, Comedy, Drama"
7,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",76420,DC Showcase: Catwoman,"Action, Adventure, Animation, Science Fiction"
8,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",401650,DC Super Hero Girls: Hero of the Year,Animation
9,49026,The Dark Knight Rises,"Action, Crime, Drama, Thriller",364,Batman Returns,"Action, Fantasy"


In [36]:
genre_recommendations_by_id(11886, cosine_sim_df, movie_data)

Unnamed: 0,target_id,target_title,target_genre,recom_id,recom_title,recom_genre
0,11886,Robin Hood,"Animation, Family",115972,Sword of Sherwood Forest,"Fantasy, Adventure"
1,11886,Robin Hood,"Animation, Family",3171,Bambi Meets Godzilla,"Animation, Comedy"
2,11886,Robin Hood,"Animation, Family",113175,Robin of Locksley,"Adventure, Drama"
3,11886,Robin Hood,"Animation, Family",10808,Dr. Dolittle 2,"Comedy, Family, Romance, Fantasy"
4,11886,Robin Hood,"Animation, Family",10010,Brother Bear 2,"Adventure, Animation, Family"
5,11886,Robin Hood,"Animation, Family",20662,Robin Hood,"Action, Adventure"
6,11886,Robin Hood,"Animation, Family",14175,Valiant,"Animation, Family, Adventure"
7,11886,Robin Hood,"Animation, Family",33539,Once Upon a Forest,"Animation, Family, Adventure"
8,11886,Robin Hood,"Animation, Family",33371,The Little Bear Movie,"Animation, Family"
9,11886,Robin Hood,"Animation, Family",339669,Blinky Bill the Movie,"Family, Animation"


In [37]:
# 데이터 저장 경로
path_save_movie = path.join(DIR_SAVE_PATH, 'pre_movies_metadata.csv')
print(path_save_movie)

# 파일 저장
os.makedirs(DIR_SAVE_PATH, exist_ok=True)
movie_data.to_csv(path_save_movie, index = False)

.\kaggle-the-movies-dataset\output\pre_movies_metadata.csv


In [38]:
# # path_save_movie_genre는 너무 크기 때문에 있는 그대로 저장하지 않는다.
# # 데이터 저장 경로
# path_save_movie_genre = path.join(DIR_SAVE_PATH, 'movies_genre_cosine_sim_df.pkl')
# print(path_save_movie_genre)

# # 파일 저장
# os.makedirs(DIR_SAVE_PATH, exist_ok=True)
# cosine_sim_df.to_pickle(path_save_movie_genre)

In [39]:
# 장르가 비슷한 추천 영화를 리스트 형태로 반환 메소드
# 중복되는 이름을 가진 영화가 존재할 수 있으므로 고유 값인 id를 기반으로 리스트업을 한다.

def genre_recommendation_list_by_id(target_id, matrix, items, k=10):
    try:
        recom_idx = matrix.loc[target_id, :].values.reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
        recom_id = items.iloc[recom_idx, :].id.values
        recom_title = items.iloc[recom_idx, :].title.values
    except:
        print(target_id, matrix.loc[target_id, :].values.ndim)
        print(recom_idx)
    
    recom_list = [dict(id = id, title = title) for id, title in zip(recom_id, recom_title)]
    
    return recom_list

In [40]:
movie_data['id'].value_counts()

862       1
110123    1
83429     1
127668    1
126757    1
         ..
45838     1
13793     1
15026     1
13056     1
461257    1
Name: id, Length: 32251, dtype: int64

In [41]:
# 장르 유사도 기준으로 추천 영화의 상위 5개를 출력
movie_recom = movie_data.copy()[['id', 'title']]
# movie_recom['test'] = tmp_movie_recom.apply(lambda x: x.title, axis=1)
movie_recom['recommendation'] = movie_recom.apply(lambda x: genre_recommendation_list_by_id(x.id, cosine_sim_df, movie_data, k=5), axis=1)
movie_recom

Unnamed: 0,id,title,recommendation
0,862,Toy Story,"[{'id': 256835, 'title': 'Toy Story That Time ..."
1,8844,Jumanji,"[{'id': 262788, 'title': 'The Games Maker'}, {..."
2,15602,Grumpier Old Men,"[{'id': 100529, 'title': 'Lola Versus'}, {'id'..."
3,31357,Waiting to Exhale,"[{'id': 96995, 'title': 'A Change of Heart'}, ..."
4,11862,Father of the Bride Part II,"[{'id': 180721, 'title': 'Blueberry Hill'}, {'..."
...,...,...,...
32246,222848,Caged Heat 3000,"[{'id': 293082, 'title': '10,000 Days'}, {'id'..."
32247,30840,Robin Hood,"[{'id': 30840, 'title': 'Robin Hood'}, {'id': ..."
32248,67758,Betrayal,"[{'id': 256740, 'title': 'Wicked Blood'}, {'id..."
32249,227506,Satan Triumphant,"[{'id': 43962, 'title': 'Malaya'}, {'id': 1429..."


In [44]:
# 데이터 저장 경로
path_save_movie_recom = path.join(DIR_SAVE_PATH, 'pre_movies_cbf_recom.json')
print(path_save_movie_recom)

# 파일 저장
os.makedirs(DIR_SAVE_PATH, exist_ok=True)
movie_recom.to_json(path_save_movie_recom, orient='records', indent=2)

.\kaggle-the-movies-dataset\output\pre_movies_cbf_recom.json


### 그래서 커피 추천 알고리즘에 어떻게 적용할 수 있는가?

tf-idf 기반으로 코사인 유사도를 측정한다는 것은 텍스트로 되어있는 값끼리의 유사도를 비교적 쉽게 측정할 수 있는 것으로 고려된다.

커피의 맛과 향에 대한 텍스트 테그 값을 정제해 코사인 유사도 행렬을 제작하면 비슷한 맛이나 향 정보를 통해 유사한 종류의 커피(원두, 캡슐) 상품을 추천해 줄 수 있을 것이라 생각한다. 

## 추가 정리

현재 참고하고 있는 코드는 아이템의 키워드를 중심으로 유사도를 분석하는 아이템 기반 추천 알고리즘이 되겠다.

tf-idf vector는 word embedding 방식이기 때문에 많은 sparse matrix로 생성된다.  
어느정도의 compress 작업을 통해 sparse matrix의 용량을 줄이는것도 하나의 방법이다.
그러나 메모리상에서 분석 할 때는 nxn규모의 희소행렬을 유지해야하므로 data storage 측면에서 효율을 제외하면 그리 유의미한 방법은 아니다.

연산의 규모나 시간을 줄이기 위해서 유사한 관계끼리 미리 그룹화 하는 방법을 고려해 봤으나 장단점이 뚜렷하다.

**장점**
- 그룹화(필터링)된 아이템간의 유사도 측정을 하면 되므로 연산량이 상당 부분 줄어들 수 있다.
- 기획한 서비스에 맞게 잘 필터링하면 사람이 느끼기에 뜬금 없는 아이템의 추천을 방지할 수 있다.(휴리스틱)

**단점**
- 필터링된 그룹안의 유사도는 모든 vocabulary(word token)간의 관계성(유사도, 해당 알고리즘에선 cosine_similarity)을 대표하지 않아 오차가 있다.
- 사람이 일반적으로 특정하지 못하는 데이터의 의외성을 확보하지 못한다.  
  ex) 강아지를 키우는 사람들에게 유모차를 추천해 주는 것과 같이 데이터 상으론 유의미하지만 일반적인 인싱에 벗어나는 상품 추천

### 큰 데이터를 축소하는 방법

tf-idf vector는 용량을 크게 먹는다. 이러한 용량을 줄이는 방법을 몇 개 찾아봤다.

데이터 타입을 경량화 한다. pandas는 정밀도를 위해 기본적으로 float64로 계산하나, float32또는 float16로 타입 지정해도 충분 할 거 같다.

분석에 필요한 word가 많으면 embedding해야 할 vector의 개수도 많아지므로 어느 정도 개수를 제한하는 것이 방법이다.

tf-idf vector가 아닌 다른 word embedding를 시도하는 것도 방법이다. word2vec같은 연관성을 내포한 dense vector matrix로 생성하는 방법이라던가...