## Import Module

In [1]:
import os
import os.path as path
import re
import math
import json

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Setting

In [5]:
pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', 100)

In [6]:
# 디렉토리 기본 경로 지정
DIR_PATH = path.join('.', 'kaggle-the-movies-dataset')
DIR_SAVE_PATH = path.join('.', 'kaggle-the-movies-dataset', 'output')

print(DIR_PATH)
print(DIR_SAVE_PATH)

.\kaggle-the-movies-dataset
.\kaggle-the-movies-dataset\output


### 데이터 전처리

In [7]:
# movie 경로
path_movie = path.join(DIR_PATH, 'movies_metadata.csv')
print(path_movie)

.\kaggle-the-movies-dataset\movies_metadata.csv


In [8]:
# 데이터 로드 후 확인
movie_data = pd.read_csv(path_movie, low_memory=False)
# 일단 언어가 영어인 영화 자료만 활용
movie_data = movie_data.loc[movie_data['original_language'] == 'en']
movie_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
movie_data.shape

(32269, 24)

In [10]:
movie_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [11]:
# 오로지 장르와 키워드 기반 유사도를 측정할 계획이므로 여기선 장르 정보만 추출하면 됨
movie_data = movie_data[['id', 'original_language', 'title', 'genres']]
movie_data.head()

Unnamed: 0,id,original_language,title,genres
0,862,en,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,en,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,en,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,en,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,en,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [12]:
# keywords 경로
path_keyword = path.join(DIR_PATH, 'keywords.csv')
print(path_keyword)

.\kaggle-the-movies-dataset\keywords.csv


In [13]:
# 데이터 로드 후 확인
movie_keyword = pd.read_csv(path_keyword, low_memory=False)
print(movie_keyword.shape)
movie_keyword.head()

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [14]:
# 장르 데이터와 키워드 데이터를 merge
# id를 기준으로 두 df를 결합
movie_data = movie_data.astype({'id':'int'})
movie_keyword = movie_keyword.astype({'id':'int'})
print(movie_data['id'].dtypes, movie_keyword['id'].dtypes)

movie_data = pd.merge(movie_data, movie_keyword, on='id')
print(movie_data.shape)
movie_data.head()

int32 int32
(32852, 5)


Unnamed: 0,id,original_language,title,genres,keywords
0,862,en,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,en,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,en,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,en,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,en,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### 장르와 키워드 전처리

장르와 키워드의 구조를 확인 하면 list 내부에 dict으로 구성되어 있는 것을 확인할 수 있다.

이런 경우를 해결하기 위해서 ast의 literal_eval을 사용

> **ast란?**
> 
> 파이썬 추상 구문 문법의 트리를 처리하는 데 도움을 주는 모듈.  
> => 텍스트로 된 파이썬 추상 구문을 파이썬 객체로 바꿔준다.

In [15]:
# literal_eval로 장르를 파이썬 객체로 변환 후 id를 제거, name정보만 추출
movie_data['genres'] = movie_data['genres'].apply(literal_eval)
movie_data['genres'] = movie_data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : ", ".join(x))

In [16]:
# literal_eval로 키워드를 파이썬 객체로 변환 후 id를 제거, name정보만 추출
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : ", ".join(x))

In [17]:
movie_data.head()

Unnamed: 0,id,original_language,title,genres,keywords
0,862,en,Toy Story,"Animation, Comedy, Family","jealousy, toy, boy, friendship, friends, rival..."
1,8844,en,Jumanji,"Adventure, Fantasy, Family","board game, disappearance, based on children's..."
2,15602,en,Grumpier Old Men,"Romance, Comedy","fishing, best friend, duringcreditsstinger, ol..."
3,31357,en,Waiting to Exhale,"Comedy, Drama, Romance","based on novel, interracial relationship, sing..."
4,11862,en,Father of the Bride Part II,Comedy,"baby, midlife crisis, confidence, aging, daugh..."


### TF-IDF 벡터화

전처리한 데이터를 TF-IDF 방법을 이용해 벡터로 변환

장르와 키워드를 구분없이 하나로 합친 뒤 tfidf vector로 제작

In [18]:
tfidf_vector = TfidfVectorizer()
tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres'] + ", " + movie_data['keywords']).toarray()
tfidf_matrix_feature = tfidf_vector.get_feature_names_out()

In [19]:
tfidf_matrix.shape

(32852, 11437)

In [20]:
tfidf_matrix = pd.DataFrame(tfidf_matrix, columns=tfidf_matrix_feature, index = movie_data.title)
print(tfidf_matrix.shape)
tfidf_matrix.head()

(32852, 11437)


Unnamed: 0_level_0,077,10,11,13,1500s,15th,16th,17th,1812,18th,1900s,1910s,1917,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,1992,1995,19th,1st,2000,2001,2002,2079,20th,21st,230,25th,2nd,360,3d,500,51,60s,66,70s,80,95,aaron,abandoned,abandonment,abba,abbess,abc,abdication,...,youtube,youtuber,yucatec,yugo,yugoslavia,yukon,yun,yuppie,zagreb,zaire,zanzibar,zealand,zealot,zealous,zebra,zeit,zeitgeist,zen,zeppelin,zero,zeus,ziegfeld,zimbabwe,zinnia,zionism,zip,zither,zodiac,zombie,zombification,zone,zoo,zookeeper,zoom,zoophilia,zorro,zulu,zurich,øverste,żółty,βάφτηκε,γη,κόκκινο,το,χώμα,миньоны,卧底肥妈,绝地奶霸,自然界大事件,超级妈妈
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 유사도 계산

만들어진 tf-idf vector를 코사인 유사도를 활용해서 유사도 값을 계산

영화 개수(n)만큼 n x n의 matirx 형태가 나오게 된다.

In [21]:
%%time
cosine_sim = cosine_similarity(tfidf_matrix)

CPU times: total: 15min 24s
Wall time: 1min 39s


In [22]:
cosine_sim.shape

(32852, 32852)

In [23]:
cosine_sim_df = pd.DataFrame(cosine_sim, index = movie_data.title, columns = movie_data.title)
print(cosine_sim_df.shape)
cosine_sim_df.head()

(32852, 32852)


title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,The American President,Dracula: Dead and Loving It,Balto,Nixon,Cutthroat Island,Casino,Sense and Sensibility,Four Rooms,Ace Ventura: When Nature Calls,Money Train,Get Shorty,Copycat,Assassins,Powder,Leaving Las Vegas,Othello,Now and Then,Persuasion,Dangerous Minds,Twelve Monkeys,Babe,Carrington,Dead Man Walking,Across the Sea of Time,It Takes Two,Clueless,"Cry, the Beloved Country",Richard III,Dead Presidents,Restoration,Mortal Kombat,To Die For,How To Make An American Quilt,Se7en,Pocahontas,When Night Is Falling,The Usual Suspects,Guardian Angel,Mighty Aphrodite,The Big Green,...,Take Me,A Death in the Gunj,Black Sun,The Incredible Jessica James,It Stains the Sands Red,Opus II,Opus III,Bloodletting,Can't Buy My Love,Hopeless Romantic,The Sparrow's Fluttering,Savages,Swing,Pro Lyuboff,Kuka,Dead Birds,The Hunters,"Whiffles, Cubic Artist",Cop and a Half: New Recruit,Dyketactics,Arabian Nights,The Fortunes and Misfortunes of Moll Flanders,An American Vampire Story,The Sublet,Fit to Kill,TechnoCalyps,Starquest II,Rivers of Sand,Altar of Fire,The Wonders of Aladdin,Phobos. Fear Kills,The Final Storm,In a Heartbeat,Jungle Woman,To Be Fat Like Me,Cadet Kelly,The Scheming Gambler's Paradise,The Hilarious Posters,The Devilish Tenant,Pooh's Heffalump Halloween Movie,Deep Hearts,The Morning After,House of Horrors,Shadow of the Blair Witch,The Burkittsville 7,Caged Heat 3000,Robin Hood,Betrayal,Satan Triumphant,Queerama
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
Toy Story,1.0,0.041569,0.008708,0.006937,0.005595,0.0,0.006456,0.059202,0.0,0.0,0.037992,0.014028,0.027154,0.0,0.0,0.0,0.026835,0.027932,0.008608,0.054428,0.005304,0.0,0.03579,0.073276,0.0,0.154472,0.054495,0.0,0.0,0.0,0.014994,0.0,0.006409,0.05149,0.033851,0.006383,0.0,0.0,0.0,0.055614,0.035008,0.033426,0.011421,0.0,0.041473,0.0,0.0,0.0,0.00484,0.054118,...,0.037583,0.0,0.0,0.041051,0.010091,0.126213,0.0,0.0,0.0,0.0,0.0,0.05111,0.0,0.0,0.0,0.03586,0.0,0.066837,0.035483,0.0,0.063561,0.036457,0.038887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08003,0.0,0.018732,0.017572,0.0,0.011035,0.012202,0.051696,0.0,0.05111,0.028298,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.041569,1.0,0.0,0.065065,0.0,0.0,0.0,0.165721,0.028302,0.011462,0.042854,0.0,0.030389,0.0,0.024436,0.0,0.043985,0.031506,0.02884,0.067754,0.049746,0.0,0.013532,0.014379,0.0,0.0,0.019181,0.0,0.0,0.0,0.034107,0.0,0.008901,0.144131,0.033083,0.0,0.0,0.0,0.0,0.0,0.130866,0.065192,0.01586,0.0,0.041478,0.0,0.0,0.0,0.0,0.093615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0498,0.0,0.0,0.034677,0.0,0.205454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038033,0.043283,0.026014,0.0,0.0,0.048323,0.053438,0.028703,0.0,0.0,0.039299,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.008708,0.0,1.0,0.035846,0.010906,0.0,0.033363,0.0,0.0,0.0,0.036866,0.027344,0.0,0.0,0.0,0.0,0.015091,0.010224,0.016779,0.010994,0.010339,0.0,0.0,0.0,0.012643,0.0,0.011334,0.158036,0.0,0.0,0.008659,0.024702,0.0,0.0,0.051823,0.032985,0.0,0.0,0.0,0.021607,0.0,0.012235,0.015472,0.0,0.008328,0.042917,0.0,0.0,0.025009,0.018344,...,0.07326,0.0,0.0,0.212122,0.019671,0.0,0.0,0.0,0.043483,0.070889,0.139978,0.099628,0.106819,0.139978,0.139978,0.0,0.0,0.130284,0.020492,0.0,0.0,0.188386,0.075801,0.0,0.02336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059577,0.0,0.0,0.034253,0.0,0.021509,0.023786,0.0,0.0,0.099628,0.0,0.0,0.0,0.0,0.106819,0.0,0.0,0.0
Waiting to Exhale,0.006937,0.065065,0.035846,1.0,0.093741,0.003806,0.063686,0.027484,0.0,0.0,0.037236,0.021784,0.0,0.007075,0.0,0.008811,0.10523,0.008145,0.062841,0.041173,0.101956,0.006449,0.0,0.022491,0.014404,0.010855,0.015441,0.025874,0.007519,0.0,0.011797,0.028143,0.002975,0.023903,0.041285,0.033316,0.016866,0.01161,0.005642,0.024617,0.056322,0.016669,0.017628,0.0,0.051502,0.048896,0.00661,0.037622,0.065678,0.014614,...,0.058363,0.0,0.0,0.168988,0.021235,0.0,0.0,0.0,0.034641,0.056474,0.15948,0.135728,0.121701,0.15948,0.15948,0.0,0.0,0.103791,0.016325,0.009184,0.029507,0.19028,0.060387,0.027899,0.01861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047462,0.010898,0.008696,0.027288,0.0,0.017135,0.018949,0.0,0.0,0.135728,0.0,0.0,0.0,0.0,0.121701,0.037622,0.0,0.0
Father of the Bride Part II,0.005595,0.0,0.010906,0.093741,1.0,0.0,0.038016,0.0,0.0,0.0,0.008936,0.01757,0.0,0.0,0.0,0.0,0.0,0.00657,0.050684,0.033209,0.006643,0.0,0.0,0.01522,0.0,0.0,0.146221,0.0,0.0,0.0,0.005564,0.0,0.0,0.0,0.012561,0.007995,0.0,0.0,0.0,0.0,0.0,0.007862,0.0,0.0,0.011996,0.0,0.0,0.0,0.042965,0.011787,...,0.047073,0.0,0.0,0.051416,0.012639,0.0,0.0,0.0,0.0,0.0,0.0,0.064015,0.0,0.0,0.0,0.0,0.0,0.083713,0.013167,0.0,0.0,0.045663,0.048706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014441,0.0,0.0,0.022009,0.0,0.013821,0.015283,0.0,0.0,0.064015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Content Based Recommend

Content Based Recommend 결과를 뽑아내기 위한 메소드를 제작

target title(조회할 영화 제목)에 따라 코사인 유사도를 구한 matrix에서 유사도를 가져옴

- 유사도 데이터 중 가장 유사도 값이 큰 데이터를 가져옴
- 가져올 때 top k개를 가져옴
- 해당 추천 값 출력

In [24]:
def genre_recommendations(target_title, matrix, items, k=10):
    recom_idx = matrix.loc[:, target_title].values.reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
    recom_title = items.iloc[recom_idx, :].title.values
    recom_genre = items.iloc[recom_idx, :].genres.values
    target_title_list = np.full(len(range(k)), target_title)
    target_genre_list = np.full(len(range(k)), items[items.title == target_title].genres.values)
    d = {
        'target_title': target_title_list,
        'target_genre': target_genre_list,
        'recom_title' : recom_title,
        'recom_genre' : recom_genre
    }
    return pd.DataFrame(d)

In [25]:
genre_recommendations('The Dark Knight Rises', cosine_sim_df, movie_data)

Unnamed: 0,target_title,target_genre,recom_title,recom_genre
0,The Dark Knight Rises,"Action, Crime, Drama, Thriller",The Dark Knight,"Drama, Action, Crime, Thriller"
1,The Dark Knight Rises,"Action, Crime, Drama, Thriller",The Burglar,"Crime, Drama"
2,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman Begins,"Action, Crime, Drama"
3,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman & Robin,"Action, Crime, Fantasy"
4,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman,"Fantasy, Action"
5,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Raffles,"Adventure, Comedy, Crime, Drama, History, Roma..."
6,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Hero at Large,"Action, Comedy, Drama"
7,The Dark Knight Rises,"Action, Crime, Drama, Thriller",DC Showcase: Catwoman,"Action, Adventure, Animation, Science Fiction"
8,The Dark Knight Rises,"Action, Crime, Drama, Thriller",DC Super Hero Girls: Hero of the Year,Animation
9,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman Returns,"Action, Fantasy"


In [26]:
genre_recommendations('Jumanji', cosine_sim_df, movie_data)

Unnamed: 0,target_title,target_genre,recom_title,recom_genre
0,Jumanji,"Adventure, Fantasy, Family",The Games Maker,"Adventure, Family"
1,Jumanji,"Adventure, Fantasy, Family",Mostly Ghostly,"Family, Fantasy, Horror"
2,Jumanji,"Adventure, Fantasy, Family",Middle School: The Worst Years of My Life,"Family, Comedy"
3,Jumanji,"Adventure, Fantasy, Family",Mostly Ghostly: Have You Met My Ghoulfriend?,"Family, Fantasy, Horror"
4,Jumanji,"Adventure, Fantasy, Family",Where the Wild Things Are,"Family, Fantasy"
5,Jumanji,"Adventure, Fantasy, Family",In the Name of the King III,"Action, Adventure, Drama, Fantasy"
6,Jumanji,"Adventure, Fantasy, Family",Mostly Ghostly 3: One Night in Doom House,"Family, Fantasy, Horror"
7,Jumanji,"Adventure, Fantasy, Family",Clue,"Comedy, Thriller, Crime, Mystery"
8,Jumanji,"Adventure, Fantasy, Family",Zenon: Girl of the 21st Century,"Adventure, Comedy, Family, TV Movie"
9,Jumanji,"Adventure, Fantasy, Family",The Strange World of Planet X,"Science Fiction, Horror, Drama"


### 그래서 커피 추천 알고리즘에 어떻게 적용할 수 있는가?

tf-idf 기반으로 코사인 유사도를 측정한다는 것은 텍스트로 되어있는 값끼리의 유사도를 비교적 쉽게 측정할 수 있는 것으로 고려된다.

커피의 맛과 향에 대한 텍스트 테그 값을 정제해 코사인 유사도 행렬을 제작하면 비슷한 맛이나 향 정보를 통해 유사한 종류의 커피(원두, 캡슐) 상품을 추천해 줄 수 있을 것이라 생각한다. 