## Import Module

In [1]:
import os
import os.path as path
import gc
import re
import math
import json

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Setting

In [5]:
pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', 100)

In [6]:
# 디렉토리 기본 경로 지정
DIR_PATH = path.join('.', 'kaggle-the-movies-dataset')
DIR_SAVE_PATH = path.join('.', 'kaggle-the-movies-dataset', 'output')

print(DIR_PATH)
print(DIR_SAVE_PATH)

.\kaggle-the-movies-dataset
.\kaggle-the-movies-dataset\output


### 데이터 전처리

In [7]:
# movie 경로
path_movie = path.join(DIR_PATH, 'movies_metadata.csv')
print(path_movie)

.\kaggle-the-movies-dataset\movies_metadata.csv


In [8]:
# 데이터 로드 후 확인
movie_data = pd.read_csv(path_movie, low_memory=False)
# 일단 언어가 영어인 영화 자료만 활용
movie_data = movie_data.loc[movie_data['original_language'] == 'en']
movie_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
movie_data.shape

(32269, 24)

In [10]:
movie_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [11]:
# 오로지 장르와 키워드 기반 유사도를 측정할 계획이므로 여기선 장르 정보만 추출하면 됨
movie_data = movie_data[['id', 'original_language', 'title', 'genres']]
movie_data.head()

Unnamed: 0,id,original_language,title,genres
0,862,en,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,en,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,en,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,en,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,en,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [12]:
# keywords 경로
path_keyword = path.join(DIR_PATH, 'keywords.csv')
print(path_keyword)

.\kaggle-the-movies-dataset\keywords.csv


In [13]:
# 데이터 로드 후 확인
movie_keyword = pd.read_csv(path_keyword, low_memory=False)
print(movie_keyword.shape)
movie_keyword.head()

(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [14]:
# 장르 데이터와 키워드 데이터를 merge
# id를 기준으로 두 df를 결합
movie_data = movie_data.astype({'id':'int'})
movie_keyword = movie_keyword.astype({'id':'int'})
print(movie_data['id'].dtypes, movie_keyword['id'].dtypes)

movie_data = pd.merge(movie_data, movie_keyword, on='id')
print(movie_data.shape)
movie_data.head()

int32 int32
(32852, 5)


Unnamed: 0,id,original_language,title,genres,keywords
0,862,en,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,en,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,en,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,en,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,en,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### 장르와 키워드 전처리

장르와 키워드의 구조를 확인 하면 list 내부에 dict으로 구성되어 있는 것을 확인할 수 있다.

이런 경우를 해결하기 위해서 ast의 literal_eval을 사용

> **ast란?**
> 
> 파이썬 추상 구문 문법의 트리를 처리하는 데 도움을 주는 모듈.  
> => 텍스트로 된 파이썬 추상 구문을 파이썬 객체로 바꿔준다.

In [15]:
# literal_eval로 장르를 파이썬 객체로 변환 후 id를 제거, name정보만 추출
movie_data['genres'] = movie_data['genres'].apply(literal_eval)
movie_data['genres'] = movie_data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : ", ".join(x))

In [16]:
# literal_eval로 키워드를 파이썬 객체로 변환 후 id를 제거, name정보만 추출
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : ", ".join(x))

In [17]:
movie_data.head()

Unnamed: 0,id,original_language,title,genres,keywords
0,862,en,Toy Story,"Animation, Comedy, Family","jealousy, toy, boy, friendship, friends, rival..."
1,8844,en,Jumanji,"Adventure, Fantasy, Family","board game, disappearance, based on children's..."
2,15602,en,Grumpier Old Men,"Romance, Comedy","fishing, best friend, duringcreditsstinger, ol..."
3,31357,en,Waiting to Exhale,"Comedy, Drama, Romance","based on novel, interracial relationship, sing..."
4,11862,en,Father of the Bride Part II,Comedy,"baby, midlife crisis, confidence, aging, daugh..."


### TF-IDF 벡터화

전처리한 데이터를 TF-IDF 방법을 이용해 벡터로 변환

장르와 키워드를 구분없이 하나로 합친 뒤 tfidf vector로 제작

In [18]:
tfidf_vector = TfidfVectorizer()
tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres'] + ", " + movie_data['keywords']).toarray()
tfidf_matrix_feature = tfidf_vector.get_feature_names_out()

In [19]:
tfidf_matrix.shape

(32852, 11437)

In [20]:
tfidf_matrix = pd.DataFrame(tfidf_matrix, columns=tfidf_matrix_feature, index = movie_data.title)
print(tfidf_matrix.shape)
tfidf_matrix.head()

(32852, 11437)


Unnamed: 0_level_0,077,10,11,13,1500s,15th,16th,17th,1812,18th,1900s,1910s,1917,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,1992,1995,19th,1st,2000,2001,2002,2079,20th,21st,230,25th,2nd,360,3d,500,51,60s,66,70s,80,95,aaron,abandoned,abandonment,abba,abbess,abc,abdication,...,youtube,youtuber,yucatec,yugo,yugoslavia,yukon,yun,yuppie,zagreb,zaire,zanzibar,zealand,zealot,zealous,zebra,zeit,zeitgeist,zen,zeppelin,zero,zeus,ziegfeld,zimbabwe,zinnia,zionism,zip,zither,zodiac,zombie,zombification,zone,zoo,zookeeper,zoom,zoophilia,zorro,zulu,zurich,øverste,żółty,βάφτηκε,γη,κόκκινο,το,χώμα,миньоны,卧底肥妈,绝地奶霸,自然界大事件,超级妈妈
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
print(tfidf_matrix.dtypes)

tfidf_matrix = tfidf_matrix.astype(np.float16)
print(tfidf_matrix.dtypes)

077        float64
10         float64
11         float64
13         float64
1500s      float64
            ...   
миньоны    float64
卧底肥妈       float64
绝地奶霸       float64
自然界大事件     float64
超级妈妈       float64
Length: 11437, dtype: object
077        float16
10         float16
11         float16
13         float16
1500s      float16
            ...   
миньоны    float16
卧底肥妈       float16
绝地奶霸       float16
自然界大事件     float16
超级妈妈       float16
Length: 11437, dtype: object


### 유사도 계산

만들어진 tf-idf vector를 코사인 유사도를 활용해서 유사도 값을 계산

영화 개수(n)만큼 n x n의 matirx 형태가 나오게 된다.

In [23]:
%%time
cosine_sim = cosine_similarity(tfidf_matrix)

CPU times: total: 17min 37s
Wall time: 1min 55s


In [26]:
print(cosine_sim.shape)
print(cosine_sim.dtype)

cosine_sim = cosine_sim.astype(np.float16)
print(cosine_sim.dtype)

cosine_sim

(32852, 32852)
float64
float16


array([[1.      , 0.04156 , 0.008705, ..., 0.      , 0.      , 0.      ],
       [0.04156 , 1.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.008705, 0.      , 1.      , ..., 0.      , 0.      , 0.      ],
       ...,
       [0.      , 0.      , 0.      , ..., 1.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ]],
      dtype=float16)

In [27]:
cosine_sim_df = pd.DataFrame(cosine_sim, index = movie_data.title, columns = movie_data.title, dtype=np.float16)
print(cosine_sim_df.shape)
cosine_sim_df.head()

(32852, 32852)


title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,The American President,Dracula: Dead and Loving It,Balto,Nixon,Cutthroat Island,Casino,Sense and Sensibility,Four Rooms,Ace Ventura: When Nature Calls,Money Train,Get Shorty,Copycat,Assassins,Powder,Leaving Las Vegas,Othello,Now and Then,Persuasion,Dangerous Minds,Twelve Monkeys,Babe,Carrington,Dead Man Walking,Across the Sea of Time,It Takes Two,Clueless,"Cry, the Beloved Country",Richard III,Dead Presidents,Restoration,Mortal Kombat,To Die For,How To Make An American Quilt,Se7en,Pocahontas,When Night Is Falling,The Usual Suspects,Guardian Angel,Mighty Aphrodite,The Big Green,...,Take Me,A Death in the Gunj,Black Sun,The Incredible Jessica James,It Stains the Sands Red,Opus II,Opus III,Bloodletting,Can't Buy My Love,Hopeless Romantic,The Sparrow's Fluttering,Savages,Swing,Pro Lyuboff,Kuka,Dead Birds,The Hunters,"Whiffles, Cubic Artist",Cop and a Half: New Recruit,Dyketactics,Arabian Nights,The Fortunes and Misfortunes of Moll Flanders,An American Vampire Story,The Sublet,Fit to Kill,TechnoCalyps,Starquest II,Rivers of Sand,Altar of Fire,The Wonders of Aladdin,Phobos. Fear Kills,The Final Storm,In a Heartbeat,Jungle Woman,To Be Fat Like Me,Cadet Kelly,The Scheming Gambler's Paradise,The Hilarious Posters,The Devilish Tenant,Pooh's Heffalump Halloween Movie,Deep Hearts,The Morning After,House of Horrors,Shadow of the Blair Witch,The Burkittsville 7,Caged Heat 3000,Robin Hood,Betrayal,Satan Triumphant,Queerama
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
Toy Story,1.0,0.041565,0.008705,0.006939,0.005596,0.0,0.006458,0.059235,0.0,0.0,0.038025,0.014023,0.027161,0.0,0.0,0.0,0.02684,0.027939,0.008606,0.054474,0.005302,0.0,0.035797,0.073303,0.0,0.154541,0.054504,0.0,0.0,0.0,0.014992,0.0,0.006413,0.051483,0.033875,0.006386,0.0,0.0,0.0,0.055634,0.035034,0.033447,0.011421,0.0,0.041473,0.0,0.0,0.0,0.004837,0.054138,...,0.037598,0.0,0.0,0.041046,0.010094,0.126221,0.0,0.0,0.0,0.0,0.0,0.051117,0.0,0.0,0.0,0.035889,0.0,0.066833,0.035492,0.0,0.063599,0.036469,0.03891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080017,0.0,0.018738,0.017578,0.0,0.011032,0.012207,0.051697,0.0,0.051117,0.02832,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.041565,1.0,0.0,0.065063,0.0,0.0,0.0,0.165649,0.028305,0.011467,0.042847,0.0,0.03038,0.0,0.024445,0.0,0.043976,0.031494,0.028839,0.067749,0.049744,0.0,0.013535,0.014374,0.0,0.0,0.01918,0.0,0.0,0.0,0.034088,0.0,0.008904,0.144165,0.033112,0.0,0.0,0.0,0.0,0.0,0.130859,0.065186,0.015854,0.0,0.041473,0.0,0.0,0.0,0.0,0.093628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049774,0.0,0.0,0.034698,0.0,0.205444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038025,0.043274,0.026016,0.0,0.0,0.048309,0.053406,0.028702,0.0,0.0,0.039307,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.008705,0.0,1.0,0.035828,0.010902,0.0,0.033356,0.0,0.0,0.0,0.036865,0.027328,0.0,0.0,0.0,0.0,0.015083,0.010223,0.016769,0.010994,0.01033,0.0,0.0,0.0,0.012634,0.0,0.01133,0.157959,0.0,0.0,0.008659,0.024704,0.0,0.0,0.051788,0.03299,0.0,0.0,0.0,0.021591,0.0,0.012238,0.015465,0.0,0.008324,0.042908,0.0,0.0,0.024994,0.018326,...,0.073242,0.0,0.0,0.212036,0.019669,0.0,0.0,0.0,0.043457,0.070862,0.139893,0.099548,0.10675,0.139893,0.139893,0.0,0.0,0.130249,0.020477,0.0,0.0,0.188354,0.075806,0.0,0.023346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05954,0.0,0.0,0.034241,0.0,0.0215,0.023773,0.0,0.0,0.099548,0.0,0.0,0.0,0.0,0.10675,0.0,0.0,0.0
Waiting to Exhale,0.006939,0.065063,0.035828,1.0,0.09375,0.003807,0.06366,0.027481,0.0,0.0,0.037231,0.02179,0.0,0.007072,0.0,0.008812,0.105225,0.008148,0.062866,0.041199,0.10199,0.006447,0.0,0.022491,0.014397,0.010857,0.015442,0.025864,0.007523,0.0,0.011795,0.028137,0.002975,0.023895,0.04126,0.033325,0.016861,0.011604,0.005642,0.024612,0.056335,0.016678,0.017624,0.0,0.051514,0.048889,0.006607,0.037628,0.065674,0.01461,...,0.05838,0.0,0.0,0.168945,0.02124,0.0,0.0,0.0,0.034637,0.056458,0.159424,0.135742,0.121643,0.159424,0.159424,0.0,0.0,0.103821,0.016327,0.009178,0.02951,0.190308,0.060425,0.027908,0.0186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047455,0.010902,0.008698,0.027298,0.0,0.017136,0.018951,0.0,0.0,0.135742,0.0,0.0,0.0,0.0,0.121643,0.037628,0.0,0.0
Father of the Bride Part II,0.005596,0.0,0.010902,0.09375,1.0,0.0,0.037994,0.0,0.0,0.0,0.008942,0.017578,0.0,0.0,0.0,0.0,0.0,0.006573,0.05069,0.033203,0.006645,0.0,0.0,0.015213,0.0,0.0,0.14624,0.0,0.0,0.0,0.005566,0.0,0.0,0.0,0.012566,0.007996,0.0,0.0,0.0,0.0,0.0,0.007866,0.0,0.0,0.011986,0.0,0.0,0.0,0.042969,0.011787,...,0.047089,0.0,0.0,0.051453,0.012642,0.0,0.0,0.0,0.0,0.0,0.0,0.064026,0.0,0.0,0.0,0.0,0.0,0.08374,0.013168,0.0,0.0,0.045685,0.048737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014442,0.0,0.0,0.022018,0.0,0.013824,0.015289,0.0,0.0,0.064026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
print(cosine_sim_df.dtypes)
cosine_sim_df.info(memory_usage='deep')

title
Toy Story                      float16
Jumanji                        float16
Grumpier Old Men               float16
Waiting to Exhale              float16
Father of the Bride Part II    float16
                                ...   
Caged Heat 3000                float16
Robin Hood                     float16
Betrayal                       float16
Satan Triumphant               float16
Queerama                       float16
Length: 32852, dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 32852 entries, Toy Story to Queerama
Columns: 32852 entries, Toy Story to Queerama
dtypes: float16(32852)
memory usage: 2.0 GB


### Content Based Recommend

Content Based Recommend 결과를 뽑아내기 위한 메소드를 제작

target title(조회할 영화 제목)에 따라 코사인 유사도를 구한 matrix에서 유사도를 가져옴

- 유사도 데이터 중 가장 유사도 값이 큰 데이터를 가져옴
- 가져올 때 top k개를 가져옴
- 해당 추천 값 출력

In [29]:
def genre_recommendations(target_title, matrix, items, k=10):
    recom_idx = matrix.loc[:, target_title].values.reshape(1, -1).argsort()[:, ::-1].flatten()[1:k+1]
    recom_title = items.iloc[recom_idx, :].title.values
    recom_genre = items.iloc[recom_idx, :].genres.values
    target_title_list = np.full(len(range(k)), target_title)
    target_genre_list = np.full(len(range(k)), items[items.title == target_title].genres.values)
    d = {
        'target_title': target_title_list,
        'target_genre': target_genre_list,
        'recom_title' : recom_title,
        'recom_genre' : recom_genre
    }
    return pd.DataFrame(d)

In [30]:
genre_recommendations('The Dark Knight Rises', cosine_sim_df, movie_data)

Unnamed: 0,target_title,target_genre,recom_title,recom_genre
0,The Dark Knight Rises,"Action, Crime, Drama, Thriller",The Dark Knight,"Drama, Action, Crime, Thriller"
1,The Dark Knight Rises,"Action, Crime, Drama, Thriller",The Burglar,"Crime, Drama"
2,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman Begins,"Action, Crime, Drama"
3,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman & Robin,"Action, Crime, Fantasy"
4,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman,"Fantasy, Action"
5,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Raffles,"Adventure, Comedy, Crime, Drama, History, Roma..."
6,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Hero at Large,"Action, Comedy, Drama"
7,The Dark Knight Rises,"Action, Crime, Drama, Thriller",DC Showcase: Catwoman,"Action, Adventure, Animation, Science Fiction"
8,The Dark Knight Rises,"Action, Crime, Drama, Thriller",DC Super Hero Girls: Hero of the Year,Animation
9,The Dark Knight Rises,"Action, Crime, Drama, Thriller",Batman Returns,"Action, Fantasy"


In [31]:
genre_recommendations('Jumanji', cosine_sim_df, movie_data)

Unnamed: 0,target_title,target_genre,recom_title,recom_genre
0,Jumanji,"Adventure, Fantasy, Family",The Games Maker,"Adventure, Family"
1,Jumanji,"Adventure, Fantasy, Family",Mostly Ghostly,"Family, Fantasy, Horror"
2,Jumanji,"Adventure, Fantasy, Family",Middle School: The Worst Years of My Life,"Family, Comedy"
3,Jumanji,"Adventure, Fantasy, Family",Mostly Ghostly: Have You Met My Ghoulfriend?,"Family, Fantasy, Horror"
4,Jumanji,"Adventure, Fantasy, Family",Where the Wild Things Are,"Family, Fantasy"
5,Jumanji,"Adventure, Fantasy, Family",In the Name of the King III,"Action, Adventure, Drama, Fantasy"
6,Jumanji,"Adventure, Fantasy, Family",Mostly Ghostly 3: One Night in Doom House,"Family, Fantasy, Horror"
7,Jumanji,"Adventure, Fantasy, Family",Clue,"Comedy, Thriller, Crime, Mystery"
8,Jumanji,"Adventure, Fantasy, Family",Zenon: Girl of the 21st Century,"Adventure, Comedy, Family, TV Movie"
9,Jumanji,"Adventure, Fantasy, Family",The Strange World of Planet X,"Science Fiction, Horror, Drama"


### 그래서 커피 추천 알고리즘에 어떻게 적용할 수 있는가?

tf-idf 기반으로 코사인 유사도를 측정한다는 것은 텍스트로 되어있는 값끼리의 유사도를 비교적 쉽게 측정할 수 있는 것으로 고려된다.

커피의 맛과 향에 대한 텍스트 테그 값을 정제해 코사인 유사도 행렬을 제작하면 비슷한 맛이나 향 정보를 통해 유사한 종류의 커피(원두, 캡슐) 상품을 추천해 줄 수 있을 것이라 생각한다. 

In [32]:
# 데이터 저장 경로
path_save_movie = path.join(DIR_SAVE_PATH, 'pre_movies_metadata.csv')
print(path_save_movie)

# 파일 저장
os.makedirs(DIR_SAVE_PATH, exist_ok=True)
movie_data.to_csv(path_save_movie, index = False)

.\kaggle-the-movies-dataset\output\pre_movies_metadata.csv


In [None]:
# # path_save_movie_genre는 너무 크기 때문에 있는 그대로 저장하지 않는다.
# # 데이터 저장 경로
# path_save_movie_genre = path.join(DIR_SAVE_PATH, 'movies_genre_cosine_sim_df.pkl')
# print(path_save_movie_genre)

# # 파일 저장
# os.makedirs(DIR_SAVE_PATH, exist_ok=True)
# cosine_sim_df.to_pickle(path_save_movie_genre)

## 추가 정리

현재 참고하고 있는 코드는 아이템의 키워드를 중심으로 유사도를 분석하는 아이템 기반 추천 알고리즘이 되겠다.

tf-idf vector는 word embedding 방식이기 때문에 많은 sparse matrix로 생성된다.  
어느정도의 compress 작업을 통해 sparse matrix의 용량을 줄이는것도 하나의 방법이다.
그러나 메모리상에서 분석 할 때는 nxn규모의 희소행렬을 유지해야하므로 data storage 측면에서 효율을 제외하면 그리 유의미한 방법은 아니다.

연산의 규모나 시간을 줄이기 위해서 유사한 관계끼리 미리 그룹화 하는 방법을 고려해 봤으나 장단점이 뚜렷하다.

**장점**
- 그룹화(필터링)된 아이템간의 유사도 측정을 하면 되므로 연산량이 상당 부분 줄어들 수 있다.
- 기획한 서비스에 맞게 잘 필터링하면 사람이 느끼기에 뜬금 없는 아이템의 추천을 방지할 수 있다.(휴리스틱)

**단점**
- 필터링된 그룹안의 유사도는 모든 vocabulary(word token)간의 관계성(유사도, 해당 알고리즘에선 cosine_similarity)을 대표하지 않아 오차가 있다.
- 사람이 일반적으로 특정하지 못하는 데이터의 의외성을 확보하지 못한다.  
  ex) 강아지를 키우는 사람들에게 유모차를 추천해 주는 것과 같이 데이터 상으론 유의미하지만 일반적인 인싱에 벗어나는 상품 추천

### 큰 데이터를 축소하는 방법

tf-idf vector는 용량을 크게 먹는다. 이러한 용량을 줄이는 방법을 몇 개 찾아봤다.

데이터 타입을 경량화 한다. pandas는 정밀도를 위해 기본적으로 float64로 계산하나, float32또는 float16로 타입 지정해도 충분 할 거 같다.

분석에 필요한 word가 많으면 embedding해야 할 vector의 개수도 많아지므로 어느 정도 개수를 제한하는 것이 방법이다.

tf-idf vector가 아닌 다른 word embedding를 시도하는 것도 방법이다. word2vec같은 연관성을 내포한 dense vector matrix로 생성하는 방법이라던가...