# [실습] 영화 추천 시스템
* ref : https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

<img src="./imgs/lec10_logo.png">

In [26]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

## Data Load

* 영화정보 데이터 (movies_metadata.csv) : 전체 45,466 * 24 (이 실습에서는 앞의 20,000행만 사용)


In [27]:
# Read the movie metadata CSV and keep only its first 20,000 rows.
data = pd.read_csv("./datasets/movies_metadata.csv").head(20000)
# Print column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  20000 non-null  object 
 1   belongs_to_collection  2399 non-null   object 
 2   budget                 20000 non-null  object 
 3   genres                 20000 non-null  object 
 4   homepage               3055 non-null   object 
 5   id                     20000 non-null  object 
 6   imdb_id                19993 non-null  object 
 7   original_language      19999 non-null  object 
 8   original_title         20000 non-null  object 
 9   overview               19865 non-null  object 
 10  popularity             19998 non-null  object 
 11  poster_path            19907 non-null  object 
 12  production_companies   19999 non-null  object 
 13  production_countries   19999 non-null  object 
 14  release_date           19983 non-null  object 
 15  re

In [28]:
# Preview the first two rows.
data.head(n=2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


## 결측처리
* overview

In [29]:
# Report how many overviews are missing, replace them with empty strings,
# then report again to confirm that none remain.
print('overview 열의 결측값의 수:', data['overview'].isna().sum())
data['overview'] = data['overview'].fillna(value='')
print('overview 열의 결측값의 수:', data['overview'].isna().sum())

overview 열의 결측값의 수: 135
overview 열의 결측값의 수: 0


In [30]:
# Disabled: inspect NLTK's English stop-word list (its size and first 10 entries).
# `stop_words_list` is not referenced by the cells that follow; the TF-IDF cell
# passes stop_words='english' to TfidfVectorizer instead.
# stop_words_list = stopwords.words('english')
# print('불용어 개수 :', len(stop_words_list))
# print('불용어 10개 출력 :',stop_words_list[:10])

## TF-IDF를 이용해 코사인 유사도 계산
* cosine_sim

In [31]:
# Vectorize the overviews with TF-IDF, ignoring English stop words.
# fit_transform learns the vocabulary and encodes every overview in one call.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
print('TF-IDF 행렬의 크기(shape) :', tfidf_matrix.shape)

# Uncomment to inspect the first document's vector or the learned vocabulary:
# print(tfidf_matrix[0].toarray()) # [[0. 0. 0. ... 0. 0. 0.]]
# print(tfidf.vocabulary_)         # {'led': 24361, 'woody': 46617, 'andy': 2051 ... }

TF-IDF 행렬의 크기(shape) : (20000, 47487)


In [32]:
# Pairwise cosine similarity between every pair of overview vectors.
# When Y is omitted, cosine_similarity compares X against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
print('코사인 유사도 연산 결과 :', cosine_sim.shape)

코사인 유사도 연산 결과 : (20000, 20000)


In [33]:
# Preview the first five titles.
data["title"].head(5)

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [34]:
# Label both axes of the similarity matrix with movie titles so that a
# movie's similarity row can be looked up by title.
cos_df = pd.DataFrame(data=cosine_sim, index=data["title"], columns=data["title"])
cos_df.head(5)

title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,The American President,Dracula: Dead and Loving It,Balto,Nixon,Cutthroat Island,Casino,Sense and Sensibility,Four Rooms,Ace Ventura: When Nature Calls,Money Train,Get Shorty,Copycat,Assassins,Powder,Leaving Las Vegas,Othello,Now and Then,Persuasion,The City of Lost Children,Shanghai Triad,Dangerous Minds,Twelve Monkeys,Wings of Courage,Babe,Carrington,Dead Man Walking,Across the Sea of Time,It Takes Two,Clueless,"Cry, the Beloved Country",Richard III,Dead Presidents,Restoration,Mortal Kombat,To Die For,How To Make An American Quilt,Se7en,Pocahontas,When Night Is Falling,The Usual Suspects,...,Kiss and Tell,"Oui, mais...",Space Adventure Cobra,L'aventure c'est l'aventure,A Season for Miracles,The Awful Truth,1911,Looking for Jackie,The Shaolin Temple,Shaolin Temple 2: Kids from Shaolin,Carol Channing: Larger Than Life,Memory,Martial Arts of Shaolin,Snowmageddon,Stand Up and Fight,Jazz,Barbara,A Liar's Autobiography: The Untrue Story of Monty Python's Graham Chapman,Little Miss Broadway,Miss Annie Rooney,Mr. Belvedere Goes to College,The Hobbit: An Unexpected Journey,Mon Paradis - Der Winterpalast,No Flesh Shall Be Spared,I am Von Höfler Variation on Werther,Born to Defend,Dragon Fight,The Master,Kung Fu Cult Master,The Fitzgerald Family Christmas,Hyde Park on Hudson,Lay the Favorite,Audition/Talent Competition,All's Faire in Love,Crooked Arrows,The Enforcer,Dr. Wai in the Scriptures with No Words,Once Upon a Time in China and America,Contract Killer,The Executor,Never Say... Never!,Calmos,How to Make Love to a Woman,"After Fall, Winter",Violeta Went to Heaven,Rebellion,Versailles,Two in the Wave,Lotte Reiniger: Homage to the Inventor of the Silhouette Film,"RKO Production 601: The Making of 'Kong, the Eighth Wonder of the World'"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
Toy Story,1.0,0.015757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04114,0.0,0.0,0.009941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006348,0.0,0.0,0.009321,0.0,0.0,0.0,0.013822,...,0.0,0.0,0.011365,0.0,0.0,0.0,0.011791,0.014688,0.013539,0.007355,0.011161,0.0,0.01452,0.0,0.0,0.0,0.0,0.0,0.019853,0.027106,0.0,0.0,0.0,0.0,0.0,0.0,0.117982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009979,0.060997,0.0,0.0,0.0,0.024585,0.0,0.0,0.0
Jumanji,0.015757,1.0,0.049073,0.0,0.0,0.05183,0.0,0.0,0.106355,0.0,0.007616,0.0,0.0,0.008233,0.0,0.0,0.0,0.028753,0.0,0.0,0.0,0.0,0.006043,0.0,0.0,0.024285,0.024941,0.006489,0.0,0.0,0.0,0.04725,0.0,0.00556,0.004326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025758,0.0,0.0,0.01848,0.008127,0.0,0.016429,...,0.0,0.0,0.004063,0.0,0.0,0.0,0.006281,0.0,0.0,0.0,0.030391,0.0,0.0,0.027595,0.043131,0.0,0.011296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007418,0.01348,0.009529,0.0,0.0,0.060063,0.0,0.0,0.0,0.013191,0.0,0.0,0.0,0.0,0.0,0.0,0.029242,0.0,0.0,0.004271,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.049073,1.0,0.0,0.025005,0.0,0.0,0.0065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01191,0.0,0.010392,0.0,0.006408,0.0,0.006683,0.0,0.0,0.005282,0.0,0.0,0.0,0.0,0.0,0.0,0.00748,0.0,0.0,0.0,0.0165,0.016821,0.0,...,0.0,0.0,0.0,0.0,0.007415,0.0,0.0,0.0,0.0,0.015443,0.0,0.007561,0.0,0.031215,0.0,0.0,0.0,0.014109,0.0,0.009995,0.02621,0.0,0.0,0.0,0.006808,0.0,0.015605,0.0,0.0,0.023065,0.0,0.0,0.016035,0.0,0.0,0.010563,0.0,0.00914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,1.0,0.0,0.007139,0.0,0.009396,0.0,0.0,0.0,0.0,0.0,0.008025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015244,0.007419,0.0,0.0,0.0,0.017338,0.0,0.0,0.020718,0.0,0.0,0.008459,0.0,0.007564,0.015534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009875,0.0,0.0,0.0,0.0,...,0.017565,0.0,0.003453,0.0,0.0,0.0,0.017731,0.011984,0.0,0.0,0.0,0.0,0.0,0.007155,0.0,0.0,0.0,0.0,0.0,0.023705,0.0,0.0,0.0,0.0,0.0,0.0,0.00544,0.011392,0.0,0.0,0.0,0.0,0.0,0.0,0.021712,0.0,0.0,0.0,0.0,0.0,0.008305,0.0,0.008952,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.025005,0.0,1.0,0.0,0.03298,0.0,0.032751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016175,0.0,0.022556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010453,0.0,0.0,0.0,0.0,0.0,0.012483,0.0,0.0,0.008551,0.0,0.0,0.0,0.0,0.012535,0.005066,0.0,0.0,0.0,0.0,0.02618,0.0,0.0,...,0.027553,0.0,0.028289,0.0,0.01364,0.0,0.0,0.0,0.0,0.009036,0.009067,0.0,0.0,0.008951,0.0,0.0,0.0,0.0,0.023089,0.0,0.0,0.014366,0.0,0.0,0.042243,0.0,0.010164,0.0,0.0,0.018671,0.0,0.0,0.0,0.0,0.0,0.015346,0.0,0.0,0.0,0.0,0.009277,0.0,0.0,0.025417,0.0,0.0,0.078359,0.0,0.0,0.0


## 유사영화 Top10 검색
* 영화제목을 넣으면 코사인유사도 Top10 영화 제목을 리턴한다.

In [35]:
# Top-10 movies most similar to the query title.
# The query's own entry is dropped by label instead of slicing off the first
# sorted element: when another movie ties with it (identical overview) or its
# row is all zeros (empty overview), the query is not guaranteed to sort first.
# Assumes the query title occurs only once in `cos_df`, so .loc returns a Series.
(
    cos_df.loc['The Dark Knight Rises']
    .drop('The Dark Knight Rises')
    .sort_values(ascending=False)
    .head(10)
)

title
The Dark Knight                            0.321521
Batman Forever                             0.315328
Batman Returns                             0.305010
Batman: Under the Red Hood                 0.296967
Batman                                     0.276030
Batman Beyond: Return of the Joker         0.236289
Batman: Year One                           0.209219
Batman: The Dark Knight Returns, Part 1    0.206999
Batman: Mask of the Phantasm               0.196292
Batman Begins                              0.179930
Name: The Dark Knight Rises, dtype: float64

## 비추천 방식 (참고용)
* 교재 예제 코드 — 위의 `cos_df` 방식을 권장하며, 아래 코드는 참고용으로만 둔다.

### 영화검색을 위한 색인 dict 만들기
* title_to_index : 영화 제목을 넣으면 해당 영화의 인덱스를 돌려주는 dict
* title_to_index = {'영화제목' : 영화의인덱스 }

In [36]:
# zip pairs the two lists element by element; list() materialises the pairs.
list(zip(['a', 'b'], [0, 1]))

[('a', 0), ('b', 1)]

In [37]:
# dict() turns the (key, value) pairs produced by zip into a mapping.
dict(zip(['a', 'b'], [0, 1]))

{'a': 0, 'b': 1}

In [38]:
# Map each movie title to its row label in `data`: {title: index}.
# dict() keeps the last value for a repeated key, so a title that occurs more
# than once resolves to its last row.
title_to_index = dict(zip(data['title'], data.index))

# Round-trip check: look up the row label for a known title, then read the
# title back from that same row (using the looked-up label, not a literal).
idx = title_to_index['Father of the Bride Part II']
print(idx)
print(data.loc[idx, 'title'])

4
Father of the Bride Part II


In [39]:
# Number of titles, shown as a one-element shape tuple.
data.loc[:, 'title'].shape

(20000,)

In [40]:
# -------------------------------------------------------------
# Given a movie title, return the titles of the movies whose
# overviews are most similar to it by cosine similarity.
# -------------------------------------------------------------
def my_top10_movie(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the `top_n` movies most similar to `title`.

    Args:
        title: Movie title; must be a key of the module-level `title_to_index`.
        cosine_sim: Similarity matrix whose row `i` holds the scores of movie
            `i` against every movie. Defaults to the matrix bound to
            `cosine_sim` when this function is defined.
        top_n: How many similar movies to return (default 10).

    Returns:
        The 'title' entries of the module-level `data` for the selected rows,
        ordered from most to least similar. The query movie itself is never
        included.

    Raises:
        KeyError: if `title` is not present in `title_to_index`.
        ValueError: if `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if title not in title_to_index:
        raise KeyError(f"Movie title not found: {title!r}")

    # Row label of the requested movie. It is used below as a position into
    # `cosine_sim`, which relies on `data` having the default 0..n-1 index.
    idx = title_to_index[title]

    # Similarity of the requested movie against every movie.
    scores = np.asarray(cosine_sim[idx])

    # Rank rows from most to least similar. A stable sort keeps tied scores in
    # ascending row order, matching sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query movie by index instead of assuming it sorts first:
    # with tied scores (identical overviews) or an all-zero row (empty
    # overview) it may not, and slicing [1:11] would then drop a real
    # neighbour and keep the query itself.
    ranked = ranked[ranked != idx][:top_n]

    # (row, score) pairs of the selected movies, shown for inspection.
    sim_scores = [(int(row), scores[row]) for row in ranked]
    print(sim_scores)

    # Row labels of the selected movies.
    movie_indices = [row for row, _ in sim_scores]

    # Titles of the selected movies.
    return data.loc[movie_indices, 'title']

my_top10_movie('The Dark Knight Rises')

[(12481, 0.32152142350025487), (150, 0.31532758128008986), (1328, 0.30500953745851006), (15511, 0.29696687233514857), (585, 0.2760295597760307), (9230, 0.2362886826517737), (18035, 0.20921858266951263), (19792, 0.2069989680094894), (3095, 0.1962924866191582), (10122, 0.17992977744685185)]


12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object