# 영화추천 시스템
- 코사인 유사도 활용
- TfidfVectorizer
- 영화의 줄거리

In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import files
up = files.upload()

In [5]:
movie = pd.read_csv('movies_metadata.csv', low_memory=False)
movie.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
movie.shape

(45466, 24)

In [7]:
df = movie[['title', 'overview']]
df.head(3)

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...


In [8]:
df.overview[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

#### **데이터 전처리**

In [9]:
# null data 확인
df.isnull().sum()

title         6
overview    954
dtype: int64

In [10]:
# 결측치 제거
df.dropna(inplace=True)
df.shape

(44506, 2)

In [11]:
# 중복 확인
df.title.nunique()

41371

In [12]:
# 중복값 제거
df.drop_duplicates(subset=['title'], inplace=True)
df.shape

(41371, 2)

In [13]:
df.tail(3)

Unnamed: 0,title,overview
45462,Century of Birthing,An artist struggles to finish his work while a...
45464,Satan Triumphant,"In a small town live two brothers, one a minis..."
45465,Queerama,50 years after decriminalisation of homosexual...


In [14]:
df.iloc[41370,0] # 결측치 제거했기 때문에 인덱스 넘버랑 상이함. [1,0]에서 0은 열이니까 title

'Queerama'

In [15]:
# So, 인덱스 넘버와 일치시켜주는 게 좋음. (인덱스 정리 : 인덱스와 행 번호 일치시키기)
df.set_index('title', inplace=True) # title 기준으로 인덱스 넘버 재지정
df.reset_index(inplace=True)
df.tail(3)

Unnamed: 0,title,overview
41368,Century of Birthing,An artist struggles to finish his work while a...
41369,Satan Triumphant,"In a small town live two brothers, one a minis..."
41370,Queerama,50 years after decriminalisation of homosexual...


In [16]:
#총 45,466건의 영화 중에서 20,000개의 영화 정보만 이용
df = df.head(20000)

#### **텍스트 전처리**

In [17]:
# 구둣점, 숫자 제거
df['clean_doc'] = df.overview.str.replace('[^A-Za-z ]', '') # 공백(스페이스바)도 지정해주기
df.head(5)

Unnamed: 0,title,overview,clean_doc
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Led by Woody Andys toys live happily in his ro...
1,Jumanji,When siblings Judy and Peter discover an encha...,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Cheated on mistreated and stepped on the women...
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just when George Banks has recovered from his ...


- **DTM 변환**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(stop_words='english')
dtm = tvect.fit_transform(df.overview)
dtm.shape # 47999건

(20000, 47999)

In [19]:
dtm_clean = tvect.fit_transform(df.clean_doc)
dtm_clean.shape # clean_doc이 되면 단어수가 더 줄어들 줄 알았는데 오히려 늘어남 (왜?)

(20000, 54842)

In [20]:
df.clean_doc[1]

'When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world they unwittingly invite Alan  an adult whos been trapped inside the game for  years  into their living room Alans only hope for freedom is to finish the game which proves risky as all three find themselves running from giant rhinoceroses evil monkeys and other terrifying creatures'

In [21]:
df.overview[1]

"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."

#### **영화의 타이틀과 인덱스를 가진 테이블**

In [22]:
indices = pd.Series(df.index, index=df.title)
indices.head(5)

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [23]:
indices['Jumanji']

1

#### **코사인 유사도 - 유사 영화 도출**

In [24]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim =linear_kernel(dtm, dtm)
cosine_clean = linear_kernel(dtm_clean, dtm_clean)

In [25]:
cosine_sim.shape, cosine_clean.shape

((20000, 20000), (20000, 20000))

#### **overview로 도출**

In [26]:
index = indices['The Dark Knight Rises']
index

17362

In [28]:
sim_scores = pd.Series(cosine_sim[index])
sim_scores.head(10)

0    0.000000
1    0.006117
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
7    0.000000
8    0.000000
9    0.032129
dtype: float64

In [30]:
sim_scores.sort_values(ascending=False).head(11).tail(10)

12041    0.319791
149      0.313563
1311     0.301648
14858    0.295149
583      0.277144
8966     0.237472
17165    0.208331
18775    0.205745
3042     0.193898
19171    0.187890
dtype: float64

In [31]:
movie_indices = sim_scores.sort_values(ascending=False).head(11).tail(10).index
movie_indices

Int64Index([12041, 149, 1311, 14858, 583, 8966, 17165, 18775, 3042, 19171], dtype='int64')

In [None]:
recommended_movies = df.title.iloc[movie_indices]
recommended_movies

In [34]:
def get_recommendation(title, cos_sim):
    index = indices[title]
    sim_scores = pd.Series(cosine_sim[index])
    movie_indices = sim_scores.sort_values(ascending=False).head(11).tail(10).index
    return df.title.iloc[movie_indices]

In [35]:
get_recommendation('Toy Story', cosine_sim)

14706               Toy Story 3
2945                Toy Story 2
9984     The 40 Year Old Virgin
1056      Rebel Without a Cause
11016    For Your Consideration
1910                  Condorman
3004            Man on the Moon
483                      Malice
11209              Factory Girl
16400                 Group Sex
Name: title, dtype: object

#### **clean doc으로부터 가져오기**

In [36]:
get_recommendation('Toy Story', cosine_clean)

14706               Toy Story 3
2945                Toy Story 2
9984     The 40 Year Old Virgin
1056      Rebel Without a Cause
11016    For Your Consideration
1910                  Condorman
3004            Man on the Moon
483                      Malice
11209              Factory Girl
16400                 Group Sex
Name: title, dtype: object