### 영화추천 시스템
- 영화의 줄거리 --> TfidfVectorizer --> 코사인 유사도

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

##### 1. 데이터 탐색

In [2]:
# Load the raw movie metadata. low_memory=False makes pandas parse the file in
# one pass instead of in chunks, which avoids mixed-type column inference.
movie = pd.read_csv('../data/movies_metadata.csv', low_memory=False)
# Preview the first two rows
movie.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [3]:
# (number of rows, number of columns) of the raw table
movie.shape

(45466, 24)

In [4]:
# Keep only the two columns the recommender needs. .copy() makes df an
# independent frame, so the in-place cleaning steps that follow modify df
# itself rather than a slice of `movie` (which pandas reports with a
# SettingWithCopyWarning, hidden here because warnings are filtered).
df = movie[['title', 'overview']].copy()
df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [5]:
# Show the complete overview text of the first row (the table view truncates it)
print(df.loc[0, 'overview'])

Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.


##### 2. 데이터 전처리

In [6]:
# Count the missing values in each column
df.isna().sum()

title         6
overview    954
dtype: int64

In [7]:
# Drop every row where the title or the overview is missing. df is rebound to
# the result instead of using inplace=True, so no frame that may still be a
# slice of `movie` is mutated and the cell gives the same result when re-run.
df = df.dropna(how='any')
# Confirm that no missing values remain and show the new size
df.isna().sum(), df.shape

(title       0
 overview    0
 dtype: int64,
 (44506, 2))

In [8]:
# Check for duplicates: number of distinct titles (compare with the row count)
df.title.nunique()

41371

In [9]:
# Keep only the first row for each title so that a title identifies exactly
# one movie. df is rebound to the result rather than modified with
# inplace=True, which keeps the cell safe to re-run.
df = df.drop_duplicates(subset=['title'])
df.shape

(41371, 2)

In [10]:
# The index labels still come from the original file, so they no longer match
# the row positions after rows were dropped
df.tail(3)

Unnamed: 0,title,overview
45462,Century of Birthing,An artist struggles to finish his work while a...
45464,Satan Triumphant,"In a small town live two brothers, one a minis..."
45465,Queerama,50 years after decriminalisation of homosexual...


In [11]:
# Renumber the rows 0..n-1 so that index labels equal row positions.
# reset_index(drop=True) discards the old labels in one step, replacing the
# in-place set_index('title') / reset_index() round trip; the resulting frame
# (columns title, overview with a fresh integer index) is the same.
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,title,overview
41366,Caged Heat 3000,It's the year 3000 AD. The world's most danger...
41367,Subdue,Rising and falling between a man and woman.
41368,Century of Birthing,An artist struggles to finish his work while a...
41369,Satan Triumphant,"In a small town live two brothers, one a minis..."
41370,Queerama,50 years after decriminalisation of homosexual...


- 모든 데이터로 하기에는 메모리 문제가 발생할 소지가 있음
- 20000 건의 데이터로 영화 추천시스템 만들기

In [12]:
# A dense n x n float64 similarity matrix needs n * n * 8 bytes (about 3.2 GB
# for n = 20000), so only the first 20000 movies are used.
# .copy() detaches the subset from the full frame so that a new column can be
# assigned to it without a SettingWithCopyWarning.
df = df.head(20000).copy()

##### 3. 텍스트 전처리

In [13]:
# Remove digits and punctuation: every character that is not an ASCII letter
# becomes a space, then leading/trailing whitespace is trimmed
df['clean_doc'] = (
    df['overview']
    .str.replace('[^A-Za-z]', ' ', regex=True)
    .str.strip()
)
df.head(3)

Unnamed: 0,title,overview,clean_doc
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Led by Woody Andy s toys live happily in his ...
1,Jumanji,When siblings Judy and Peter discover an encha...,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,A family wedding reignites the ancient feud be...


##### 4. Feature 변환

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from the raw overview text, ignoring scikit-learn's
# built-in English stop words
tvect = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and returns the document-term matrix
overview_tv = tvect.fit_transform(df.overview)
# (number of documents, vocabulary size)
overview_tv.shape

(20000, 47999)

In [15]:
# TF-IDF features built from the cleaned text.
# NOTE(review): this re-fits the same vectorizer, so the vocabulary learned
# from df.overview is replaced and tvect no longer corresponds to overview_tv.
clean_doc_tv = tvect.fit_transform(df.clean_doc)
clean_doc_tv.shape

(20000, 47120)

In [16]:
# Lookup table that maps a movie title to its row number, to make searching
# by title easy
indices = pd.Series(data=df.index, index=df['title'])
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

##### 5. 코사인 유사도

In [17]:
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity.
%time cosine_sim_overview = linear_kernel(overview_tv, overview_tv)

CPU times: total: 3.69 s
Wall time: 4.05 s


In [18]:
# One row and one column per movie
cosine_sim_overview.shape

(20000, 20000)

In [19]:
# Same pairwise similarity, computed from the cleaned-text features
%time cosine_sim_clean_doc = linear_kernel(clean_doc_tv, clean_doc_tv)

CPU times: total: 3.7 s
Wall time: 3.74 s


- 영화 Jumanji와 비슷한 줄거리를 가진 영화

In [20]:
# Row number of the query movie, looked up by its title
index = indices.loc['Jumanji']
index

1

In [21]:
# Similarity of the query movie to every movie; the Series index is the row
# number of the compared movie
sim_scores = pd.Series(cosine_sim_overview[index])
sim_scores.head(10)

0    0.016027
1    1.000000
2    0.049391
3    0.000000
4    0.000000
5    0.051762
6    0.000000
7    0.000000
8    0.105602
9    0.000000
dtype: float64

In [22]:

# Highest scores first; the top entry here is the query movie itself
sim_scores.sort_values(ascending=False).head(10)



1        1.000000
6057     0.179827
8566     0.178701
16430    0.174194
9230     0.171873
16089    0.159171
13083    0.158564
7891     0.158249
5948     0.155780
18716    0.153226
dtype: float64

In [23]:
# Top 10 scores excluding the query movie itself. The query row is removed by
# its label instead of skipping the first sorted entry, because the query is
# not guaranteed to sort first (another movie can tie at 1.0, or the query's
# own score can be 0 when its TF-IDF vector is empty).
sim_scores.drop(index).sort_values(ascending=False).head(10)

6057     0.179827
8566     0.178701
16430    0.174194
9230     0.171873
16089    0.159171
13083    0.158564
7891     0.158249
5948     0.155780
18716    0.153226
2447     0.150296
dtype: float64

In [24]:
# Row numbers of the 10 most similar movies. The query movie is dropped by
# label rather than by assuming it is the first entry after sorting, which
# fails when another movie ties with it or its own score is 0.
movie_indices = sim_scores.drop(index).sort_values(ascending=False).head(10).index
movie_indices

Index([6057, 8566, 16430, 9230, 16089, 13083, 7891, 5948, 18716, 2447], dtype='int64')

In [25]:
# movie_indices holds row numbers of the similarity matrix, so the titles are
# looked up by position
df.title.iloc[movie_indices]

6057                       Brainscan
8566                         Quintet
16430                 The Dark Angel
9230                       Word Wars
16089                         DeVour
13083    The Mindscape of Alan Moore
7891                         Masques
5948                Poolhall Junkies
18716                 Wreck-It Ralph
2447                        eXistenZ
Name: title, dtype: object

In [26]:
def get_recommendation(title, cos_sim, top_n=10):
    """Return the titles of the movies whose plots are most similar to `title`.

    Relies on two notebook-level objects: `indices` (title -> row number) and
    `df` (the movie table whose row order matches `cos_sim`).

    Parameters
    ----------
    title : str
        Title of the query movie; must be a key of `indices`.
    cos_sim : 2-D array-like
        Pairwise similarity matrix; row i holds the scores of movie i against
        every movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the `top_n` most similar movies, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    index = indices[title]
    sim_scores = pd.Series(cos_sim[index])
    # Remove the query movie by its label. Skipping the first sorted entry is
    # not reliable: another movie can tie with it at 1.0, and a movie whose
    # TF-IDF vector is empty scores 0 against itself.
    ranked = sim_scores.drop(index).sort_values(ascending=False)
    movie_indices = ranked.head(top_n).index
    # The labels of sim_scores are row numbers, so look the titles up by position
    return df.title.iloc[movie_indices]

In [27]:
# Recommendations based on the raw overview text
get_recommendation('Toy Story', cosine_sim_overview)

14706               Toy Story 3
2945                Toy Story 2
9984     The 40 Year Old Virgin
1056      Rebel Without a Cause
11016    For Your Consideration
1910                  Condorman
3004            Man on the Moon
483                      Malice
11209              Factory Girl
16400                 Group Sex
Name: title, dtype: object

In [28]:
# Recommendations based on the cleaned text (clean_doc)
get_recommendation('Toy Story', cosine_sim_clean_doc)

14706               Toy Story 3
2945                Toy Story 2
9984     The 40 Year Old Virgin
1056      Rebel Without a Cause
11016    For Your Consideration
1910                  Condorman
3004            Man on the Moon
483                      Malice
11209              Factory Girl
16400                 Group Sex
Name: title, dtype: object