## Import Package

In [1]:
import os
import os.path as path
import re
import math
import json

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Setting

In [5]:
# Show up to 100 rows / 100 columns when a DataFrame is displayed.
# Use the full option names: abbreviated keys such as 'display.max_row' only
# work through pandas' pattern matching and can become ambiguous later.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

데이터는 Kaggle의 [The Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset)을 사용한다.

In [6]:
# Base directories: the dataset folder and the output folder beneath it
DIR_PATH = path.join('.', 'kaggle-the-movies-dataset')
DIR_SAVE_PATH = path.join(DIR_PATH, 'output')

print(DIR_PATH, DIR_SAVE_PATH, sep='\n')

.\kaggle-the-movies-dataset
.\kaggle-the-movies-dataset\output


In [7]:
# Path to the movie metadata CSV inside the dataset directory
path_movie = path.join(DIR_PATH, 'movies_metadata.csv')
print(path_movie)

.\kaggle-the-movies-dataset\movies_metadata.csv


In [8]:
# Load the CSV into a DataFrame and preview the first three rows
data = pd.read_csv(path_movie, low_memory=False)
data.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [9]:
# (rows, columns) of the loaded frame
data.shape

(45466, 24)

In [10]:
# Column labels available in the loaded frame
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [11]:
# Keep only the listed columns and preview the result
data = data[[
    'id', 'imdb_id', 'genres', 'original_language', 'original_title',
    'overview', 'popularity', 'title', 'vote_average', 'vote_count',
]]
data.head(3)

Unnamed: 0,id,imdb_id,genres,original_language,original_title,overview,popularity,title,vote_average,vote_count
0,862,tt0114709,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Toy Story,7.7,5415.0
1,8844,tt0113497,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,Jumanji,6.9,2413.0
2,15602,tt0113228,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,Grumpier Old Men,6.5,92.0


### 데이터 정규화

vote_average의 값을 어느정도 수정해줘야 한다.

vote_count가 적을수록 vote_average의 정확도가 떨어지는 **불공정**함이 발생한다.

이를 처리하는 방법론 중 하나는 다음과 같다. [관련 링크](https://www.quora.com/How-does-IMDbs-rating-system-work)

> **weighted rating(WR) = (v / (v + m)) * R + (m / (v + m)) * C**
>  
> R : 개별 영화 평점  
> v : 개별 영화에 평점을 투표한 횟수  
> m : 특정 순위 안에 들어야 하는 최소 투표 수 (custom)  
> C : 전체 영화에 대한 평균 평점  


In [12]:
# Find m.
# quantile() returns the value at a given quantile (0.5 is the median);
# the quantile is chosen so that about the top 1000 rows by vote_count remain.
tmp_per = 1 - round(1000 / len(data), 3)
tmp_m = data['vote_count'].quantile(tmp_per)
print(tmp_per)
print(tmp_m)

0.978
1114.0


In [13]:
# Keep rows whose vote_count reaches the threshold; filter first, then copy
# only the selected rows into an independent frame.
tmp_data = data.loc[data['vote_count'] >= tmp_m].copy()
tmp_data.shape

(1002, 10)

In [14]:
# Continue with the filtered subset; `data` now refers to the same object as `tmp_data`
data = tmp_data
data.head()

Unnamed: 0,id,imdb_id,genres,original_language,original_title,overview,popularity,title,vote_average,vote_count
0,862,tt0114709,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Toy Story,7.7,5415.0
1,8844,tt0113497,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,Jumanji,6.9,2413.0
5,949,tt0113277,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,Heat,7.7,1886.0
9,710,tt0113189,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,GoldenEye,6.6,1194.0
15,524,tt0112641,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",en,Casino,The life of the gambling paradise – Las Vegas ...,10.137389,Casino,7.8,1343.0


In [15]:
# m: the vote-count threshold found above; C: mean vote_average rounded to 3 decimals.
# NOTE(review): `data` was already replaced by the vote_count >= tmp_m subset, so C is
# the mean over that subset rather than over every movie in the file — confirm intended.
m = tmp_m
C = round(data['vote_average'].mean(), 3)
print('m: {}'.format(m))
print('C: {}'.format(C))

m: 1114.0
C: 6.794


In [16]:
# Rating weighted by the number of votes
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating WR = (v / (v + m)) * R + (m / (v + m)) * C.

    Parameters
    ----------
    x : object indexable by 'vote_count' (v) and 'vote_average' (R),
        e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.
    m : weight given to ``C``; defaults to the module-level ``m`` as it was
        when this function was defined.
    C : rating blended in with weight ``m``; defaults to the module-level
        ``C`` as it was when this function was defined.

    Returns
    -------
    The blend of ``R`` and ``C`` computed with the formula above.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m  # shared denominator of both weights

    return (votes / total) * rating + (m / total) * C

In [17]:
# weighted_rating only indexes 'vote_count' / 'vote_average' and does arithmetic
# on them, so it can be called once on the whole frame (vectorised) instead of
# once per row through DataFrame.apply(axis=1); the resulting values are the same.
data['weighted_rating'] = weighted_rating(data)
data.head()

Unnamed: 0,id,imdb_id,genres,original_language,original_title,overview,popularity,title,vote_average,vote_count,weighted_rating
0,862,tt0114709,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Toy Story,7.7,5415.0,7.545415
1,8844,tt0113497,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,Jumanji,6.9,2413.0,6.86652
5,949,tt0113277,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,Heat,7.7,1886.0,7.363572
9,710,tt0113189,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,GoldenEye,6.6,1194.0,6.693638
15,524,tt0112641,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",en,Casino,The life of the gambling paradise – Las Vegas ...,10.137389,Casino,7.8,1343.0,7.343881


In [18]:
# (rows, columns) after adding the weighted_rating column
data.shape

(1002, 11)

### 장르 정제

장르의 구조를 확인하면 list 내부에 dict가 들어 있는 형태의 문자열로 구성되어 있는 것을 확인할 수 있다.

이런 경우를 해결하기 위해서 ast의 literal_eval을 사용

> **ast란?**
> 
> 파이썬 추상 구문 트리(Abstract Syntax Tree)를 처리하는 데 도움을 주는 모듈.  
> => `literal_eval`은 문자열로 표현된 파이썬 리터럴(list, dict 등)을 파이썬 객체로 바꿔준다.


In [19]:
# Turn each genres string into Python objects with literal_eval.
# NOTE: this overwrites the column, so the cell is meant to run once per load.
data['genres'] = data['genres'].map(literal_eval)
data['genres'].head(3)

0    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
5    [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
Name: genres, dtype: object

In [20]:
# Drop the genre ids: keep only each entry's 'name', joined with ", "
data['genres'] = data['genres'].apply(
    lambda genre_dicts: ", ".join(genre['name'] for genre in genre_dicts)
)
data.head(3)

Unnamed: 0,id,imdb_id,genres,original_language,original_title,overview,popularity,title,vote_average,vote_count,weighted_rating
0,862,tt0114709,"Animation, Comedy, Family",en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Toy Story,7.7,5415.0,7.545415
1,8844,tt0113497,"Adventure, Fantasy, Family",en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,Jumanji,6.9,2413.0,6.86652
5,949,tt0113277,"Action, Crime, Drama, Thriller",en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,Heat,7.7,1886.0,7.363572


In [21]:
# Path where the preprocessed data is saved
path_save_movie = path.join(DIR_SAVE_PATH, 'pre_movies_metadata.csv')
print(path_save_movie)

.\kaggle-the-movies-dataset\output\pre_movies_metadata.csv


In [22]:
# Save the file: create the output directory if it is missing, then write the
# frame as CSV without the index column
os.makedirs(DIR_SAVE_PATH, exist_ok=True)
data.to_csv(path_save_movie, index = False)