<img src='logo.png'>

<font size=6><b>kaggle The Movies Dataset</b></font>
* Metadata on over 45,000 movies. 26 million ratings from over 270,000 users.
* ref : https://www.kaggle.com/rounakbanik/the-movies-dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel 
from ast import literal_eval

import warnings 
warnings.simplefilter('ignore')

from IPython.core.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)


# Data Load
<pre>
genres : 영화 장르
keywords : 영화의 키워드
original_language : 영화 언어
title : 제목
vote_average : 평점 평균
vote_count : 평점 카운트
popularity : 인기도
overview : 개요 설명
</pre>

In [2]:
mdf = pd.read_csv("./dataset/movies_metadata_2.csv")

In [3]:
print(mdf.shape)
print(mdf.info())
mdf.head(2)

(45466, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  obj

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,en,Jumanji,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995


## id 중 오데이터

In [9]:
idx = mdf[mdf['id'].str.len()>6].index
mdf = mdf.drop(idx, axis=0)

In [30]:
mdf['id'] = mdf['id'].astype('int')

# Weighted Rarking  (Top %)
* vote, average

* url : https://www.quora.com/How-does-IMDbs-rating-system-work<br>
<img src = 'imdb_score.png' width=600>

## WR 구하기

* WR = (v / (v+m)) * R + (m/ (v+m)) *C
* R : 영화의 평점
* v : 영화의 투표수
* m : 상위 %안에 들어야 하는 최소 투표수
* C : 개별 영화의 평점

In [31]:
C = mdf['vote_average'].mean()
C

5.618207215134184

In [32]:
m = mdf['vote_count'].quantile(0.95)
m

434.0

In [33]:
def my_calc_wr(mdf):
    R = mdf['vote_average']
    v = mdf['vote_count']
    WR = (v / (v+m)) * R + (m/ (v+m)) *C
    return WR

In [34]:
mdf['wr'] = mdf.apply(my_calc_wr, axis=1)

In [35]:
mdf.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,wr
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,7.545529


## 상위 % 데이터만 가져오기

In [39]:
df5 = mdf[mdf['vote_count'] > 434.0][[ 'id','title', 'genres', 'vote_average', 'vote_count', 'year', 'wr' ]]
df5.shape

(2268, 7)

In [40]:
df5.head(2)

Unnamed: 0,id,title,genres,vote_average,vote_count,year,wr
0,862,Toy Story,"['Animation', 'Comedy', 'Family']",7.7,5415.0,1995,7.545529
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']",6.9,2413.0,1995,6.704602


In [41]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2268 entries, 0 to 45014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            2268 non-null   int32  
 1   title         2268 non-null   object 
 2   genres        2268 non-null   object 
 3   vote_average  2268 non-null   float64
 4   vote_count    2268 non-null   float64
 5   year          2268 non-null   object 
 6   wr            2268 non-null   float64
dtypes: float64(3), int32(1), object(3)
memory usage: 132.9+ KB


## 장르 검색

In [45]:
df5['genres'].head()

0            ['Animation', 'Comedy', 'Family']
1           ['Adventure', 'Fantasy', 'Family']
5     ['Action', 'Crime', 'Drama', 'Thriller']
9          ['Adventure', 'Action', 'Thriller']
15                          ['Drama', 'Crime']
Name: genres, dtype: object

In [44]:
df5.loc[0, 'genres'],  type(df5.loc[0, 'genres'])

("['Animation', 'Comedy', 'Family']", str)

In [47]:
'c' in 'AAA'

False

In [50]:
df5[df5['genres'].str.contains('Family')].head()

Unnamed: 0,id,title,genres,vote_average,vote_count,year,wr
0,862,Toy Story,"['Animation', 'Comedy', 'Family']",7.7,5415.0,1995,7.545529
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']",6.9,2413.0,1995,6.704602
33,9598,Babe,"['Fantasy', 'Drama', 'Comedy', 'Family']",6.0,756.0,1995,5.860758
47,10530,Pocahontas,"['Adventure', 'Animation', 'Drama', 'Family']",6.7,1509.0,1995,6.458364
155,8839,Casper,"['Fantasy', 'Comedy', 'Family']",6.0,1045.0,1995,5.887966


## 최종 코드

In [63]:
def my_calc_wr(mdf):
    R = mdf['vote_average']
    v = mdf['vote_count']
    WR = (v / (v+m)) * R + (m/ (v+m)) *C
    return WR

mdf = pd.read_csv("./dataset/movies_metadata_2.csv")
idx = mdf[mdf['id'].str.len()>6].index
mdf = mdf.drop(idx, axis=0)
mdf['id'] = mdf['id'].astype('int')

In [70]:
def my_search_wr_by_genres(search_genres ='Family', percnet=0.95):
    C = mdf['vote_average'].mean()
    m = mdf['vote_count'].quantile(percnet)
    mdf['wr'] = mdf.apply(my_calc_wr, axis=1)

    df5 = mdf[mdf['vote_count'] > m][[ 'id','title', 'genres', 'vote_average', 'vote_count', 'year', 'wr' ]]
    df5 = df5.sort_values('wr', ascending=False)
    return df5[df5['genres'].str.contains(search_genres)]

In [71]:
resdf = my_search_wr_by_genres('Fantasy', 0.97)
resdf.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,year,wr
5481,129,Spirited Away,"['Fantasy', 'Adventure', 'Animation', 'Family']",8.3,3968.0,2001,8.035598
7000,122,The Lord of the Rings: The Return of the King,"['Adventure', 'Fantasy', 'Action']",8.1,8226.0,2003,7.975624
3030,497,The Green Mile,"['Fantasy', 'Drama', 'Crime']",8.2,4166.0,1999,7.956413
4863,120,The Lord of the Rings: The Fellowship of the Ring,"['Adventure', 'Fantasy', 'Action']",8.0,8892.0,2001,7.88916
5814,121,The Lord of the Rings: The Two Towers,"['Adventure', 'Fantasy', 'Action']",8.0,7641.0,2002,7.871988
