In [1]:
import pandas as pd
import numpy as np
import warnings
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

# [3월 23일]
---

## # 콘텐츠 기반 추천
---

### # 데이터 전처리
---

In [2]:
# Load the TMDB 5000 movies metadata, then report its dimensions and column names.
movies = pd.read_csv('../data/tmdb_5000_movies.csv')
print(movies.shape, movies.columns, sep='\n')
movies.head(n=2)

(4803, 20)
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
# Peek at one raw genres value (row label 1) before any parsing is applied.
movies.loc[1, 'genres']

'[{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]'

In [5]:
# The genres and keywords columns are stored as strings, so each value is parsed
# with literal_eval (imported in the first cell) into a Python list of dicts.

# .copy() makes movies_df an independent frame: the column assignments below then
# modify movies_df itself rather than a selection taken from `movies`.
movies_df = movies[['id','title','genres','vote_average','vote_count','popularity','keywords','overview']].copy()

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)
movies_df['genres'][1]

[{'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 28, 'name': 'Action'}]

In [6]:
# Reduce every list of dicts to a plain list holding only each dict's 'name' value.
movies_df['genres'] = movies_df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df['keywords'] = movies_df['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df.loc[1, 'genres']

['Adventure', 'Fantasy', 'Action']

### # 장르 콘텐츠 유사도 측정
---
- genres를 피처 벡터화 행렬로 변환한 후 데이터 세트를 코사인 유사도를 통해 비교

In [7]:
# CountVectorizer (imported in the first cell) works on strings, so each genre list
# is joined into a single space-separated string first.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))

# min_df = 1 keeps every term that occurs in at least one document, i.e. no term is
# filtered out. An integer min_df of 0 is outside the documented range (int >= 1 or
# float in [0.0, 1.0]) and is rejected by parameter validation in recent
# scikit-learn releases.
count_vect = CountVectorizer(min_df = 1, ngram_range = (1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])

print(genre_mat.shape)

(4803, 276)


In [8]:
# Cosine similarity of the genre count matrix against itself (cosine_similarity is
# imported in the first cell). The result is square: one row and one column per movie.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)

# Show only the first row to keep the output short.
print(genre_sim[:1])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]


In [9]:
# For each row, the column positions ordered from highest to lowest similarity:
# argsort sorts ascending along the last axis, then every row is reversed.
genre_sim_sorted_ind = np.argsort(genre_sim, axis=-1)[:, ::-1]
print(genre_sim_sorted_ind[:1])

[[   0 3494  813 ... 3038 3037 2401]]


### # 장르 콘텐츠 필터링을 이용한 영화 추천
---

In [10]:
# 장르 유사도에 따라 영화를 추천하는 사용자 함수를 생성

def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    """Return the rows of ``df`` ranked most similar to the movie titled ``title_name``.

    Parameters
    ----------
    df : DataFrame with a 'title' column.
    sorted_ind : 2-D integer array; the row selected for the title supplies the
        indices, and its first ``top_n`` entries are taken.
    title_name : title to look up by exact equality.
    top_n : number of leading entries taken from the selected row(s).

    Returns
    -------
    DataFrame of the rows of ``df`` at the selected positions (via ``iloc``).
    The looked-up movie itself is not filtered out.
    """
    matches = df.loc[df['title'] == title_name]

    # Index values of the matching rows are used directly to select rows of sorted_ind.
    row_labels = matches.index.values
    top_matches = sorted_ind[row_labels, :top_n]

    # Print the selected indices (still 2-D here), then flatten them for iloc.
    print(top_matches)

    return df.iloc[top_matches.ravel()]

In [11]:
# Use the helper to fetch the 20 entries ranked most similar to 'Gravity' by genre.

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'Gravity', top_n=20)

similar_movies.loc[:, ['title', 'vote_average', 'vote_count']]

[[ 545  239 1111 4693  454 1473 4032 4764  502 4003 3427 1275 2865 4787
  2743 1786 1422  632 4691  929]]


Unnamed: 0,title,vote_average,vote_count
545,Unbreakable,6.9,1946
239,Gravity,7.3,5751
1111,Victor Frankenstein,5.6,685
4693,H.,6.5,4
454,The Day the Earth Stood Still,5.2,1043
1473,The Astronaut's Wife,5.4,254
4032,Sleep Dealer,6.0,40
4764,Dawn of the Crescent Moon,2.0,1
502,The Invasion,5.7,359
4003,Timecrimes,7.0,308


#### # 평가 횟수를 반영하는 새로운 평가 방식
---

- 가중평점(weighted rating) = (v/(v + m)) * R + (m/(v + m)) * C
  - v : 개별 영화에 평점을 투표한 횟수 
  - m : 평점을 부여하기 위한 최소 투표 횟수
  - R : 개별 영화에 대한 평균 평점
  - C : 전체 영화에 대한 평균 평점

In [12]:
# C: mean of vote_average over all movies; m: 0.6 quantile of vote_count.
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(q=0.6)

print(f"C : {round(C, 3)} / m : {round(m, 3)}")

C : 6.092 / m : 370.2


In [16]:
# Exploratory look at lower vote-count quantiles. Kept under its own name so the
# scalar `m` that weighted_vote_average reads is not replaced by a Series when the
# notebook is run top to bottom.
vote_count_quantiles = movies_df['vote_count'].quantile([0.1, 0.2])
vote_count_quantiles

0.1    12.0
0.2    36.0
Name: vote_count, dtype: float64

In [278]:
# Helper that converts a movie's raw average rating into a weighted rating.

def weighted_vote_average(record):
    """Return the weighted rating for one row.

    Reads 'vote_count' (v) and 'vote_average' (R) from ``record`` and combines
    them with the module-level values ``m`` and ``C`` as
    (v / (v + m)) * R + (m / (v + m)) * C.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    total = votes + m

    return ((votes / total) * rating) + ((m / total) * C)

# Apply the helper row by row and store the result as a new column.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis = 'columns')
movies_df.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,keywords,overview,genres_literal,weighted_vote
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",7.2,11800,150.437577,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fiction,7.166301
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",6.9,4500,139.082615,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Adventure Fantasy Action,6.838594
2,206647,Spectre,"[Action, Adventure, Crime]",6.3,4466,107.376788,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,Action Adventure Crime,6.284091
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",7.6,9106,112.31295,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,Action Crime Drama Thriller,7.541095
4,49529,John Carter,"[Action, Adventure, Science Fiction]",6.1,2124,43.926995,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...",Action Adventure Science Fiction,6.098838


In [301]:
# 가중 평가를 반영한 장르 유사도에 따라 영화를 추천하는 사용자 함수를 생성

def new_find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    """Recommend movies similar to ``title_name``, ranked by weighted rating.

    A candidate pool of the ``2 * top_n`` leading entries of ``sorted_ind`` is
    taken for each row whose title equals ``title_name``. The reference row(s)
    are removed from the pool, and the ``top_n`` candidates with the highest
    'weighted_vote' are returned.

    Parameters
    ----------
    df : DataFrame with 'title' and 'weighted_vote' columns.
    sorted_ind : 2-D integer array indexed by row position of ``df``; each row
        is expected to list row positions of ``df`` in descending similarity.
    title_name : title to look up by exact equality.
    top_n : maximum number of rows to return.

    Returns
    -------
    DataFrame with at most ``top_n`` rows, sorted by 'weighted_vote' descending.
    Empty when no row matches ``title_name``.
    """
    # Row *positions* of the reference title. Positions (not index labels) are what
    # both sorted_ind and iloc expect, so this also works for a non-default index.
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Candidate pool: twice top_n leading entries per matching row, flattened to 1-D.
    similar_indexes = sorted_ind[title_index, :(top_n * 2)].reshape(-1)

    # Exclude the reference movie(s). np.isin handles any number of matching rows,
    # whereas an elementwise != against title_index only works for a single match.
    similar_indexes = similar_indexes[~np.isin(similar_indexes, title_index)]

    # When several rows share the title their pools can overlap; keep the first
    # occurrence of each candidate so no movie is returned twice.
    similar_indexes = pd.unique(similar_indexes)

    # Rank the pool by weighted rating and keep the best top_n.
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending = False)[:top_n]

In [297]:
# Index of the row(s) whose title is exactly 'Gravity'.
movies_df.loc[movies_df['title'] == 'Gravity'].index

Int64Index([239], dtype='int64')

In [308]:
# Genre-similar candidates for 'Gravity', re-ranked by weighted rating.
new_similar_movies = new_find_sim_movie(movies_df, genre_sim_sorted_ind, 'Gravity', top_n=20)
new_similar_movies.loc[:, ['title', 'vote_average', 'weighted_vote', 'vote_count']]

Unnamed: 0,title,vote_average,weighted_vote,vote_count
1725,Blade Runner,7.9,7.727475,3509
2743,The Butterfly Effect,7.3,7.116008,2060
1275,Sunshine,7.0,6.783483,1182
545,Unbreakable,6.9,6.770884,1946
2253,Equilibrium,6.9,6.744419,1552
3556,10 Cloverfield Lane,6.8,6.707675,2468
1535,Frequency,7.0,6.596157,462
4003,Timecrimes,7.0,6.504456,308
900,The Adjustment Bureau,6.5,6.42534,1652
3427,The Andromeda Strain,7.0,6.400075,190


In [177]:
# Slicing demo on a 3x3 grid of 0..8: a negative step walks each row from its last
# column backwards, stopping before position -4.
np.arange(9).reshape(3, 3)[:, :-4:-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])