In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition.truncated_svd import TruncatedSVD
from sklearn.cluster import KMeans, MiniBatchKMeans
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [9]:
class movie_recommendation_cluster:
    def __init__(self, **kargs):
        self.topn = kargs.get('topn', 10)
        self.df = kargs.get('data', pd.read_csv('./data/merged.csv'))
        self.a, self.b, self.c = kargs.get('a',0.8), kargs.get('b',0.1), kargs.get('c',0.1)
        self.n_clusters = kargs.get('n_clusters',30)# kmeans
        self.n_components = kargs.get('n_components', 500)# svd
        self.vote_thres = kargs.get('vote_thres',100)# vote_count
        self.verbose = kargs.get('verbose', 1)
        self.re_cluster = kargs.get('re_cluster', 1)# kmeans
        self.batch_size = kargs.get('batch_size', 2000)
        self.max_iter = kargs.get('max_iter', 500)
        
        self.cvec = CountVectorizer(min_df=0, ngram_range=(1,2))
        self.stops = []
        with open('./data/total_stopwords', encoding='utf-8') as f:
            self.stops.append(f.readline()[:-2])
        
        if self.verbose == 1:
            print('-'*35)
            print('# Parameters')
            print('      a, b, c        : {0}, {1}, {2}'.format(self.a, self.b, self.c))
            print('vote count threshold :', self.vote_thres)
            print("n_components of SVD  :", self.n_components)
            print("n_clusters of KMeans :", self.n_clusters)
            print('batch_size of Kmeans :', self.batch_size)
            print('max_iter of Kmeans   :', self.max_iter)
            print('weighted_sum = dist_scaled*{0}(a) + genre_scaled*{1}(b) + wvote_scaled*{2}(c)'.format(self.a, self.b, self.c))
            print('-'*35)
    
    def search_title(self, title_name):
        return self.df[self.df['title'].str.contains(title_name)].title
    
    def genre_sim_sorted(self, title_idx):
        genre_literal = self.df['genre'].apply(lambda x: x.replace('|',' '))
        genre = self.cvec.fit_transform(genre_literal)
        genre_sim = cosine_similarity(genre,genre)
        
        return np.array([(idx,sim) for idx,sim in enumerate(genre_sim[title_idx])])
    
    def raw_to_tfidf(self, data_preprocess):
        tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,3),stop_words=self.stops,
                                     min_df=3, max_df=0.95, max_features=10000)
        return tfidf.fit_transform(data_preprocess)
    def tfidf_to_svd(self, data_tfidf):
        svd = TruncatedSVD(n_components=self.n_components, n_iter=10)
        return svd.fit_transform(data_tfidf)
    
    def similar_cluster_movies(self, title_idx):
        do_cluster, loop_cnt = True, 0
        
        # data preprocessing
        data_tfidf = self.raw_to_tfidf(list(map(str, self.df['plot_preprocessed_kkma'].values)))
        data_svd = self.tfidf_to_svd(data_tfidf)
        
        # K-means clustering
        print('Clustering...')
        while do_cluster:
            kmeans = MiniBatchKMeans(n_clusters=self.n_clusters, batch_size=self.batch_size,
                                     max_iter=self.max_iter, verbose=0 ,n_init=3)

            vote_over_thres_idx = self.df[self.df['vote_count'] > self.vote_thres].index
            data_svd_idx = np.array([(idx,val) for idx,val in zip(self.df.index,data_svd)])
            data_svd_to_km = [val for idx,val in data_svd_idx if idx in vote_over_thres_idx]
            data_svd_dict = dict([(idx,val) for idx,val in filter(lambda x: x[0] in vote_over_thres_idx, data_svd_idx)])
            
            # (optional)avoid biggest cluster
            km = kmeans.fit(data_svd_to_km)
            km_dict = dict([(df_idx,label_) for df_idx,label_ in zip(vote_over_thres_idx,km.labels_)])
            km_cluster = list(filter(lambda x: km_dict.get(x) == km_dict.get(title_idx), km_dict.keys()))

            clusters = [0]*self.n_clusters
            for label_ in km.labels_:
                clusters[label_] += 1

            clusters_idx = np.array(clusters).argsort()
            bad_clusters = clusters_idx[-3:]
            
            if self.re_cluster:            
                if km_dict.get(title_idx) not in bad_clusters:
                    do_cluster=False
                elif loop_cnt >= 20:
                    print('Loop count exceeded')
                    do_cluster=False
                else:
                    del kmeans
                    loop_cnt += 1
                    print('Re-clustering...(%d)'%(loop_cnt))
                    
            else:
                do_cluster = False

        if self.verbose == 1:
            print('-'*35)
            print('# K-means clustering distribution')
            for i,size in enumerate(clusters):
                postfix = '<==' if i == km_dict.get(title_idx) else ''
                print('cluster #%3d : %4d items %s'%(i,size,postfix))
            print('-'*35)

        closest = []
        for i in km_cluster:
            if i != title_idx:
                closest.append((i,euclidean(data_svd_dict.get(title_idx), data_svd_dict.get(i))))

        return np.array(closest), self.df.loc[np.array(sorted(closest, key=lambda x: x[1]))[:,0]]

    def result_by_weights(self, dataf):
        dataf['weighted_sum'] = dataf['dist_scaled']*self.a + dataf['genre_scaled']*self.b + dataf['wvote_scaled']*self.c
        
        return dataf.sort_values('weighted_sum', ascending=False)

            
    def getMovies(self, title):
        # no title result
        try: title_idx = self.df[self.df['title']== title].index.values[0]
        except:
            raise ValueError('There is no such title name. Search with "search_title" function')
        
        # get movies in same cluster
        dist, result = self.similar_cluster_movies(title_idx)
        
        # merge with distance
        Result = result.rename(columns={'key_0': 'key_00'})
        result = pd.merge(Result, pd.Series(dist[:,1], name='dist'), left_on=result.index, right_on=dist[:,0])
        result.rename(columns={'key_0':'idx'}, inplace=True)
        
        # IMDB's weighted_vote
        def weighted_vote_average(record):
            v, r = record['vote_count'], record['rating']
            return (v/(v+m))*r + (m/(m+v))*c
        c = result['rating'].mean()
        m = result['vote_count'].quantile(.6)
        result['weighted_vote'] = result.apply(weighted_vote_average,axis=1)
        
        # merge with genre
        genre_sim = self.genre_sim_sorted(title_idx)
        Result = result.rename(columns={'key_0': 'key_00'})
        result_with_genre = pd.merge(Result, pd.Series(genre_sim[:,1], name='genre_sim'), left_on=Result.index, right_on=genre_sim[:,0],)
        
        # minmax scale
        result_with_genre['wvote_scaled'] = MinMaxScaler().fit_transform(result_with_genre['weighted_vote'].values.reshape(-1,1))
        result_with_genre['genre_scaled'] = MinMaxScaler().fit_transform(result_with_genre['genre_sim'].values.reshape(-1,1))
        result_with_genre['dist_scaled'] = MinMaxScaler().fit_transform(result_with_genre['dist'].max() - result_with_genre['dist'].values.reshape(-1,1))
        
        # (optional)remove data with 0 genre score
        no_genre_score_idx = result_with_genre[result_with_genre['genre_sim'] == 0].index
        result_with_genre.drop(no_genre_score_idx, inplace=True)
        
        result_with_genre = self.result_by_weights(result_with_genre)
        return result_with_genre.head(self.topn)

In [10]:
recom = movie_recommendation_cluster(re_cluster=False)

-----------------------------------
# Parameters
      a, b, c        : 0.8, 0.1, 0.1
vote count threshold : 100
n_components of SVD  : 500
n_clusters of KMeans : 30
batch_size of Kmeans : 2000
max_iter of Kmeans   : 500
weighted_sum = dist_scaled*0.8(a) + genre_scaled*0.1(b) + wvote_scaled*0.1(c)
-----------------------------------


In [11]:
result = recom.getMovies(title='아이언맨 2')

Clustering...
-----------------------------------
# K-means clustering distribution
cluster #  0 :   38 items 
cluster #  1 :  332 items 
cluster #  2 : 1707 items <==
cluster #  3 :  492 items 
cluster #  4 :  246 items 
cluster #  5 :   43 items 
cluster #  6 :   17 items 
cluster #  7 :   27 items 
cluster #  8 :   31 items 
cluster #  9 :   25 items 
cluster # 10 :   26 items 
cluster # 11 :  327 items 
cluster # 12 :   36 items 
cluster # 13 :   53 items 
cluster # 14 :   12 items 
cluster # 15 :  165 items 
cluster # 16 :  465 items 
cluster # 17 :   35 items 
cluster # 18 :   15 items 
cluster # 19 :   54 items 
cluster # 20 :   27 items 
cluster # 21 :  104 items 
cluster # 22 :  294 items 
cluster # 23 :  904 items 
cluster # 24 :  176 items 
cluster # 25 :   88 items 
cluster # 26 :   18 items 
cluster # 27 :  693 items 
cluster # 28 :  389 items 
cluster # 29 :   18 items 
-----------------------------------


In [12]:
result['title']

7                 헬릭스
22                가필드
23          슬랩스틱 브라더스
29          신과함께-인과 연
11                막걸스
18               노랑머리
35                  창
72           뮨: 달의 요정
45    밤쉘: 세상을 바꾼 폭탄선언
31           장군의 아들 3
Name: title, dtype: object

In [13]:
recom.getMovies(title='어벤져스')

Clustering...
-----------------------------------
# K-means clustering distribution
cluster #  0 :   88 items 
cluster #  1 :  247 items 
cluster #  2 :   94 items 
cluster #  3 :   80 items 
cluster #  4 :   14 items 
cluster #  5 :  156 items 
cluster #  6 :  143 items 
cluster #  7 :  254 items 
cluster #  8 :  212 items 
cluster #  9 :  180 items 
cluster # 10 :  157 items 
cluster # 11 :   22 items 
cluster # 12 :  962 items 
cluster # 13 :   27 items 
cluster # 14 :   56 items 
cluster # 15 : 1385 items <==
cluster # 16 :  174 items 
cluster # 17 :  187 items 
cluster # 18 :   10 items 
cluster # 19 :    6 items 
cluster # 20 :   10 items 
cluster # 21 :  112 items 
cluster # 22 :   37 items 
cluster # 23 :   44 items 
cluster # 24 :   14 items 
cluster # 25 :   95 items 
cluster # 26 : 2015 items 
cluster # 27 :   18 items 
cluster # 28 :   45 items 
cluster # 29 :   13 items 
-----------------------------------


Unnamed: 0.1,key_0,idx,key_00,Unnamed: 0,title,genre,year,date,rating,vote_count,...,img_url,keywords,plot_preprocessed_kkma,dist,weighted_vote,genre_sim,wvote_scaled,genre_scaled,dist_scaled,weighted_sum
7,7,8931.0,8931,8933,킥 애스: 영웅의 탄생,액션|드라마,2010,4.22,8.47,3554,...,https://movie-phinf.pstatic.net/20111223_23/13...,"['데이브', '수호', '마약', '디', '세상', '영웅', '필요', '민'...",지금 세상 영웅 필요 슈퍼 히 어로 되 하 의문 가지 데 이브 정의 수호 위하 닉네...,0.693984,8.252088,0.258199,0.834654,0.333333,0.727595,0.698875
11,11,8086.0,8086,8087,메가마인드,애니메이션|코미디|가족|SF,2011,1.13,8.77,2737,...,https://movie-phinf.pstatic.net/20111223_266/1...,"['메트로', '맨', '자신', '얼떨결', '승리', '기쁨', '유일한', '...",메트로 시티 막강 라이벌 메트로 맨 메가 마인드 도시 영웅 메트로 맨 슈퍼 악당 메...,0.722913,8.418024,0.258199,0.860638,0.333333,0.67843,0.662141
22,22,12506.0,12506,12510,한니발,범죄|스릴러|공포,2001,4.28,7.66,739,...,https://movie-phinf.pstatic.net/20130422_106/1...,"['한니발', '마약', '납치', '메이슨', '싸이코', '딸', '의원', '...",전 요원 클라 리스 스탈 링 싸이코 살인 마인 한니발 렉 터 박사 도움 받 납치 상...,0.75193,7.606509,0.447214,0.733561,0.57735,0.629116,0.634384
23,23,12139.0,12139,12143,오스틴 파워: 골드 멤버,코미디|액션|모험|SF,2002,11.15,7.47,146,...,https://movie-phinf.pstatic.net/20111222_138/1...,"['이블', '닥터', '영입', '파워', '요원', '멤버', '지구', '골드...",스파이 업계 최고 비밀 요원 오스틴 파워 세계 정복 야심 헛물 키 오 닥터 이 블 ...,0.752274,7.560215,0.447214,0.726312,0.57735,0.628532,0.633192
18,18,11334.0,11334,11338,불멸의 이순신,드라마|액션|전쟁,2004,9.04,9.77,454,...,https://movie-phinf.pstatic.net/20111222_205/1...,"['이순신', '유정', '제독', '명나라', '매수', '장', '장수', '피...",전란 막바지 삼도 수군 통제사 이순신 전쟁 종지부 찍 최후 결전 앞두 되 순천 예교...,0.746305,8.198505,0.258199,0.826263,0.333333,0.638676,0.6269
29,29,10049.0,10049,10052,묵공,전쟁|드라마,2007,1.1,7.32,1835,...,https://movie-phinf.pstatic.net/20111222_41/13...,"['양성', '공격', '혁', '리', '눈앞', '양성', '방어', '무고',...",피 혼돈 춘추 전국 시대 천하 통일 눈앞 두 조나라 대륙 대군 마지막 길목 양성 함...,0.758227,7.416399,0.447214,0.703792,0.57735,0.618415,0.622846
89,89,2987.0,2987,2987,저수지 게임,스릴러|다큐멘터리,2017,9.07,9.14,2848,...,https://movie-phinf.pstatic.net/20170912_208/1...,"['제보', '집념', '위험', '감수', '파도', '아무도', '말', '꼬리...",탐사 보도 전문 주 진우 알 말 분의 찾 해외 넘나들 돈 관련 연결고리 실체 추적 ...,0.788606,8.692163,0.447214,0.903566,0.57735,0.566786,0.60152
31,31,13321.0,13321,13325,화성 침공,SF|코미디,1997,4.05,7.88,380,...,https://movie-phinf.pstatic.net/20111222_211/1...,"['화성인', '데일', '제임스', '지구', '침공', '백악관', '대통령',...",불분명 날 화성인 지구 출현 세계 평화 자유 진영 지도자 자처 미국 대통령 제임스 ...,0.759736,7.64892,0.2,0.740203,0.258199,0.615851,0.592521
72,72,1570.0,1570,1570,다키스트 아워,드라마|전쟁,2018,1.17,9.12,1044,...,https://movie-phinf.pstatic.net/20180110_280/1...,"['승리', '작전', '시작', '굴복', '생존']",우리 굴복 승리 없 생존 작전 시작 다 키스 트\n,0.782066,8.312408,0.338062,0.844099,0.436436,0.5779,0.590374
60,60,11081.0,11081,11085,찰리와 초콜릿 공장,판타지|모험|코미디|가족,2005,9.16,8.79,5084,...,https://movie-phinf.pstatic.net/20111222_196/1...,"['초콜릿', '공장', '자신', '욕심', '목적', '달성', '집념', '설...",전 세계 사랑 받 세계 최고 초콜릿 공장 윌 카 초콜릿 공장 양의 초콜릿 생산 세계...,0.77775,8.567125,0.258199,0.883986,0.333333,0.585235,0.58992
