In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import warnings
warnings.filterwarnings(action='ignore')



In [2]:
# Load the raw rating records (one row per user/movie rating, with a timestamp).
# low_memory=False turns off pandas' chunked parsing so column dtypes are inferred
# from the whole file rather than piecewise.
movie = pd.read_csv('ratings.csv', low_memory = False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [3]:
# Order the ratings chronologically by rating timestamp, oldest first
# (sort_values sorts ascending by default).
# reset_index() without drop keeps the pre-sort row label as an 'index' column.
movie_df = movie.sort_values('timestamp').reset_index()
movie_df.head()

Unnamed: 0,index,userId,movieId,rating,timestamp
0,52635,383,21,3.0,789652009
1,52641,383,47,5.0,789652009
2,52684,383,1079,3.0,789652009
3,56907,409,21,5.0,828212412
4,56909,409,25,4.0,828212412


In [4]:
# Load the movie metadata so that a title can be looked up for each movieId.
movie_info = pd.read_csv('movies_metadata.csv', low_memory = False)


In [5]:
# Inspect which columns the metadata table provides.
movie_info.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
# Align the join key: the metadata table names its identifier 'id', the ratings
# table names it 'movieId'.
# NOTE(review): this assumes both files use the same id space. If ratings.csv
# carries MovieLens ids and movies_metadata.csv carries TMDB ids, a link table is
# needed before merging -- confirm against the dataset documentation.
movie_info = movie_info.rename(columns = {'id': 'movieId'})

# Compare keys as strings on both sides so the merge does not fail or silently
# miss matches because of differing dtypes.
movie_df['movieId'] = movie_df['movieId'].astype(str)
movie_info['movieId'] = movie_info['movieId'].astype(str)

# Keep exactly one title per movieId so the left merge can never multiply rating
# rows if the metadata happens to contain repeated ids.
title_lookup = movie_info[['movieId', 'original_title']].drop_duplicates(subset='movieId')

# Drop a title column left over from a previous run of this cell; otherwise a
# re-run would produce original_title_x / original_title_y instead.
movie_df = pd.merge(
    movie_df.drop(columns=['original_title'], errors='ignore'),
    title_lookup,
    how = 'left',
    on = 'movieId',
    validate = 'many_to_one',
)

In [8]:
# Preview the merged frame; because the merge is a left join, ratings whose
# movieId has no metadata match show an empty original_title.
movie_df.head(3)

Unnamed: 0,index,userId,movieId,rating,timestamp,original_title
0,52635,383,21,3.0,789652009,The Endless Summer
1,52641,383,47,5.0,789652009,
2,52684,383,1079,3.0,789652009,


In [9]:
# Discard ratings for which no title was found, then renumber the rows from 0.
movie_df = movie_df.dropna(subset=['original_title']).reset_index(drop=True)

In [10]:
# For every user, collect the distinct titles they rated into a single array.
# The function list yields a one-column frame whose column is named 'unique'.
agg = movie_df.groupby(['userId'])['original_title'].agg(['unique'])
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [11]:
# Distinct titles present in the rating data (these become the Word2Vec vocabulary).
movie_df['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

# Word2Vec 적용

- 사용자가 시청한 영화 하나하나를 단어로 보고, 영화 간의 유사도 계산

In [12]:
# Build one "sentence" per user: the list of titles that user rated.
# Each entry is coerced to str because Word2Vec is trained on string tokens
# (non-string values such as ints would not be usable as words).
sentence = [
    [str(title) for title in user_titles]
    for user_titles in agg['unique'].values
]

In [13]:
# Train Word2Vec on the per-user title lists: every movie title is treated as a
# "word" and every user's rating history as a "sentence".
from gensim.models import Word2Vec

# sg=1 selects skip-gram; min_count=1 keeps titles that occur only once.
# A fixed seed makes the initial vectors repeatable between runs. With workers > 1
# thread scheduling can still introduce small run-to-run differences; use
# workers=1 (and a fixed PYTHONHASHSEED) if bit-for-bit reproducibility is needed.
embedding_model = Word2Vec(
    sentence,
    vector_size=20,
    window=5,
    min_count=1,
    workers=4,
    epochs=200,
    sg=1,
    seed=1234,
)

In [14]:
# The 10 titles whose learned vectors are most similar to the movie '300'.
embedding_model.wv.most_similar(positive=['300'], topn=10)

[('Le Professionnel', 0.7847306728363037),
 ('Strange Days', 0.7835767269134521),
 ('Fallout', 0.7811231017112732),
 ('The Cider House Rules', 0.7797590494155884),
 ('Terminator Salvation', 0.7793862819671631),
 ('Novecento', 0.7727855443954468),
 ('Rocky Balboa', 0.7706599235534668),
 ('Berlin: Die Sinfonie der Grosstadt', 0.7682653069496155),
 ('15 Minutes', 0.7602836489677429),
 ('Requiem', 0.7571473121643066)]

# Doc2Vec

In [15]:
from gensim.models import doc2vec

In [16]:
# Keep only movies that have BOTH a title and an overview.
# The two conditions are combined into one mask: filtering in two separate
# assignments that each start from movie_info would discard the first filter and
# let rows with a missing title through (the title is later used as a document tag).
has_title_and_overview = (
    movie_info['original_title'].notnull() & movie_info['overview'].notnull()
)
# reset_index() without drop keeps the original metadata row label as an 'index' column.
movie_info_2 = movie_info[has_title_and_overview].reset_index()

In [17]:
import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word corpus and the
# 'punkt' tokenizer data. Packages that are already installed are reported as
# up-to-date rather than downloaded again.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gunso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gunso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

# Normalise every overview into a lower-cased, space-separated string of
# alphanumeric tokens with English stop words removed.
overview = []
for words in tqdm(movie_info_2['overview']):
    # First pass: tokenize the raw text, then replace every run of
    # non-alphanumeric characters (punctuation, quotes, brackets) with one space.
    word_tokens = word_tokenize(words)
    # Named cleaned_text so the loop does not overwrite the `sentence` variable
    # that holds the Word2Vec training corpus earlier in the notebook.
    cleaned_text = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens)).strip()

    # Second pass: re-tokenize the cleaned text and lower-case each token BEFORE
    # the stop-word test. The stop-word list is lower-case, so testing the
    # original casing would let words such as "The" or "A" slip through.
    kept_tokens = [
        token
        for token in (raw.lower() for raw in word_tokenize(cleaned_text))
        if token not in stop_words
    ]
    overview.append(' '.join(kept_tokens))

  0%|          | 0/44512 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Attach the cleaned overview strings as a new column; `overview` must hold one
# entry per row of movie_info_2, in the same order, for the assignment to line up.
movie_info_2['pre_overview'] = overview

In [None]:
# Configure (but do not yet train) the Doc2Vec model; vocabulary building and
# training happen in separate steps.
doc_vectorizer = doc2vec.Doc2Vec(
    # Model shape: dm=0 selects the distributed-bag-of-words variant and
    # dbow_words=1 additionally trains word vectors alongside the document vectors.
    dm = 0,
    dbow_words = 1,
    vector_size = 100,
    window = 10,
    # Words seen fewer than 5 times are left out of the vocabulary.
    min_count = 5,
    # Output layer: hierarchical softmax and negative sampling are both switched on.
    hs = 1,
    negative = 10,
    # Learning rate: alpha == min_alpha, so the rate stays flat within a single
    # train() call unless train() is given explicit start/end values.
    alpha = 0.025,
    min_alpha = 0.025,
    # Execution: fixed seed, four worker threads.
    seed = 1234,
    workers = 4,
)

In [None]:
from collections import namedtuple

# Training table: one row per movie with its id, title and cleaned overview.
agg = movie_info_2[['movieId', 'original_title', 'pre_overview']]

# Minimal stand-in for a tagged document: `words` is the token list and `tags`
# the list of labels whose vectors are learned.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

# pre_overview is a single space-joined string, so it must be split back into
# tokens; passing the string itself would make the model treat each CHARACTER as
# a word. Each document is tagged with its title so it can be queried by title.
tagged_train_docs = [
    TaggedDocument(words=text.split(), tags=[title])
    for title, text in agg[['original_title', 'pre_overview']].values
]

In [None]:
# Scan the tagged documents once to build the vocabulary and the set of tags,
# then show the model's summary string.
doc_vectorizer.build_vocab(tagged_train_docs)
print(doc_vectorizer)

In [None]:
from time import time

start = time()

# Train in ONE train() call instead of looping and editing doc_vectorizer.alpha
# by hand. The manual loop mutated the model's learning rate on every pass, so
# re-running the cell kept lowering alpha (eventually below zero).
# The total number of passes is unchanged (5 x the model's epochs), and the
# explicit start/end alpha covers the same 0.025 -> 0.017 range the loop stepped
# through, so the decay does not depend on the model's stored alpha/min_alpha.
doc_vectorizer.train(
    tagged_train_docs,
    total_examples = doc_vectorizer.corpus_count,
    epochs = 5 * doc_vectorizer.epochs,
    start_alpha = 0.025,
    end_alpha = 0.017,
)

end = time()
print("During Time: {}".format(end-start))

In [None]:
# The 20 movies whose overview vectors are closest to the given title.
# `dv` is the gensim 4 name for the document vectors; `docvecs` is only a
# deprecated alias there.
doc_vectorizer.dv.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn = 20)