In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load


import os,json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from gensim.models.keyedvectors import KeyedVectors

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
class DocSim:
    """Score document similarity from averaged word-embedding vectors.

    A document is represented by the mean of the vectors of its tokens that
    are not stop words and that exist in the embedding model. Two documents
    are compared with cosine similarity.
    """

    def __init__(self, w2v_model, stopwords=None):
        """
        :param w2v_model: embedding lookup; ``w2v_model[word]`` must return the
            word's vector and raise ``KeyError`` for unknown words.
        :param stopwords: optional collection of lower-case words to ignore;
            defaults to no stop words.
        """
        self.w2v_model = w2v_model
        self.stopwords = stopwords if stopwords is not None else []

    def vectorize(self, doc: str) -> np.ndarray:
        """
        Build the document vector as the mean of its word vectors.

        :param doc: raw document text; it is lower-cased and split on whitespace.
        :return: the mean word vector, or a scalar NaN when no token of the
            document is found in the model (``_cosine_sim`` maps that to 0).
        """
        doc = doc.lower()
        # A set gives O(1) membership tests. It is rebuilt per call so that
        # later changes to self.stopwords are still honoured.
        stop_set = set(self.stopwords)
        # split() without an argument splits on any run of whitespace, so
        # tabs, newlines and repeated spaces do not produce glued/empty tokens.
        words = [w for w in doc.split() if w not in stop_set]
        word_vecs = []
        for word in words:
            try:
                word_vecs.append(self.w2v_model[word])
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass

        if not word_vecs:
            # Same value np.mean([]) would produce, without its RuntimeWarning.
            return np.float64(np.nan)

        # Assuming that document vector is the mean of all the word vectors
        # PS: There are other & better ways to do it.
        return np.mean(word_vecs, axis=0)

    def _cosine_sim(self, vecA, vecB):
        """Find the cosine similarity between two vectors.

        Returns 0 when the similarity is undefined (a zero-length vector or a
        NaN vector coming from a document with no known words).
        """
        denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
        # Check the denominator first so no 0/0 or NaN division is attempted.
        if np.isnan(denom) or denom == 0:
            return 0
        csim = np.dot(vecA, vecB) / denom
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, source_doc, target_docs=None, threshold=0):
        """Calculates & returns similarity scores between given source document & all
        the target documents.

        :param source_doc: text of the reference document.
        :param target_docs: a single document string or a list of them.
        :param threshold: only scores strictly greater than this are kept.
        :return: list of ``{"score": ..., "doc": ...}`` dicts sorted by score in
            descending order; empty when ``target_docs`` is empty or ``None``.
        """
        if not target_docs:
            return []

        if isinstance(target_docs, str):
            target_docs = [target_docs]

        source_vec = self.vectorize(source_doc)
        results = []
        for doc in target_docs:
            sim_score = self._cosine_sim(source_vec, self.vectorize(doc))
            if sim_score > threshold:
                results.append({"score": sim_score, "doc": doc})

        # Sort once, after all scores are collected. The sort is stable, so
        # ties keep the order in which the documents were given.
        results.sort(key=lambda k: k["score"], reverse=True)
        return results

In [None]:
# Filled on first use so the NLTK stop-word list is read and the lemmatizer is
# constructed once, instead of on every call (the function is applied per row).
_TEXT_PREP_CACHE = {}


def textPreprocessing(text):
    """Normalize a free-text string into a space-joined sequence of lemmas.

    Steps: lower-case, delete punctuation, replace every remaining
    non-English-letter character with a space, tokenize, drop English stop
    words and one-character tokens, then lemmatize with WordNet.

    Note that punctuation is deleted rather than replaced by a space, so
    tokens joined only by punctuation (e.g. "well-known") are merged.

    :param text: the raw text; ``None`` or NaN (a missing value) is accepted.
    :return: the processed text, or ``''`` for a missing value.
    """
    # Missing values (None, or the float NaN pandas uses for empty cells)
    # carry no text; NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lower-case
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Replace every character that is not an English letter with a space
    text = re.sub('[^a-zA-Z]', ' ', text)

    if not _TEXT_PREP_CACHE:
        # Both values are built before update() runs, so a failure (e.g. a
        # missing NLTK corpus) leaves the cache empty and is retried next call.
        _TEXT_PREP_CACHE.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    stop_words = _TEXT_PREP_CACHE['stop_words']
    lemmatizer = _TEXT_PREP_CACHE['lemmatizer']

    # Remove stop words and single-character tokens
    kept = [w for w in word_tokenize(text) if w not in stop_words and len(w) > 1]

    # Lemmatize (chosen over stemming because it restores words better)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

In [None]:
# Path of the pretrained word2vec binary inside the Kaggle input directory.
model_path = '../input/word2vec-model/GoogleNews-vectors-negative300.bin'
# Load the binary-format vectors; this reads the whole file and is the slow step of the notebook.
w2v_model = KeyedVectors.load_word2vec_format(fname=model_path, binary=True)
# Similarity helper that skips NLTK's English stop words when averaging word vectors.
ds = DocSim(w2v_model=w2v_model, stopwords=stopwords.words('english'))

In [None]:
if __name__ == '__main__':
    # Recommend up to 10 titles whose descriptions are most similar to the
    # description of a title the user names.
    path = '/kaggle/input/netflix-shows/netflix_titles.csv'
    data = pd.read_csv(path)

    # Lower-case the titles and preprocess the descriptions column-wise.
    # Whole-column assignment avoids chained indexing (data['col'][i] = ...),
    # which warns and does not write through under pandas Copy-on-Write.
    data['title'] = data['title'].str.lower()
    data['processed_description'] = data['description'].apply(textPreprocessing)

    # Set lookup instead of rebuilding a list of all titles on every retry.
    known_titles = set(data['title'].dropna())

    userInput = input("재미있게 봤던 영화나 드라마 제목을 입력해주세요!  ").lower()

    while userInput not in known_titles:
        userInput = input("제목이 올바르지 않습니다. 다시 입력해 주세요. 종료하고 싶으시면 0을 눌러주세요 ").lower()
        if userInput == '0':
            break

    # Only continue when a real title was chosen; after quitting with '0'
    # there is no source description to compare against.
    if userInput in known_titles:
        # If several rows share the title, the last one is used as the source.
        source_idx = data.index[data['title'] == userInput][-1]
        target = data.loc[source_idx, 'processed_description']

        # Exclude the source row explicitly instead of assuming it will be the
        # top-ranked result.
        candidates = data['processed_description'].drop(index=source_idx)

        # Compute cosine-similarity scores with the Word2vec model
        res = ds.calculate_similarity(target, list(candidates))[:10]

        # Row labels per processed description, in file order, so each result
        # maps back to exactly one row even when descriptions are duplicated.
        rows_by_doc = {}
        for row_label, doc in candidates.items():
            rows_by_doc.setdefault(doc, []).append(row_label)

        print("<",userInput,"과 가장 유사한 영화/TV쇼 리스트>")
        for rank, hit in enumerate(res, start=1):
            row = rows_by_doc[hit['doc']].pop(0)
            print(rank)
            print("Title: ",data.loc[row, 'title'])
            print("Description: ",data.loc[row, 'description'])
            print()