In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Movie Recommendation System using K-means and FastText Embeddings

In [None]:
import pandas as pd
import numpy as np
import random
import re

### 1. Preprocessing
Select the rows whose type is "Movie" and keep only the necessary columns<br/>
Remove special characters and convert the text to lower case

In [None]:
# Read the raw Netflix titles table from the Kaggle input directory,
# report its (rows, columns) shape and preview the first rows.
netflix_csv_path = "/kaggle/input/netflix-shows/netflix_titles.csv"
orig_netflix = pd.read_csv(netflix_csv_path)
print(orig_netflix.shape)
orig_netflix.head()

In [None]:
# Keep only the rows whose type is "Movie". reset_index() renumbers the rows from 0
# and stores the previous row labels in a new "index" column.
is_movie = orig_netflix["type"] == "Movie"
orig_netflix = orig_netflix.loc[is_movie].reset_index()

# Work on an independent copy that holds just the text columns used downstream.
netflix = orig_netflix.loc[:, ["title", "listed_in", "description"]].copy()
netflix.head()

In [None]:
# Number of missing values in each selected column.
netflix.isna().sum()

In [None]:
# Characters treated as noise and replaced by a single space.
# The pattern is a raw string so every regex escape reaches `re` untouched (a normal
# string would emit invalid-escape warnings for sequences such as "\?" or "\(").
# The explicit `\\` puts a literal backslash into the set; `\[` and `\]` are the only
# other members that need escaping inside a character class.
_SPECIAL_CHARS = re.compile(r"""[-=+,#/?:^$.@*"※~&%ㆍ!』\\‘|()\[\]<>`'…》]""")


def preprocessing(desc):
    """Normalise a description so it can be split into tokens.

    The text is lower-cased, every special character listed in
    ``_SPECIAL_CHARS`` is replaced by a space, and runs of whitespace are
    collapsed so that words end up separated by exactly one space.

    Args:
        desc: The raw description. ``None`` and float NaN (a missing cell in a
            DataFrame) are treated as an empty description; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned, lower-case description; ``""`` for missing input.
    """
    # Missing values would otherwise crash on `.lower()`; NaN is the only float
    # that is not equal to itself.
    if desc is None or (isinstance(desc, float) and desc != desc):
        return ""

    text = str(desc).lower()
    text = _SPECIAL_CHARS.sub(" ", text)

    # split() with no argument drops leading/trailing whitespace and merges
    # consecutive whitespace, including the spaces introduced just above.
    return " ".join(text.split())

In [None]:
# Clean every description and keep the result next to the original text.
netflix["new_description"] = netflix["description"].apply(preprocessing)
print(netflix.shape)
netflix.head()

In [None]:
# Print the first description before and after cleaning for a quick visual check.
for column in ("description", "new_description"):
    print(netflix[column].iloc[0])

### 2. Embedding
Split each description into tokens to build the corpus<br/>
Train a FastText embedding on the corpus<br/>
Transform each description into a feature vector

In [None]:
from gensim.models.fasttext import FastText as FT_gensim

# `corpus` holds one cleaned description per movie; `sentences` holds the same
# descriptions split on single spaces, one token list per movie.
corpus = netflix["new_description"].tolist()
sentences = [str(document).split(' ') for document in corpus]
print(corpus[0])
print(sentences[0])

In [None]:
# Dimensionality of every embedding vector produced by the model.
embedding_size = 30

# FastText hyper-parameters gathered in one place so they are easy to scan and tune.
ft_params = dict(
    vector_size=embedding_size,
    min_count=2,
    min_n=2,
    max_n=5,
    sg=1,
    negative=10,
    sample=0.001,
    window=5,
    alpha=0.025,
    min_alpha=0.0001,
    epochs=50,
)
FT_model = FT_gensim(**ft_params)

# The vocabulary has to be built from the tokenised corpus before training starts.
FT_model.build_vocab(sentences)

print('corpus_count: ', FT_model.corpus_count)
print('corpus_total_words: ', FT_model.corpus_total_words)

# Train with the epoch count and corpus statistics recorded on the model itself.
FT_model.train(
    sentences,
    total_examples=FT_model.corpus_count,
    total_words=FT_model.corpus_total_words,
    epochs=FT_model.epochs,
)

print(FT_model)

In [None]:
# Build one embedding per movie, stacked into a single array in corpus order.
# NOTE(review): each whole description string is looked up as a single key rather
# than token by token -- confirm this is the intended sentence representation.
FT_vector = np.asarray([FT_model.wv[str(document)] for document in corpus])

### 3. K-means Clustering
Train k-means clustering on the feature vectors<br/>
Add the resulting cluster_id to the dataframe

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

# Fit a 50-cluster k-means model on the description embeddings (fixed seed),
# then label every movie with the cluster its embedding is assigned to.
kmeanModel = KMeans(n_clusters=50, random_state=42)
kmeanModel.fit(FT_vector)

cluster_id = kmeanModel.predict(FT_vector)
netflix["cluster_id"] = cluster_id

In [None]:
# Preview the first rows, now including the cluster_id column.
netflix.head(n=5)

### 4. Recommendation system
Compute the similarity between the new description of the source movie and that of every other movie in the same cluster<br/>
Sort the dataframe by similarity and return the titles of the top_k most similar movies

In [None]:
def recommendation_system(title_name, top_k=5):
    """Recommend the movies most similar to ``title_name`` within its cluster.

    The source movie is looked up by exact title in the global ``netflix``
    frame. Every other movie sharing its ``cluster_id`` is scored with
    ``FT_model.wv.similarity`` between the two cleaned descriptions
    (``new_description``), and the best-scoring titles are returned.

    Args:
        title_name: Exact title of the source movie.
        top_k: Maximum number of recommendations to return (default 5).

    Returns:
        A DataFrame with the columns ``title`` and ``Similarity``, sorted by
        descending similarity and holding at most ``top_k`` rows. It is empty
        when the title is unknown or no other movie shares its cluster.
    """
    matches = netflix[netflix["title"] == title_name]
    if matches.empty:
        # Unknown title: return an empty, correctly shaped result instead of
        # failing with an opaque pandas error further down.
        return pd.DataFrame(columns=["title", "Similarity"])

    # Use the first match as the single source movie. Extracting scalar values here
    # matters: handing a whole Series to wv.similarity yields array-valued scores
    # and breaks as soon as a title occurs more than once.
    source = matches.iloc[0]
    source_description = source["new_description"]

    # Candidates are the movies in the source's cluster, excluding the source title.
    in_cluster = netflix["cluster_id"] == source["cluster_id"]
    not_source = netflix["title"] != title_name
    candidates = netflix.loc[in_cluster & not_source, ["title", "new_description"]].copy()

    # float() turns the numpy scalar into a plain number so the column sorts cleanly.
    candidates["Similarity"] = [
        float(FT_model.wv.similarity(source_description, description))
        for description in candidates["new_description"]
    ]

    ranked = candidates.sort_values(by="Similarity", ascending=False)
    return ranked[["title", "Similarity"]].head(top_k)

In [None]:
# Example query: movies most similar to "Aakhri Adaalat".
recommendation_system(title_name="Aakhri Adaalat")

In [None]:
# Example query: movies most similar to "National Parks Adventure".
recommendation_system(title_name="National Parks Adventure")