In [None]:
import os 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re

In [None]:
file_path = "../input/movies-similarity/movies.csv"

data = pd.read_csv(file_path)

# Merge both plot summaries into a single text column. Missing summaries are
# replaced with an empty string first: astype(str) alone would turn NaN into
# the literal word "nan", which would then be tokenized as part of the plot.
wiki_text = data['wiki_plot'].fillna('').astype(str)
imdb_text = data['imdb_plot'].fillna('').astype(str)
data['plot'] = wiki_text + "\n" + imdb_text
data.head()

# Tokenization

In [None]:
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize and
# nltk.word_tokenize. Recent NLTK releases look the models up under
# 'punkt_tab' instead of 'punkt', so request both to work on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Split a short sample paragraph into sentences.
sent_tokenized = list(nltk.sent_tokenize("""
                        Today (May 19, 2016) is his only daughter's wedding. 
                        Vito Corleone is the Godfather.
                        """))

# Break the first sentence into word-level tokens.
words_tokenized = list(nltk.word_tokenize(sent_tokenized[0]))

# Keep only the tokens that contain at least one ASCII letter.
filtered = list(filter(lambda word: re.search(r'[a-zA-Z]', word), words_tokenized))

filtered

# Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer; tokenize_and_stem also uses this instance.
stemmer = SnowballStemmer('english')

# Show the filtered tokens before and after each one is reduced to its stem.
print("Without Stemming:",filtered)
stemmed_words = list(map(stemmer.stem, filtered))
print("After Stemming: ",stemmed_words)

In [None]:
def tokenize_and_stem(text):
    """Return the stemmed word tokens of ``text`` in their original order.

    The text is split into sentences and each sentence into word tokens with
    NLTK. Tokens that contain no ASCII letter are discarded, and the rest are
    stemmed with the module-level ``stemmer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stem per retained token.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Skip tokens that contain no ASCII letter.
            if not re.search('[a-zA-Z]', token):
                continue
            stems.append(stemmer.stem(token))
    return stems

# Try the helper on the combined plot text of the row labelled 1.
words_stemmed = tokenize_and_stem(data.loc[1, 'plot'])
print(words_stemmed)

# TF-IDF Vectorizer

In [None]:
# TF-IDF vectorizer over 1- to 3-grams of the stemmed tokens. The float
# min_df / max_df values are document-frequency proportions: terms that occur
# in fewer than 20% or more than 80% of the documents are ignored.
tfidf = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1,3),
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)

In [None]:
# Fit the vectorizer on the combined plot texts and transform them into a
# TF-IDF matrix in a single pass.
tfidf_matrix = tfidf.fit_transform( data['plot'])

# Print the dimensions of the resulting matrix.
print(tfidf_matrix.shape)

# KMeans Clustering

In [None]:
# Partition the TF-IDF rows into 5 clusters. KMeans starts from random
# centroids, so the seed is fixed to make the cluster labels reproducible
# across kernel restarts.
km = KMeans(n_clusters=5, random_state=42)
km.fit(tfidf_matrix)

# Store each row's cluster label next to its source data.
data['clusters'] = km.labels_.tolist()

# Number of rows assigned to each cluster.
data['clusters'].value_counts()

# Cosine Similarity

In [None]:
# Pairwise cosine similarity between the rows of the TF-IDF matrix; the
# result is passed to linkage() in the dendrogram cell.
similarity_scores = cosine_similarity(tfidf_matrix)

# Linkage Dendrogram

In [None]:
# SciPy interprets a 2-D input as one observation vector per row, so each
# title is represented by its row of cosine similarities and the rows are
# compared with the default Euclidean metric under complete linkage.
mergings = linkage(similarity_scores, method='complete')

# Create the (very wide) figure up front and draw on its axes explicitly.
fig, ax = plt.subplots(figsize=(108, 21))

# Keep the result under its own name: rebinding `dendrogram` would shadow the
# imported SciPy function and make this cell fail when it is run a second time.
dendrogram_data = dendrogram(
    mergings,
    labels=data['title'].tolist(),
    leaf_rotation=90,
    leaf_font_size=16,
    ax=ax,
)

# Colour the title labels along the x-axis red.
for tick_label in ax.get_xmajorticklabels():
    tick_label.set_color('r')

# Show the plotted dendrogram
plt.show()