In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load the dataset

In [None]:
# Read the movies dataset into a DataFrame and preview the first rows.
movies_csv = '/kaggle/input/movies-similarity/movies.csv'
data = pd.read_csv(movies_csv)
data.head()

## Basic data exploration

In [None]:
# Check the shape of the dataset: displays a (rows, columns) tuple
data.shape

In [None]:
# Check for duplicates: count rows that exactly repeat an earlier row across all columns
data.duplicated().sum()

In [None]:
# Check for missing values: number of null entries in each column
data.isnull().sum()

It seems that we have 10 missing summaries in IMDb plot column.

To resolve this, we can create a new column called `plot` that concatenates the `wiki_plot` and `imdb_plot` columns.

In [None]:
# Create a new column "plot" that joins the Wikipedia and IMDb summaries.
# Missing summaries are replaced with an empty string *before* concatenating:
# casting NaN with astype(str) alone would insert the literal text "nan" into
# the plot, which would then be tokenized like any real word.
data["plot"] = (
    data["wiki_plot"].fillna("").astype(str)
    + "\n"
    + data["imdb_plot"].fillna("").astype(str)
)
data.head()

In [None]:
# Check for missing values again: per-column null counts, now including the new "plot" column
data.isnull().sum()

Now, our new column "plot" doesn't have any missing value. We're going to use this column for our further processing.

## Use NLP techniques to convert "plot" into numerical vectors

### Tokenization

_Tokenization is the process of breaking text down into individual sentences or words, as required. It gives a machine comparable units with which to judge whether two texts are similar or far apart._
For example, consider the sentence "This is a sentence". If we try to match this whole sentence in a document, we might not find an exact match; but if we break the sentence into words and try to match those instead, we may find some of the individual words in the document.

To apply tokenization, we need a python library called `nltk`. 

In [None]:
# Natural Language Tool Kit - required for textual processing
import nltk
# Used for Stemming purpose
from nltk.stem import SnowballStemmer
# Tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Regular expression processing
import re

In [None]:
# Split the first movie's plot (data.plot[0]) into sentences,
# then show how many there are and preview the first five.
first_plot = data['plot'][0]
sentences = list(nltk.sent_tokenize(first_plot))
print(len(sentences))
print(sentences[:5])

In [None]:
# Break the first sentence into word-level tokens and show them
first_sentence = sentences[0]
words = list(nltk.word_tokenize(first_sentence))
print(len(words))
print(words)

As we can observe from the above example, some tokens are punctuation marks or numbers, which are not meaningful for our similarity analysis. We can filter these tokens out using regular expressions, keeping only tokens that contain at least one letter.

In [None]:
# Keep only the tokens that contain at least one ASCII letter
# (drops pure punctuation and pure numbers).
has_letter = re.compile("[a-zA-Z]").search
words_filtered = list(filter(has_letter, words))
print(len(words_filtered))
print(words_filtered)

### Stemming

_Stemming is the process of reducing inflected words to their word stem. This reduces the number of distinct word forms we have to process._

Consider an example: "This is a sentence from a collection of sentences." Here `sentence` and `sentences` are two forms of the same stem. In text processing we usually only need the stem, not each of its different forms. Stemming therefore maps both `sentence` and `sentences` to one common stem, so that they are treated as the same term.

For Stemming, `nltk` provides different Stemmers, but we're going to use `SnowballStemmer`.



In [None]:
# English Snowball stemmer; this object is reused later by tokenize_and_stem().
stemmer = SnowballStemmer('english')

# Stem every filtered word, then show the count, the inputs and the stems.
tokens = list(map(stemmer.stem, words_filtered))

print(len(tokens), words_filtered, tokens, sep='\n')

As we can observe, words are converted to their stem, e.g. `only` => `onli`, `wedding` => `wed`, etc.

_Note: `Stemming` can produce stems that are not dictionary words and may not make sense on their own (example: `only` => `onli`). If we need meaningful dictionary words, we can use `Lemmatization` instead, which maps each word to its dictionary form but is slower than `Stemming`._

In [None]:
# Small worked example: sentence tokenization followed by word tokenization.
sample_text = "This is a sentence. This is another sentence."

sample_sentences = nltk.sent_tokenize(sample_text)
print(type(sample_sentences))
print(sample_sentences)

sample_words = list(map(nltk.word_tokenize, sample_sentences))
print(type(sample_words))
print(sample_words)

In [None]:
# Define a function which takes text and apply "Tokenization" and "Stemming" and returns processed text.
def tokenize_and_stem(text):
    """Tokenize ``text`` and return its stemmed word tokens in reading order.

    The text is split into sentences, each sentence into words, and every
    word containing at least one ASCII letter is stemmed with the
    module-level ``stemmer``.

    The result is an ordered list that keeps repeated tokens. A set would
    discard both repetition and word order, and a vectorizer that uses this
    function as its ``tokenizer`` needs both: repetition to count term
    frequency, and order to build meaningful n-grams.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they occur in ``text``.
    """
    tokens = []
    for sent in nltk.sent_tokenize(text):
        # Keep only words with at least one alphabetical character
        # (drops pure punctuation and pure numbers), then stem them.
        tokens.extend(
            stemmer.stem(word)
            for word in nltk.word_tokenize(sent)
            if re.search('[a-zA-Z]', word)
        )

    return tokens

In [None]:
# Try the function on a sentence containing punctuation, numbers and a possessive
sample_sentence = "Today (May 19, 2016) is his only daughter's wedding."
res = tokenize_and_stem(sample_sentence)
print(res)

### Convert tokens into numerical vectors

To apply any Machine Learning algorithm, we need to convert text tokens into numerical vectors. We're going to use `TfidfVectorizer`, i.e. the `Term Frequency - Inverse Document Frequency Vectorizer`. It assigns each token in a document (here, a movie plot) a weight that reflects how often the token occurs in that document and how rare it is across all documents.

$$
\text{Term frequency} = \frac{\text{No. of repetitions of the word in the document}}{\text{No. of words in the document}}
$$

$$
\text{Inverse Document frequency} = \log\left(\frac{\text{No. of documents}}{\text{No. of documents containing the word}}\right)
$$

$$
\text{Final number} = \text{T.F.} \times \text{I.D.F.}
$$

In [None]:
# Print the full TfidfVectorizer documentation to review its parameters.
# NOTE(review): this output is very long; consider clearing it before sharing the notebook.
help(TfidfVectorizer)

In [None]:
# Configure the TF-IDF vectorizer: custom tokenizer/stemmer, English stop words,
# and document-frequency limits to discard very common and very rare terms.
tfidf_options = dict(
    tokenizer=tokenize_and_stem,   # sentence/word tokenization + stemming
    stop_words='english',          # drop common English filler words
    ngram_range=(1, 3),            # unigrams, bigrams and trigrams
    max_df=0.8,                    # ignore terms found in more than 80% of the plots
    min_df=0.2,                    # ignore terms found in fewer than 20% of the plots
    max_features=200000,           # upper bound on vocabulary size
    use_idf=True,                  # apply inverse-document-frequency weighting
)
tfidf_vect = TfidfVectorizer(**tfidf_options)

`stopwords` are those words in a given text which do not contribute considerably towards the meaning of the sentence and are generally grammatical filler words. For example, in the sentence _'Dorothy Gale lives with her dog Toto on the farm of her Aunt Em and Uncle Henry'_, we could drop the words 'her' and 'the', and still have a similar overall meaning to the sentence. Thus, 'her' and 'the' are stopwords and can be conveniently dropped from the sentence. By setting `stop_words='english'`, we direct the vectorizer to drop all stopwords found in scikit-learn's built-in list of English stopwords.

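`ngram_range` defines the lengths of the n-grams (sequences of consecutive tokens) to be formed while vectorizing the text. For example, `ngram_range=(1, 3)` produces unigrams, bigrams and trigrams.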

Once we create a TF-IDF Vectorizer object, we must fit the text to it and then transform the text to produce the corresponding numeric form of the data which the computer will be able to understand and derive meaning from. To do this, we use the `fit_transform()` method of the `TfidfVectorizer` object.

In [None]:
# Learn the vocabulary from the plots and convert them to a TF-IDF matrix
# (one row per movie, one column per term).
plots = data['plot']
tfidf_mat = tfidf_vect.fit_transform(plots)

n_docs, n_terms = tfidf_mat.shape
print((n_docs, n_terms))

In [None]:
# Show the first row of the TF-IDF matrix, i.e. the vector for the first movie's plot
print(tfidf_mat[0])

Now, as we have converted our text (plot summaries) into numerical vectors, we can apply Clustering algorithm to cluster similar items together. 

`Clustering` is the method of grouping together a number of items such that they exhibit similar properties. According to the measure of similarity desired, a given sample of items can have one or more clusters.

We're going to use `KMeans` clustering. `KMeans` is the algorithm in which the given sample is divided into `K` clusters where each cluster is denoted by the mean of all the items lying in that cluster.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.cluster.hierarchy import linkage, dendrogram

In [None]:
# KMeans object with 5 clusters.
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels below, reproducible on every "Restart & Run All".
km = KMeans(n_clusters=5, random_state=42)

# Fit the KMeans object with the tfidf_mat
km.fit(tfidf_mat)

# One cluster label per movie, in row order
clusters = km.labels_.tolist()

# Create a cluster column in the dataframe
data['cluster'] = clusters

# Number of movies assigned to each cluster
data['cluster'].value_counts()

In [None]:
# Show the first 5 movie titles in every cluster
data_clustered = data.groupby('cluster')
separator = '-' * 10

for cluster_id, members in data_clustered:
    print('Cluster:', cluster_id)
    print(members['title'][:5])
    print(separator)

In [None]:
# Calculate the cosine distance (1 - cosine similarity) between every pair of rows of the tfidf matrix
similarity_distance = 1 - cosine_similarity(tfidf_mat)

In [None]:
# This gives the distance between each pair of vectors, so the matrix is square
# (one row and one column per movie; 100x100 for this dataset)
similarity_distance.shape

We can visualize the similar items in the form of `dendrogram`. Dendrograms help visualize the results of hierarchical clustering, which is an alternative to k-means clustering. Two pairs of movies at the same level of hierarchical clustering are expected to have similar strength of similarity between the corresponding pairs of movies.

In [None]:
from scipy.spatial.distance import squareform

# linkage() treats a 2-D array as raw observation vectors, not as pairwise
# distances. To cluster on the cosine distances themselves, the square matrix
# must first be converted to condensed (upper-triangle) form.
# - np.clip removes tiny negative values caused by floating-point rounding.
# - checks=False skips the symmetry / zero-diagonal validation, which could
#   otherwise fail on rounding noise.
condensed_distance = squareform(np.clip(similarity_distance, 0, None), checks=False)

# Create mergings matrix (complete-linkage hierarchical clustering)
mergings = linkage(condensed_distance, method='complete')

# Plot the dendrogram on an explicit, very wide figure, using title as label
fig, ax = plt.subplots(figsize=(108, 21))
dendrogram_ = dendrogram(mergings,
                         labels=list(data['title']),
                         leaf_rotation=90,
                         leaf_font_size=16,
                         ax=ax)

# Colour the movie-title tick labels red
for lbl in ax.get_xmajorticklabels():
    lbl.set_color('r')

plt.show()