In [2]:
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [3]:
data= pd.read_csv("C://Users/Saba Naseem/Desktop/Machine Learning/Codes/kaggle/Document Clustering/movies.csv")

In [168]:
data.head(25)

Unnamed: 0,title,link,syn_link,genres,plot_synopsis
0,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/?pf_rd_m=...,https://www.imdb.com/title/tt0111161/synopsis?...,[' Drama'],"In 1947, Andy Dufresne (Tim Robbins), a banker..."
1,The Godfather,https://www.imdb.com/title/tt0068646/?pf_rd_m=...,https://www.imdb.com/title/tt0068646/synopsis?...,"[' Crime', ' Drama']","In late summer 1945, guests are gathered for t..."
2,The Godfather: Part II,https://www.imdb.com/title/tt0071562/?pf_rd_m=...,https://www.imdb.com/title/tt0071562/synopsis?...,"[' Crime', ' Drama']",The Godfather Part II presents two parallel st...
3,The Dark Knight,https://www.imdb.com/title/tt0468569/?pf_rd_m=...,https://www.imdb.com/title/tt0468569/synopsis?...,"[' Action', ' Crime', ' Drama', ' Thriller']",The movie begins with a gang of men with clown...
4,12 Angry Men,https://www.imdb.com/title/tt0050083/?pf_rd_m=...,https://www.imdb.com/title/tt0050083/synopsis?...,[' Drama'],"In a New York City courthouse, an eighteen-yea..."
5,Schindler's List,https://www.imdb.com/title/tt0108052/?pf_rd_m=...,https://www.imdb.com/title/tt0108052/synopsis?...,"[' Biography', ' Drama', ' History']",The relocation of Polish Jews from surrounding...
6,The Lord of the Rings: The Return of the King,https://www.imdb.com/title/tt0167260/?pf_rd_m=...,https://www.imdb.com/title/tt0167260/synopsis?...,"[' Adventure', ' Drama', ' Fantasy']","In the opening scene, a flashback, two hobbits..."
7,Pulp Fiction,https://www.imdb.com/title/tt0110912/?pf_rd_m=...,https://www.imdb.com/title/tt0110912/synopsis?...,"[' Crime', ' Drama']","Late one morning in the Hawthorne Grill, a res..."
8,"Il buono, il brutto, il cattivo",https://www.imdb.com/title/tt0060196/?pf_rd_m=...,https://www.imdb.com/title/tt0060196/synopsis?...,[' Western'],The film tells the story of three men who purs...
9,Fight Club,https://www.imdb.com/title/tt0137523/?pf_rd_m=...,https://www.imdb.com/title/tt0137523/synopsis?...,[' Drama'],We back out of the webbing of neurons and brai...


In [4]:
data.columns

Index(['title', 'link', 'syn_link', 'genres', 'plot_synopsis'], dtype='object')

In [8]:
title, synopsis, genre= data['title'], data['plot_synopsis'], data['genres']

In [13]:
synopsis[0][:200]

'In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and her lover, a golf pro. Since the state of Maine has no death penalty, he is given two consecutive life s'

#### Stopwords, stemming, and tokenizing

In [91]:
stopwords= nltk.corpus.stopwords.words("english")
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [118]:
def tokenizing(text):
    
    #breaking each word and making them tokens
    tokens=[word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    #storing only alpha tokens
    filtered_tokens=[]
    for token in tokens:
        if (re.search('[a-zA-Z]', token)):
            filtered_tokens.append(token)

    return filtered_tokens


from nltk.stem.snowball import SnowballStemmer
stemmer= SnowballStemmer("english")

def stemming(text):
    
    stems =[stemmer.stem(t) for t in text]
    return stems

def token_stems(text):
        
    tokens=tokenizing(text) 
    stems=stemming(tokens)
   
    return stems       

In [119]:
a= token_stems(synopsis[0])
a[:10]

['in', 'andi', 'dufresn', 'tim', 'robbin', 'a', 'banker', 'in', 'main', 'is']

In [74]:
tokenized_only_vocab=[ tokenizing(word) for word in synopsis]
stemmed_vocab=[ stemming(token) for token in tokenized_only_vocab]

In [86]:
len(stemmed_vocab), len(tokenized_only_vocab)

(100, 100)

#### TF-IDF vectorization

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf= TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, stop_words='english',
                      use_idf=True, tokenizer=token_stems, ngram_range=(1,3))

%time tfidf_matriz= tfidf.fit_transform(synopsis)

print(tfidf_matriz.shape)

terms=tfidf.get_feature_names()

In [109]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matriz)

In [122]:
t=tfidf.vocabulary_

In [138]:
aa=tfidf.stop_words_

#### K Mean Clustering

In [139]:
from sklearn.cluster import KMeans

In [141]:
kmean= KMeans(5)

kmean.fit(tfidf_matriz)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [147]:
clusters= kmean.labels_
clusters

array([4, 2, 0, 2, 3, 1, 1, 2, 1, 2, 1, 0, 1, 4, 1, 1, 2, 4, 1, 4, 0, 1,
       4, 4, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 0, 4, 4, 0, 0, 2, 2, 4, 4, 0,
       2, 4, 4, 1, 4, 1, 4, 4, 0, 0, 1, 2, 4, 1, 1, 4, 4, 4, 4, 4, 3, 0,
       4, 0, 4, 2, 0, 4, 4, 1, 4, 4, 4, 1, 0, 2, 0, 0, 2, 0, 1, 0, 2, 4,
       4, 4, 0, 4, 0, 4, 4, 4, 0, 4, 0, 2])

In [153]:
films=pd.DataFrame({'title': title, 'rank': ranks, 'synopsis': synopsis, 'cluster': clusters, 'genre': genre})

In [156]:
films.head()

Unnamed: 0,title,rank,synopsis,cluster,genre
0,The Shawshank Redemption,0,"In 1947, Andy Dufresne (Tim Robbins), a banker...",4,[' Drama']
1,The Godfather,1,"In late summer 1945, guests are gathered for t...",2,"[' Crime', ' Drama']"
2,The Godfather: Part II,2,The Godfather Part II presents two parallel st...,0,"[' Crime', ' Drama']"
3,The Dark Knight,3,The movie begins with a gang of men with clown...,2,"[' Action', ' Crime', ' Drama', ' Thriller']"
4,12 Angry Men,4,"In a New York City courthouse, an eighteen-yea...",3,[' Drama']


In [157]:
films['cluster'].value_counts()

4    44
0    21
1    17
2    16
3     2
Name: cluster, dtype: int64

In [160]:
grouped = films['rank'].groupby(films['cluster']) #groupby cluster for aggregation purposes

grouped.mean()

cluster
0    60.714286
1    35.764706
2    42.625000
3    34.000000
4    52.659091
Name: rank, dtype: float64

In [165]:
kmean.cluster_centers_

array([[0.00511642, 0.00348874, 0.00340236, ..., 0.00255123, 0.00937077,
        0.03758867],
       [0.02300653, 0.00382105, 0.003744  , ..., 0.        , 0.01132942,
        0.02186251],
       [0.00575595, 0.00721832, 0.01035289, ..., 0.00743109, 0.01481445,
        0.02664877],
       [0.        , 0.        , 0.        , ..., 0.01843613, 0.01877592,
        0.        ],
       [0.01381423, 0.01099983, 0.01174015, ..., 0.01189199, 0.01912986,
        0.02006866]])

In [169]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = kmean.cluster_centers_.argsort()[:, ::-1] 

for i in range(5):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in films.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


KeyError: "None of [['father']] are in the [index]"