Part 1: Loading Data

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import nltk
import re
import os
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import lda

In [None]:
# We have 100 movie titles and their plot information (combined from IMDb and Wikipedia).
# Each file is read inside a `with` block so its handle is closed as soon as the
# contents are in memory.

# Read the movie titles (one title per line).
with open('plot_summaries_titles.txt') as titles_file:
    titles = titles_file.read().split('\n')
titles = titles[:100]  # keep only the first 100 titles

# The Wikipedia and IMDb information for consecutive movies is separated by the
# marker "BREAKS HERE". Each entry may consist of multiple paragraphs.
with open('plot_summaries_wiki.txt') as wiki_file:
    info_wiki = wiki_file.read().split('\n BREAKS HERE')
info_wiki = info_wiki[:100]

with open('plot_summaries_imdb.txt') as imdb_file:
    info_imdb = imdb_file.read().split('\n BREAKS HERE')
info_imdb = info_imdb[:100]

# Combine the Wikipedia and IMDb text to get the full information for each movie.
# Indexing (rather than zip) keeps a length mismatch between the two sources loud:
# a shorter IMDb list raises IndexError instead of silently dropping movies.
info = [info_wiki[i] + info_imdb[i] for i in range(len(info_wiki))]

# The movies are already listed in rank order, so the rank of each movie is
# simply its 1-based position in the list.
ranks = range(1, 1 + len(titles))  # 1..100

Part 2: Tokenizing and Stemming
Let's load the stopword list and the stemmer from the NLTK library. Stopwords are very common words such as "a", "the", and "in", which don't convey significant meaning. Stemming is the process of reducing a word to its root form (for example, "looked" becomes "look").

In [None]:
# Use nltk's English stopwords.
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a plain
# list of words; tokenization_and_stemming reads this list.
stopwords = stopwords.words('english')

# The space after "We use" keeps the count from running into the text.
print ("We use " + str(len(stopwords)) + " stopwords from nltk library.")
print ("Examples:", stopwords[:10])

In [None]:
from nltk.stem.snowball import SnowballStemmer

# Fetch the 'punkt' resource (presumably needed by the nltk sentence/word
# tokenizers used in tokenization_and_stemming - confirm for your NLTK version).
nltk.download('punkt')

# Shared English Snowball stemmer used by the tokenizer function.
stemmer = SnowballStemmer("english")

def tokenization_and_stemming(text, stemming=True):
    """Split text into lowercase word tokens, drop stopwords, and optionally stem.

    The text is split into sentences and then words with nltk. Each word is
    lowercased *before* the stopword check, so capitalised stopwords such as
    "The" at the start of a sentence are removed as well (this assumes the
    module-level `stopwords` list holds lowercase entries - confirm). Tokens
    that contain no ASCII letter (numbers, bare punctuation) are discarded.

    Parameters:
        text: the raw document string.
        stemming: when True (default) return the stem of every kept token,
            produced by the module-level `stemmer`; when False return the
            kept tokens themselves.

    Returns:
        A list of strings, in document order.
    """
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            lowered = word.lower()
            if lowered not in stopwords:
                tokens.append(lowered)

    # Filter out any token which does not contain letters
    # (e.g., numeric tokens, raw punctuation).
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]

    if stemming:
        return [stemmer.stem(token) for token in filtered_tokens]
    return filtered_tokens

In [None]:
# Sanity check: run the tokenizer/stemmer on a sample sentence and display the result.
sample_tokens = tokenization_and_stemming("Amy looked at her father's arm.")
sample_tokens

In [None]:
# Tokenize and stem every movie's text, pooling all documents into two flat lists.
# The lists are index-aligned: docs_stemmed[k] is the stem of docs_tokenized[k].
docs_stemmed = []
docs_tokenized = []
for doc_text in info:
    # Tokenize each document once and stem that result. This yields exactly what
    # tokenization_and_stemming(doc_text) would return, without running the
    # sentence/word tokenizers a second time.
    tokenized_results = tokenization_and_stemming(doc_text, stemming=False)
    docs_tokenized.extend(tokenized_results)
    docs_stemmed.extend(stemmer.stem(token) for token in tokenized_results)

In [None]:
# Build a lookup from each stem to an original (unstemmed) token so that results
# can be shown as readable words. When several tokens share a stem, the token
# seen last in the corpus is the one kept.
vocab_frame_dict = dict(zip(docs_stemmed, docs_tokenized))
# test
print (vocab_frame_dict['soldier'])

Part 3: TF-IDF
Transform the 100 movie documents into a matrix of TF-IDF features.

In [None]:
# Vectorizer settings: unigrams only, tokenized with our own tokenizer/stemmer;
# max_df / min_df drop terms that are too common or too rare across documents.
tfidf_model = TfidfVectorizer(
    tokenizer=tokenization_and_stemming,
    stop_words='english',
    ngram_range=(1,1),
    max_df=0.8,
    min_df=0.2,
    use_idf=True,
)

# Learn the vocabulary from the movie information and build the TF-IDF matrix.
tfidf_matrix = tfidf_model.fit_transform(info)

n_docs, n_terms = tfidf_matrix.shape
print ("In total, there are " + str(n_docs) + " information and " + str(n_terms) + " terms.")

In [None]:
# Save the terms identified by TF-IDF as a plain Python list.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when it exists and fall back for older versions.
if hasattr(tfidf_model, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert to a list so the
    # result has the same type the older API produced.
    tf_selected_words = tfidf_model.get_feature_names_out().tolist()
else:
    tf_selected_words = tfidf_model.get_feature_names()

Part 4: K-means clustering
Let's use K-means to group the movies

In [None]:
from sklearn.cluster import KMeans

# The number of clusters is an arbitrary choice.
num_clusters = 3

# fit() returns the fitted estimator, so construction and fitting can be chained;
# random_state is fixed at 0.
km = KMeans(n_clusters=num_clusters, random_state=0).fit(tfidf_matrix)

# Cluster label of every movie, as a plain Python list.
clusters = km.labels_.tolist()

In [None]:
# Analyse K-means results:
# gather title, rank and assigned cluster of every movie into one DataFrame.
films = dict(title=titles, rank=ranks, cluster=clusters)
column_order = ['rank', 'title', 'cluster']
frame = pd.DataFrame(films, columns=column_order)
frame.head(10)

In [None]:
# Count how many movies were assigned to each cluster and show it as a table.
print (" Number of movies included in each cluster: ")
cluster_sizes = frame['cluster'].value_counts()
cluster_sizes.to_frame()

In [None]:
# Mean rank of the movies in each cluster.
rank_and_cluster = frame[['rank', 'cluster']]
grouped = rank_and_cluster.groupby('cluster')
print ("Average rank (1 to 100) per cluster: ")
grouped.mean()

In [None]:
print ("<Document clustering result by K-means>")

# Each row of km.cluster_centers_ holds the weight of every term in that
# cluster's centroid. Sort the term indices of every row in descending order
# of weight so the most important terms come first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

Cluster_keywords_summary = {}
for i in range(num_clusters):
    # Top 10 terms of this cluster, mapped from stems back to readable words.
    # If a stem has no entry in vocab_frame_dict, the stem itself is shown
    # instead of aborting the report with a KeyError.
    Cluster_keywords_summary[i] = [
        vocab_frame_dict.get(tf_selected_words[ind], tf_selected_words[ind])
        for ind in order_centroids[i, :10]
    ]
    # Print the keywords on a single line (same layout as the LDA report).
    print ("Cluster " + str(i) + " words: " + ", ".join(Cluster_keywords_summary[i]))

    cluster_movies = frame.loc[frame.cluster == i, 'title'].values.tolist()
    print ("Cluster " + str(i) + " titles (" + str(len(cluster_movies)) + " movies): ")
    print (", ".join(cluster_movies), '\n')


In [None]:
# From the results above, the main topics of the three movie clusters are:
#   - Families, love and life
#   - Wars and battles
#   - Detective and crime


# Plot K-means result:
# project the dense TF-IDF matrix onto 2 principal components for visualization.
tfidf_matrix_np = tfidf_matrix.toarray()
pca = PCA(n_components=2)
X = pca.fit_transform(tfidf_matrix_np)
xs = X[:, 0]
ys = X[:, 1]

# One matplotlib color code per cluster id.
cluster_colors = dict(enumerate(['k', 'b', 'r']))
# Legend label of each cluster: its keywords joined into one string.
cluster_names = {
    cluster_id: ", ".join(Cluster_keywords_summary[cluster_id])
    for cluster_id in range(num_clusters)
}

In [None]:
%matplotlib inline

# Create a data frame with the PCA coordinates, cluster label and title of each movie.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))
groups = df.groupby('label')

# Set up the plot.
fig, ax = plt.subplots(figsize=(16, 9))
# Draw each cluster as its own series so it gets its own color and legend entry.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')

# Label the figure so it can be understood on its own.
ax.set_title("K-means clusters of movies (2-component PCA of TF-IDF features)")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")

# Show the legend with only 1 marker per entry.
ax.legend(numpoints=1, loc='lower right')
plt.show()

Part 5: Topic Modeling - Latent Dirichlet Allocation
Use LDA to group the movies

In [None]:
# Build the document-term count matrix for LDA: one row per movie and one column
# per TF-IDF-selected term, holding how often the term occurs in that movie's text.
terms_dict = {term: col for col, term in enumerate(tf_selected_words)}

feature_matrix_info_tf = np.zeros((len(info), len(tf_selected_words)), dtype=int)
for row, doc_text in enumerate(info):
    for stem in tokenization_and_stemming(doc_text):
        # Dictionary lookup is constant-time; scanning the term list for every
        # token would be linear in the vocabulary size.
        col = terms_dict.get(stem)
        if col is not None:
            feature_matrix_info_tf[row, col] += 1

In [None]:
# LDA model with 3 topics, 500 iterations and a fixed random seed.
model = lda.LDA(
    n_topics=3,
    n_iter=500,
    random_state=0,
)

In [None]:
# "model.topic_word_" saves the importance of tf_selected_words in the LDA model,
# i.e. the topic-word matrix. Its shape is (n_topics, num_of_selected_words).
# "model.doc_topic_" saves the document topic results, i.e. the document-topic matrix.
# Its shape is (num_of_documents, n_topics).

model.fit(feature_matrix_info_tf)
topic_word = model.topic_word_   # model.components_ also works
n_top_words = 10

topic_keywords_list = []
for topic_dist in topic_word:
    # Indices of the n_top_words highest-weighted terms, most important first.
    top_term_indices = np.argsort(topic_dist)[:-(n_top_words+1):-1]
    # Map stems to readable words in a plain Python list. Writing the words into
    # a NumPy string array would silently truncate any word longer than the
    # array's fixed item width (set by the longest stem). A stem without an
    # entry in vocab_frame_dict is shown as-is.
    lda_topic_words = [
        vocab_frame_dict.get(tf_selected_words[idx], tf_selected_words[idx])
        for idx in top_term_indices
    ]
    topic_keywords_list.append(lda_topic_words)

In [None]:
# Assign every movie to its highest-weighted topic, then report each topic's
# keywords and the movies assigned to it.
doc_topic = model.doc_topic_
topic_doc_dict = {}

print ("<Document clustering result by LDA>")
for doc_idx, topic_weights in enumerate(doc_topic):
    topicID = topic_weights.argmax()
    # setdefault creates the title list the first time a topic is seen.
    topic_doc_dict.setdefault(topicID, []).append(titles[doc_idx])

for topic_id, topic_titles in topic_doc_dict.items():
    print ("Cluster " + str(topic_id) + " words: " + ", ".join(topic_keywords_list[topic_id]))
    print ("Cluster " + str(topic_id) + " titles (" + str(len(topic_titles)) + " movies): ")
    print (', '.join(topic_titles), '\n')

We got a similar set of three clusters/topics to those we got with K-means, but the way the movies are grouped is different.