In [None]:
import pandas as pd
import numpy as np
from sklearn import decomposition
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re, nltk, string
from sklearn.decomposition import LatentDirichletAllocation
from pprint import pprint

# Loading the *dataset* and converting it to a data matrix

In [None]:
# load the preprocessed Wikipedia Movie Plots dataset
df = pd.read_csv('Plots_preprocessed.csv')

# load the English names dataset
# NOTE(review): header=0 makes pandas treat the first line of names.txt as a
# header row and replace it with 'names' -- confirm the file really has one,
# otherwise the first name is silently dropped.
names_df = pd.read_csv('names.txt', names=['names'], header=0)

# sample 70% of the movies for training; the fixed seed makes the split
# reproducible across kernel restarts (the test plots are later selected as
# the rows whose index is not in df.index)
df = df.sample(frac=.7, random_state=1)

# get clean and lemmatized plots into a list
plots = df['Plot_lemmatize_clean'].tolist()

# get the stop words: the built-in English list plus the names
# NOTE(review): TfidfVectorizer lower-cases documents by default, so the names
# only act as stop words if names.txt stores them in lower case -- confirm.
names = names_df['names'].tolist()
stop_words = list(ENGLISH_STOP_WORDS) + names

# instantiate the vectorizer
vectorizer = TfidfVectorizer(stop_words=stop_words)
# Data Matrix (plots x vocabulary). It is kept sparse: scikit-learn's NMF
# accepts sparse input, whereas .todense() materialises every zero entry and
# returns an np.matrix, which recent scikit-learn releases reject.
vectors = vectorizer.fit_transform(plots)

# get the vocabulary
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = np.array(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())

# Calling the NMF function from the scikit-learn package

In [None]:
# Set up the NMF decomposition with 18 components; random_state is fixed so
# repeated runs start from the same state, and verbose=2 turns on the
# solver's progress output.
nmf = decomposition.NMF(
    n_components=18,
    random_state=1,
    verbose=2,
)

# Fit on the data matrix: W is the transformed data returned by the fit,
# H is the components matrix stored on the fitted model.
W = nmf.fit_transform(vectors)
H = nmf.components_

# Topics Learnt

In [None]:
# function to get the k most important words for each topic
def get_top_words(topic, k=18, vocabulary=None):
    """Return the k highest-weighted words of one topic, heaviest first.

    Parameters
    ----------
    topic : 1-D array-like of numbers
        One weight per vocabulary term (for example one row of H).
    k : int, default 18
        Number of words to return. A value larger than the number of terms
        returns every term; k=0 returns an empty list.
    vocabulary : sequence of str, optional
        Terms aligned position-by-position with `topic`. When omitted, the
        notebook-level `vocab` array is used, so existing calls keep working.

    Returns
    -------
    list
        The selected words, ordered from highest to lowest weight.
    """
    if vocabulary is None:
        vocabulary = vocab
    # argsort is ascending, so take the last k positions walking backwards
    return [vocabulary[i] for i in np.argsort(topic)[:-k-1:-1]]

# function to get the topics learnt
def get_topics(matrix, k=18):
    """Describe every topic in `matrix` by its k top words.

    Each row of `matrix` is handed to get_top_words and the words it returns
    are joined with single spaces, giving one string per row, in row order.
    """
    summaries = []
    for row in matrix:
        summaries.append(' '.join(get_top_words(row, k)))
    return summaries
  

# display the topics: one string of the 18 top words (default k) per row of H
get_topics(H)

['tell leave ask home say day nt night friend meet make return work party house apartment time want',
 'murder police killer officer case crime detective kill arrest suspect inspector investigate criminal evidence commit murderer prison investigation',
 'love fall girl marriage friend meet story college come wedding sister life daughter married rich know good want',
 'war army american soldier japanese british world agent officer force general pilot mission military camp order state united',
 'film story life movie character set revolve star director end scene base role relationship people actor plot event',
 'school student teacher girl high college boy class friend parent principal miss classmate university year study dance group',
 'child wife husband baby son daughter affair couple pregnant life married divorce marriage house year live birth mother',
 'gang money bank town steal robbery brother plan horse gangster car thief criminal police pay prison sheriff shoot',
 'kill house sh

# Testing the results on the test dataset

In [None]:
# preprocessed plots for all movies (the same CSV the training sample was drawn from)
all_preprocessed_plots = pd.read_csv('Plots_preprocessed.csv')
# the deduplicated Wikipedia movie plots CSV
data_original = pd.read_csv('wiki_movie_plots_deduped.csv')

In [None]:
# Test plots: every preprocessed row whose index label was NOT drawn into the
# training sample `df`. The mask is built on all_preprocessed_plots' own index
# (rather than looping over range(len(data_original))), so the selection stays
# valid even if the two CSVs do not have the same number of rows, and the
# rows keep their original order.
test_plots = (
    all_preprocessed_plots
    .loc[~all_preprocessed_plots.index.isin(df.index), 'Plot_lemmatize_clean']
    .tolist()
)

In [None]:
# Reuse the vectorizer that was already fitted on the training plots: the NMF
# model was trained on that vocabulary and IDF weighting, so the test plots
# only need to be transformed. Fitting again on the same `plots` would just
# repeat the work.
test_vectors = vectorizer.transform(test_plots)  # (test_plots x vocabulary)

In [None]:
# transform the test vectors with the already-fitted NMF model (no refitting)
test_W = nmf.transform(test_vectors)