# Clustering The Last Jedi Reviews
This notebook does a simple k-means clustering trial on Rotten Tomatoes' User Reviews of the Last Jedi.
Vectorizing features via tf-idf.
Results are not so informative.

In [5]:
# Setup
from __future__ import print_function

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np
import pandas as pd
import os

In [6]:
# Load saved TLJ review data
#load the reviews file
os.chdir(r"O:\PDES\PRISM\Sullivan\Personal Projects")
reviewtbl = pd.read_csv("RT_Last_Jedi_2017-12-28.txt", sep="\t")

#check - well at least the export/import fixes some of my string issues
reviewtbl.head()

Unnamed: 0,userid,username,rating,text
0,840073561,['Jeffrey O'],2.5,"[""At least it was sort of original? Other than..."
1,977007867,['Claire R'],0.5,"[""The acting was great but the story writing w..."
2,976967449,['Caleb D'],0.5,['Would not recommend for so many reasons. Thi...
3,977007880,['James F'],0.5,['Horrible. Just watch any of the Youtube stuf...
4,977007877,['Michael C'],4.0,['Really good movie better than Force Awakens ...


In [22]:
# Do TF-IDF Vectorizing
#custom stop words
from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))
mywords = set(["star", "wars", "movie", "film", "wa", "last", "jedi"]) #'wa' to handle tokenizer's truncation of was
#add custom words
stop_words = set(stopwords.words('english')) | mywords

t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5,
                            min_df=2, stop_words=stop_words,
                            use_idf=True,
                            ngram_range=(1,2))
X = vectorizer.fit_transform(reviewtbl[reviewtbl["rating"]<=3.5]["text"].unique())

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

done in 0.234375s
n_samples: 672, n_features: 4372


In [23]:
# Do the Clustering
kguess = 2
km = KMeans(n_clusters=kguess, init='k-means++', max_iter=100, n_init=1,
                verbose=False)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
#prints top terms per cluster
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(kguess):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s\n' % terms[ind], end='')
    print()


Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)
done in 0.188s

Cluster 0: luke
 like
 force
 story
 characters
 disney
 johnson
 rian
 new
 one

Cluster 1: worst
 plot
 bad
 good
 like
 terrible
 disappointing
 plot holes
 holes
 ever

