# DSCI-508 Project 7
### Matt Snyder

In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk

# To start, we need some text to play with. NLTK has many corpora and resources for you to explore natural language. 
# A one-off run of nltk.download() will get you all the resources in one go. Once you've done that you should have 
# a repository of interesting texts including stuff like Moby Dick and an Inaugural Address Corpus

nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/matt/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews
print(movie_reviews.categories()) # 'pos' (positive) and 'neg' (negative)
# print(movie_reviews.fileids()) # Lists review filenames


['neg', 'pos']


In [4]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from string import punctuation
customStopWords=set(stopwords.words('english')+list(punctuation))

In [6]:
from nltk.probability import FreqDist
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import math

## Create DataFrame of Positive Movie Reviews

In [7]:
pos_fileids = movie_reviews.fileids(categories='pos')

In [8]:
movie_reviews.raw(fileids=pos_fileids[0])

'films adapted from comic books have had plenty of success , whether they\'re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there\'s never really been a comic book like from hell before . \nfor starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid \'80s with a 12-part series called the watchmen . \nto say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \nthe book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \nin other words , don\'t dismiss this film because of its source . \nif you can get past the whole comic book thing , you might find another stumbling block in from hell\'s directors , albert and allen hughes . \ngetting the hughes brothers to direct this seem

In [9]:
# make a dataframe with two columns: text of the movie review, tokenized words (unordered) minus stop words
# also a column for the tf/idf vector
df_dict = {'text':[], 'words':[], 'vector':[]}
for id in pos_fileids:
    df_dict['text'].append (movie_reviews.raw(fileids=id))
    df_dict['words'].append (list(movie_reviews.words(fileids=id)))
    df_dict['vector'].append ([])
df = pd.DataFrame(df_dict)
# remove stopwords and punctuation
df['words'] = df['words'].apply(lambda x: [term for term in x if term not in customStopWords])
df.head()

Unnamed: 0,text,words,vector
0,films adapted from comic books have had plenty...,"[films, adapted, comic, books, plenty, success...",[]
1,every now and then a movie comes along from a ...,"[every, movie, comes, along, suspect, studio, ...",[]
2,you've got mail works alot better than it dese...,"[got, mail, works, alot, better, deserves, ord...",[]
3,""" jaws "" is a rare film that grabs your atten...","[jaws, rare, film, grabs, attention, shows, si...",[]
4,moviemaking is a lot like being the general ma...,"[moviemaking, lot, like, general, manager, nfl...",[]


## Compute Vectors using TF/IDF

In [10]:
# master vector of union of all words in reviews
master_set = set(df.loc[0, 'words'])
for i in range(1, len(df)):
    master_set.update(df.loc[i, 'words']) # accumulate union of words in set
master_array = np.array(list(master_set)) # convert set to array
print (len(master_array))

30236


In [11]:
# compute tf/idf for each review

# first pass, assemble vectors with just term freq
tf_vectors = []
for i in range(0, len(df)):
    fdist = FreqDist(df.loc[i, 'words'])
    vector = np.array([fdist[word] for word in master_array])
    tf_vectors.append(vector)

In [12]:
# using vectors of tf, compute document counts of these same words

# convert non-zero term counts into 1's
def zero_or_not(x):
    return 1 if x > 0 else 0
zero_or_not_vectorized = np.vectorize(zero_or_not)
doc_count_vectors = [zero_or_not_vectorized(v) for v in tf_vectors]

# convert array list to matrix and sum the 1's 
doc_count_matrix = np.stack(doc_count_vectors)
doc_counts_per_word = np.sum(doc_count_matrix, axis=0)
print (doc_counts_per_word[0:20])

[ 2  2 15  1 18  1  1  7  2  6  1  1  6  1  1 23  2  8  2  1]


In [13]:
# compute idf = inverse document frequency
# = log of the number of documents divided by the log of one plus the number of documents containing that word
def idf(doc_count:int, total_docs:int):
    return math.log( total_docs / 1+doc_count )
idf_vectorized = np.vectorize(idf)
total_docs = len(df)
idf_per_word = idf_vectorized(doc_counts_per_word, total_docs)
print (idf_per_word[0:20])

[6.90975328 6.90975328 6.92264389 6.90875478 6.9255952  6.90875478
 6.90875478 6.91473089 6.90975328 6.91373735 6.90875478 6.90875478
 6.91373735 6.90875478 6.90875478 6.93049477 6.90975328 6.91572345
 6.90975328 6.90875478]


In [14]:
# product of tf vectors and idf vector gives final tf/idf vectors
tf_idf_vectors = [v * idf_per_word for v in tf_vectors]
print (tf_idf_vectors[0][0:100])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         6.95749737 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         6.90975328 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


In [15]:
# add tf/idf vector column to dataframe
df['vector'] = tf_idf_vectors
print (df.loc[0, 'vector'][0:100])
df.head()

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         6.95749737 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         6.90975328 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


Unnamed: 0,text,words,vector
0,films adapted from comic books have had plenty...,"[films, adapted, comic, books, plenty, success...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,every now and then a movie comes along from a ...,"[every, movie, comes, along, suspect, studio, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,you've got mail works alot better than it dese...,"[got, mail, works, alot, better, deserves, ord...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,""" jaws "" is a rare film that grabs your atten...","[jaws, rare, film, grabs, attention, shows, si...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,moviemaking is a lot like being the general ma...,"[moviemaking, lot, like, general, manager, nfl...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Test Vectors With Cluster Model

In [16]:
# try making a cluster model with large number of clusters
X = np.stack(df['vector'].values)
print (X.shape)
model = KMeans(n_clusters=100)
model.fit(X)

(1000, 30236)


In [17]:
# add cluster labels onto dataframe
clusters = model.labels_ # labels of all the X data; no need to run predict to get it, since it's saved in model
df['cluster'] = clusters
df.head()

Unnamed: 0,text,words,vector,cluster
0,films adapted from comic books have had plenty...,"[films, adapted, comic, books, plenty, success...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",68
1,every now and then a movie comes along from a ...,"[every, movie, comes, along, suspect, studio, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",68
2,you've got mail works alot better than it dese...,"[got, mail, works, alot, better, deserves, ord...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",22
3,""" jaws "" is a rare film that grabs your atten...","[jaws, rare, film, grabs, attention, shows, si...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",74
4,moviemaking is a lot like being the general ma...,"[moviemaking, lot, like, general, manager, nfl...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",31


In [18]:
# find small clusters, i.e. those containing 2 reviews
hist_counts, bin_edges = np.histogram(clusters, bins=100)
clusters_with_two_reviews = [i for i, count in enumerate(hist_counts) if count == 2]
print (clusters_with_two_reviews)

[5, 9, 10, 23, 33, 51, 53, 80, 89]


In [19]:
# show the reviews from the 2-review clusters
for cluster in clusters_with_two_reviews:
    sel = df.loc[:,'cluster'] == cluster
    similar_reviews_df = df.loc[sel, :]
    review_texts = similar_reviews_df['text'].values
    for i in range(0, len(review_texts)):
        print (review_texts[i][0:500])
        print ()
    print ('----------------------------------------------')

i think the first thing this reviewer should mention is wether or not i am a fan of the x-files . 
first , let me assure you that no prior experience with the series is required to fully enjoy this movie . 
the producers are not stupid , making a movie just for fans of the series is not profitable . 
you have to reach for a larger audience . 
therefore , the movie is quite user-friendly . 
altough , non-fans will only fail to understand certain emotions behind the looks many characters exchange 

i think the first thing this reviewer should mention is wether or not i am a fan of the x-files . 
first , let me assure you that no prior experience with the series is required to fully enjoy this movie . 
the producers are not stupid , making a movie just for fans of the series is not profitable . 
you have to reach for a larger audience . 
therefore , the movie is quite user-friendly . 
altough , non-fans will only fail to understand certain emotions behind the looks many characters exchang

## Conclusion
The reviews in the 2-review clusters do indeed use similar words, and many are about the same movie, therefore TF/IDF can be used to compare two chunks of text.  And many of the results in the 2-review clusters were apparently duplicates, so this is also a useful method for finding duplicates