## Text Analysis of Last Jedi Audience Reviews
This notebook assumes you have already scraped audience reviews from Rotten Tomatoes. This notebook will analyze frequence of ngrams and try some text classifiers.

In [4]:
#load libraries
import pandas as pd
import nltk
import os
import string

In [7]:
#load the reviews file
os.chdir(r"O:\PDES\PRISM\Sullivan\Personal Projects")
reviewtbl = pd.read_csv("RT_Last_Jedi_2017-12-28.txt", sep="\t")

#check - well at least the export/import fixes some of my string issues
reviewtbl.head()

Unnamed: 0,userid,username,rating,text
0,840073561,['Jeffrey O'],2.5,"[""At least it was sort of original? Other than..."
1,977007867,['Claire R'],0.5,"[""The acting was great but the story writing w..."
2,976967449,['Caleb D'],0.5,['Would not recommend for so many reasons. Thi...
3,977007880,['James F'],0.5,['Horrible. Just watch any of the Youtube stuf...
4,977007877,['Michael C'],4.0,['Really good movie better than Force Awakens ...


In [70]:
#setup stop words
from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))
mywords = set(["star", "wars", "movie", "film"])
#add custom words
stop_words = set(stopwords.words('english')) | mywords

In [71]:

from nltk.stem.wordnet import WordNetLemmatizer

#clean and stem the review text
def txtclean(mytxt, stop_words, lmtzr):
    #convert to list of words
    tokens = nltk.word_tokenize(mytxt)
    
    #keep only alphanumreic
    words = [x for x in [re.sub("\W","",x) for x in tokens] if x.isalpha()]
    
    #lemmatize  - odd treatment of 'was'
    std_words = [lmtzr.lemmatize(t) for t in words]
    
    #remove standard stop words (convert to lower) and return
    return [w for w in [x.lower() for x in std_words] if not w in stop_words]

#example
txtclean(reviewtbl["text"][14], stop_words, WordNetLemmatizer())

['saw',
 'weekend',
 'afterwards',
 'really',
 'think',
 'lot',
 'mixed',
 'feeling',
 'lot',
 'thinking',
 'determined',
 'really',
 'liked',
 'wa',
 'different',
 'made',
 'lot',
 'bold',
 'choice',
 'opinion',
 'give',
 'credit',
 'lot',
 'thing',
 'unexpected',
 'really',
 'thought',
 'felt',
 'like',
 'really',
 'good',
 'idea',
 'acting',
 'good',
 'especially',
 'mark',
 'hamill',
 'carrie',
 'fisher',
 'comedy',
 'usually',
 'good',
 'visuals',
 'course',
 'good',
 'really',
 'enjoyed',
 'stuff',
 'rey',
 'luke',
 'part',
 'dragged',
 'wa',
 'subplot',
 'finn',
 'rose',
 'rose',
 'character',
 'care',
 'much',
 'even',
 'though',
 'wa',
 'low',
 'point',
 'wa',
 'terrible',
 'overall',
 'wa',
 'great',
 'feel',
 'fresh',
 'really',
 'think',
 'appreciate',
 'would',
 'say',
 'like',
 'one',
 'bit',
 'force',
 'awakens']

In [40]:
from nltk.stem.wordnet import WordNetLemmatizer
#[x.lower() for x in baz]
lmtzr = WordNetLemmatizer()
[lmtzr.lemmatize(t) for t in baz]

['I',
 'saw',
 'this',
 'movie',
 'on',
 'the',
 'weekend',
 'and',
 'afterwards',
 'I',
 'really',
 'had',
 'to',
 'think',
 'about',
 'it',
 'because',
 'I',
 'had',
 'a',
 'lot',
 'of',
 'mixed',
 'feeling',
 'After',
 'a',
 'lot',
 'of',
 'thinking',
 'I',
 'have',
 'determined',
 'that',
 'I',
 'really',
 'liked',
 'this',
 'movie',
 'It',
 'wa',
 'very',
 'different',
 'from',
 'the',
 'other',
 'Star',
 'Wars',
 'film',
 'and',
 'made',
 'a',
 'lot',
 'of',
 'bold',
 'choice',
 'in',
 'my',
 'opinion',
 'which',
 'I',
 'have',
 'to',
 'give',
 'credit',
 'for',
 'There',
 'were',
 'a',
 'lot',
 'of',
 'thing',
 'that',
 'were',
 'unexpected',
 'that',
 'when',
 'I',
 'really',
 'thought',
 'about',
 'it',
 'it',
 'felt',
 'like',
 'they',
 'were',
 'really',
 'good',
 'idea',
 'All',
 'the',
 'acting',
 'in',
 'this',
 'film',
 'is',
 'very',
 'good',
 'especially',
 'from',
 'Mark',
 'Hamill',
 'and',
 'Carrie',
 'Fisher',
 'The',
 'comedy',
 'is',
 'usually',
 'good',
 'and',


In [72]:
#put cleaned results back into a long string for use by sklearn
testwrds = txtclean(reviewtbl["text"][14], stop_words, WordNetLemmatizer())
" ".join(testwrds)

reviewtbl['clean text'] = [" ".join(txtclean(x, stop_words, WordNetLemmatizer())) for x in reviewtbl['text']]
reviewtbl.head()

Unnamed: 0,userid,username,rating,text,clean text
0,840073561,['Jeffrey O'],2.5,"[""At least it was sort of original? Other than...",least wa sort original story hoaky dialogue ab...
1,977007867,['Claire R'],0.5,"[""The acting was great but the story writing w...",acting wa great story writing wa aweful know q...
2,976967449,['Caleb D'],0.5,['Would not recommend for so many reasons. Thi...,would recommend many reason wa poorly written ...
3,977007880,['James F'],0.5,['Horrible. Just watch any of the Youtube stuf...,horrible watch youtube stuff skewering detail
4,977007877,['Michael C'],4.0,['Really good movie better than Force Awakens ...,really good better force awakens equal return ...


In [46]:
#scratch - testing nltk ngram approach
testwrds = txtclean(reviewtbl["text"][14], stop_words, WordNetLemmatizer())
bgs = nltk.bigrams(testwrds)

fdist = nltk.FreqDist(testwrds)
for k,v in fdist.items():
    print(k,v)

overall 1
bit 1
rey 1
weekend 1
rose 2
great 1
fresh 1
course 1
different 1
idea 1
even 1
appreciate 1
usually 1
mixed 1
low 1
think 2
point 1
awakens 1
carrie 1
star 2
subplot 1
mark 1
bold 1
though 1
opinion 1
felt 1
character 1
thinking 1
stuff 1
feel 1
luke 1
would 1
much 1
terrible 1
one 1
determined 1
movie 3
made 1
good 4
film 3
give 1
feeling 1
fisher 1
dragged 1
afterwards 1
like 2
wa 5
hamill 1
part 1
credit 1
finn 1
say 1
saw 1
choice 1
thought 1
especially 1
care 1
really 6
thing 1
unexpected 1
force 1
lot 4
wars 2
comedy 1
acting 1
liked 1
visuals 1
enjoyed 1


In [73]:
#method from stackoveflow
from sklearn.feature_extraction.text import CountVectorizer
word_vectorizer = CountVectorizer(ngram_range=(1,3), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(reviewtbl[reviewtbl["rating"]<3.5]["clean text"])
frequencies = sum(sparse_matrix).toarray()[0]
allngrams = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])

In [74]:
allngrams['n'] = [len(x.split()) for x in allngrams.index]
allngrams[allngrams['n']==2].sort_values('frequency', ascending=False)

Unnamed: 0,frequency,n
last jedi,139,2
force awakens,109,2
rian johnson,102,2
luke skywalker,97,2
plot hole,91,2
first order,46,2
original trilogy,44,2
character development,43,2
feel like,41,2
kylo ren,41,2
