# 3.2 Exercises - Sentiment Analysis and Preprocessing Text
# Rahul Rajeev

In [133]:
# libraries
import numpy as np
import pandas as pd

## Part 1 Using the Textblob Sentiment Analyzer

In [134]:
# 1. loading in dataset
review_data=pd.read_csv('labeledTrainData.tsv',sep='\t')
review_data

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [135]:
# 2. count the number of positive and negative reviews
review_data.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

There are an equal amount of positive and negative reviews, about 12500 of each.

In [136]:
# importing TextBlob
from textblob import TextBlob

In [137]:
review_data['review'][1]

'\\The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [138]:
# 3. Use Textblob to classify each movie review as positive or negative
# greater than or equal to zero is a positive sentiment, and below zero is a negative sentiment
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

review_data['Polarity'] = review_data['review'].apply(getPolarity)

In [139]:
# function to say whether the sentiment is positive or negative
def getAnalysis(score):
    if score < 0:
        return 0
    elif score >= 0:
        return 1
    else: 
        return NaN

# applying this funciton to the entire polarity column
review_data['Analysis'] = review_data['Polarity'].apply(getAnalysis)

In [141]:
# iterating through each element and writing whether the prediction was correct or not, comparing the sentiment of the dataset
# to the TextBlob analysis.
review_data['Accuracy'] = ''
for i in range(len(review_data['Analysis'])):
    review_data['Accuracy'][i] = review_data['Analysis'][i] == review_data['sentiment'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_data['Accuracy'][i] = review_data['Analysis'][i] == review_data['sentiment'][i]


In [142]:
# checking whether the above code worked
review_data.head()

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,True
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,True
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,True
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,False
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,False


In [143]:
# calculating the accuracy, dividing the length of the subset where Analysis is true divided by the length of the entire column
accuracy = len(review_data[review_data['Analysis'] == True]) / len(review_data['Analysis'])
print('The accuracy of TextBlob without text cleaning is:', accuracy*100, '%.')

The accuracy of TextBlob without text cleaning is: 76.068 %.


In [144]:
# 5. importing VADER
import vaderSentiment

In [145]:
# initializing the sentiment intensity analyzer object
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj = SentimentIntensityAnalyzer()

In [146]:
# creating a helper function that calls the polarity and withdraws the compound part from the dictionary
def VaderPolarity(text):
    return sid_obj.polarity_scores(text)['compound']

review_data['VPolarity'] = review_data['review'].apply(VaderPolarity)

In [147]:
review_data.head()

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy,VPolarity
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,True,-0.8879
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,True,0.9736
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,True,-0.9883
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,False,-0.1202
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,False,0.6115


In [148]:
# apply helper function to create the analysis
review_data['VAnalysis'] = review_data['VPolarity'].apply(getAnalysis)

In [150]:
# creating a dummy column
review_data['VAccuracy'] = ''

# creating actual column that compares the results with the original sentiment
for i in range(len(review_data['Analysis'])):
    review_data['VAccuracy'][i] = review_data['VAnalysis'][i] == review_data['sentiment'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_data['VAccuracy'][i] = review_data['VAnalysis'][i] == review_data['sentiment'][i]


In [151]:
# checking data head
review_data.head()

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy,VPolarity,VAnalysis,VAccuracy
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,True,-0.8879,0,False
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,True,0.9736,1,True
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,True,-0.9883,0,True
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,False,-0.1202,0,True
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,False,0.6115,1,True


In [153]:
# calculating accuracy
vaccuracy = len(review_data[review_data['VAnalysis'] == True]) / len(review_data['VAnalysis'])
print('The accuracy of Vader without text cleaning is:', vaccuracy*100, '%.')

The accuracy of Vader without text cleaning is: 66.444 %.


## Part 2 Prepping Text for a Custom Model

In [154]:
# 1. convering all characters to lowercase
review_data['review'] = review_data['review'].str.lower()
review_data

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy,VPolarity,VAnalysis,VAccuracy
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,True,-0.8879,0,False
1,2381_9,1,"\the classic war of the worlds\"" by timothy hi...",0.256349,1,True,0.9736,1,True
2,7759_3,0,the film starts with a manager (nicholas bell)...,-0.053941,0,True,-0.9883,0,True
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,False,-0.1202,0,True
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,False,0.6115,1,True
...,...,...,...,...,...,...,...,...,...
24995,3453_3,0,it seems like more consideration has gone into...,0.102083,1,False,0.8750,1,False
24996,5064_1,0,i don't believe they made this film. completel...,0.090813,1,False,0.9861,1,False
24997,10905_3,0,"guy is a loser. can't get girls, needs to buil...",0.145256,1,False,0.9252,1,False
24998,10194_3,0,this 30 minute documentary buñuel made in the ...,0.065625,1,False,-0.9598,0,True


In [155]:
import re

In [156]:
# 2. remove punctuation and special characters from the text using apply and a lambda function
review_data['review'] = review_data['review'].apply(lambda x: re.sub('[^A-Za-z0-9]', ' ', x))
review_data

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy,VPolarity,VAnalysis,VAccuracy
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,True,-0.8879,0,False
1,2381_9,1,the classic war of the worlds by timothy hi...,0.256349,1,True,0.9736,1,True
2,7759_3,0,the film starts with a manager nicholas bell ...,-0.053941,0,True,-0.9883,0,True
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,False,-0.1202,0,True
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,False,0.6115,1,True
...,...,...,...,...,...,...,...,...,...
24995,3453_3,0,it seems like more consideration has gone into...,0.102083,1,False,0.8750,1,False
24996,5064_1,0,i don t believe they made this film completel...,0.090813,1,False,0.9861,1,False
24997,10905_3,0,guy is a loser can t get girls needs to buil...,0.145256,1,False,0.9252,1,False
24998,10194_3,0,this 30 minute documentary bu uel made in the ...,0.065625,1,False,-0.9598,0,True


In [157]:
# 3. Remove stop words.

# import nltk and download stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [158]:
# import stopwords from nltk, and initialize the stop words list in english
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [159]:
# apply the lambda function that joins the string together as long as the split string words are not in stop list
# and create a new column
review_data['review_without_stopwords'] = review_data['review'].apply(lambda x: ' '.join(
    [word for word in x.split() if word not in (stop)]))

In [160]:
# making sure the review_without_stopwords is a new column
review_data.head()

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy,VPolarity,VAnalysis,VAccuracy,review_without_stopwords
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,True,-0.8879,0,False,stuff going moment mj started listening music ...
1,2381_9,1,the classic war of the worlds by timothy hi...,0.256349,1,True,0.9736,1,True,classic war worlds timothy hines entertaining ...
2,7759_3,0,the film starts with a manager nicholas bell ...,-0.053941,0,True,-0.9883,0,True,film starts manager nicholas bell giving welco...
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,False,-0.1202,0,True,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,False,0.6115,1,True,superbly trashy wondrously unpretentious 80 ex...


In [161]:
# 4. Apply NLTK PorterStemmer
from nltk.stem.porter import PorterStemmer

In [162]:
# initializing 
stemmer = PorterStemmer()

In [163]:
# creating helper function that helps iterate through every word in each review sentence and apply the porter stemmer
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# creating stemmed column with the results
review_data['stemmed'] = review_data['review_without_stopwords'].apply(stem_sentences)

In [164]:
# checking whether the porter stemmer worked
review_data.head()

Unnamed: 0,id,sentiment,review,Polarity,Analysis,Accuracy,VPolarity,VAnalysis,VAccuracy,review_without_stopwords,stemmed
0,5814_8,1,with all this stuff going down at the moment w...,0.001277,1,True,-0.8879,0,False,stuff going moment mj started listening music ...,stuff go moment mj start listen music watch od...
1,2381_9,1,the classic war of the worlds by timothy hi...,0.256349,1,True,0.9736,1,True,classic war worlds timothy hines entertaining ...,classic war world timothi hine entertain film ...
2,7759_3,0,the film starts with a manager nicholas bell ...,-0.053941,0,True,-0.9883,0,True,film starts manager nicholas bell giving welco...,film start manag nichola bell give welcom inve...
3,3630_4,0,it must be assumed that those who praised this...,0.134753,1,False,-0.1202,0,True,must assumed praised film greatest filmed oper...,must assum prais film greatest film opera ever...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,-0.024842,0,False,0.6115,1,True,superbly trashy wondrously unpretentious 80 ex...,superbl trashi wondrous unpretenti 80 exploit ...


In [181]:
# 5. bag-of-words matrix from the stemmed text
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [174]:
# initializing count vectorizer object
count = CountVectorizer()
# calling the fit_transform across every row in the stemmed column
bag_of_words = count.fit_transform(review_data['stemmed'].values)
# displaying matrix with dimensions
bag_of_words

<25000x50785 sparse matrix of type '<class 'numpy.int64'>'
	with 2383274 stored elements in Compressed Sparse Row format>

The rows of the bag_of_words matrix match the rows of the dataframe, 25000.

In [175]:
# for fun transforming into array, but it did take a long time.
bag_of_words.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [179]:
# get_features_names wasn't working, so I found an alternate method to finding the counts
count.vocabulary_

{'stuff': 42952,
 'go': 18335,
 'moment': 29569,
 'mj': 29412,
 'start': 42364,
 'listen': 26221,
 'music': 30294,
 'watch': 48811,
 'odd': 31846,
 'documentari': 12805,
 'wiz': 49621,
 'moonwalk': 29740,
 'mayb': 28074,
 'want': 48708,
 'get': 17911,
 'certain': 8032,
 'insight': 22392,
 'guy': 19326,
 'thought': 44830,
 'realli': 36466,
 'cool': 9983,
 'eighti': 14022,
 'make': 27295,
 'mind': 29116,
 'whether': 49191,
 'guilti': 19192,
 'innoc': 22346,
 'part': 33138,
 'biographi': 5243,
 'featur': 15709,
 'film': 16024,
 'rememb': 36926,
 'see': 39455,
 'cinema': 8806,
 'origin': 32281,
 'releas': 36875,
 'subtl': 43093,
 'messag': 28746,
 'feel': 15740,
 'toward': 45499,
 'press': 35025,
 'also': 2039,
 'obviou': 31789,
 'drug': 13378,
 'bad': 3862,
 'kay': 24049,
 'br': 6172,
 'visual': 48345,
 'impress': 21937,
 'cours': 10280,
 'michael': 28879,
 'jackson': 22950,
 'unless': 47086,
 'remot': 36948,
 'like': 26053,
 'anyway': 2669,
 'hate': 19928,
 'find': 16080,
 'bore': 6008,


In [182]:
# initializing term frequency-inverse document
tfidf = TfidfVectorizer()
# calling fit transform across every row
feature_matrix = tfidf.fit_transform(review_data['stemmed'].values)

In [183]:
# checking dimensions
feature_matrix

<25000x50785 sparse matrix of type '<class 'numpy.float64'>'
	with 2383274 stored elements in Compressed Sparse Row format>

The rows of the term frequency-inverse document matrix match the rows of the dataframe, 25000.