# Week 3 Exercise

In [39]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from pattern.en import sentiment
import re
import unicodedata
import sys
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


## TextBlob Sentiment Analyzer

##### Importing Movie Review Data

In [40]:
# path to the tab-separated file of labeled movie reviews
file_name = "labeledTrainData.tsv"
# read the file into a DataFrame, then preview its first five rows
reviews = pd.read_csv(file_name, delimiter="\t")
reviews.head(5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


##### How Many Positive/Negative Reviews?

In [41]:
# count the labeled reviews per class by summing a boolean mask
# (sentiment == 1 is counted as positive, sentiment == 0 as negative)
positive = int(reviews["sentiment"].eq(1).sum())
negative = int(reviews["sentiment"].eq(0).sum())
# report the class balance
print(f"Number of positive reviews:{positive}\n"
      f"Number of negative reviews:{negative}")

Number of positive reviews:12500
Number of negative reviews:12500


##### TextBlob Sentiment Classification

In [42]:
# TextBlob classification: a review is labeled 1 (positive) when its polarity
# is >= 0 and 0 (negative) otherwise, so a polarity of exactly 0 counts as positive
sentiments = [int(TextBlob(text).sentiment.polarity >= 0) for text in reviews["review"]]

In [43]:
# store the TextBlob predictions as a new column next to the true labels
reviews["sentiment_tb"] = sentiments
# count the predicted labels per class by summing a boolean mask
positive2 = int(reviews["sentiment_tb"].eq(1).sum())
negative2 = int(reviews["sentiment_tb"].eq(0).sum())
# report how many reviews TextBlob placed in each class
print(f"Number of positive reviews:{positive2}\n"
      f"Number of negative reviews:{negative2}")

Number of positive reviews:19017
Number of negative reviews:5983


##### Accuracy of TextBlob Model

In [44]:
# number of reviews where the TextBlob label agrees with the labeled sentiment
correct = int((reviews["sentiment"] == reviews["sentiment_tb"]).sum())
# accuracy = number of correct predictions / total number of reviews
accuracy = correct / len(reviews)
print(f"Accuracy of TextBlob Sentiments: {accuracy * 100}%")

Accuracy of TextBlob Sentiments: 68.524%


##### Extra Credit: Sentiment Analysis Using Pattern

In [45]:
# Pattern classification: the first element of sentiment()'s result is used as
# the polarity; a review is labeled 1 (positive) when it is >= 0, otherwise 0 (negative)
sentiments2 = [int(sentiment(text)[0] >= 0) for text in reviews["review"]]

In [46]:
# store the Pattern predictions as a new column next to the true labels
reviews["sentiment_pat"] = sentiments2
# count the predicted labels per class by summing a boolean mask
positive3 = int(reviews["sentiment_pat"].eq(1).sum())
negative3 = int(reviews["sentiment_pat"].eq(0).sum())
# report how many reviews Pattern placed in each class
print(f"Number of positive reviews:{positive3}\n"
      f"Number of negative reviews:{negative3}")

Number of positive reviews:18918
Number of negative reviews:6082


##### Accuracy of Pattern Model

In [47]:
# number of reviews where the Pattern label agrees with the labeled sentiment
correct2 = int((reviews["sentiment"] == reviews["sentiment_pat"]).sum())
# accuracy = number of correct predictions / total number of reviews
accuracy2 = correct2 / len(reviews)
print(f"Accuracy of Pattern Sentiments: {accuracy2 * 100}%")

Accuracy of Pattern Sentiments: 68.848%


## Prepping Text for a Custom Model

In [48]:
# make sure the NLTK stop-word corpus is available so the notebook runs on a
# fresh environment; the download is attempted only when the corpus is missing
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

In [49]:
def clean_reviews(reviews):
    """Prepare a pandas Series of review text for use with the vectorizers.

    Each review is lowercased, stripped of every character that is not an
    ASCII letter, a digit or a space, tokenized into words, filtered against
    the NLTK English stop-word list, stemmed with the Porter stemmer, and
    finally joined back into one space-separated string.

    Parameters
    ----------
    reviews : pandas.Series
        Review texts; must support the ``.str`` accessor.

    Returns
    -------
    list of str
        One cleaned string per input review, in the input order.
    """
    # convert all text to lowercase
    lowercase = reviews.str.lower()
    # compiled once: matches runs of punctuation and special characters.
    # NOTE(review): matches are deleted rather than replaced by a space, so
    # words separated only by punctuation, markup or newlines are fused into
    # a single token -- confirm this is intended.
    strip_pattern = re.compile('[^A-Za-z0-9 ]+')
    # english stop words, kept in a set so each membership test is O(1)
    # instead of a linear scan of the list for every token
    stop_words = set(stopwords.words("english"))
    porter = PorterStemmer()

    cleaned = []
    for review in lowercase:
        # remove punctuation/special characters, then tokenize into words
        tokens = word_tokenize(strip_pattern.sub('', review))
        # drop stop words and stem what remains
        stems = [porter.stem(word) for word in tokens if word not in stop_words]
        # join words back together for use with the vectorizers
        cleaned.append(" ".join(stems))
    return cleaned

In [50]:
# run the cleaning pipeline on the review text; the resulting list of strings
# is the input for both the bag-of-words and the tf-idf vectorizers
cleaned = clean_reviews(reviews["review"])

In [51]:
# create bag of words: one row per review, one column per vocabulary term
count = CountVectorizer()
bag_of_words = count.fit_transform(cleaned)
# make sure it's same number of rows as original df
assert bag_of_words.shape[0] == len(reviews), "bag of words lost or gained rows"
# display the matrix summary (shape, dtype, number of stored elements)
bag_of_words

<25000x92331 sparse matrix of type '<class 'numpy.int64'>'
	with 2438710 stored elements in Compressed Sparse Row format>

In [52]:
# make tf-idf: fit the vectorizer on the cleaned reviews and build the matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(cleaned)
# every cleaned review must produce exactly one row
assert feature_matrix.shape[0] == len(cleaned), "tf-idf matrix lost or gained rows"
# display the matrix summary (shape, dtype, number of stored elements)
feature_matrix

<25000x92331 sparse matrix of type '<class 'numpy.float64'>'
	with 2438710 stored elements in Compressed Sparse Row format>