# Text Blob

Benchmark test using the TextBlob library for sentiment analysis.

In [1]:
# Import necessary modules

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
from tqdm.auto import tqdm
from textblob import TextBlob

from sklearn.model_selection import train_test_split

In [2]:
# Load the review data; the column labels are supplied here via `names`

column_names = ['Sentiment', 'Title', 'Content']
df = pd.read_csv("data/amazon_reviews_small.csv", names=column_names)
df.head()

Unnamed: 0,Sentiment,Title,Content
0,2,Right on the money,We are using the this book to get 100+ certifi...
1,2,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...
2,2,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...
3,1,buyer beware,There are companies selling Bosch knock-offs o...
4,2,Great for those cold winters,If you are looking to keep your water liquifie...


In [3]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
Sentiment    100000 non-null int64
Title        99997 non-null object
Content      100000 non-null object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [4]:
# For empty titles, we can replace it with a blank title.
# The result is assigned back to the column instead of calling
# `df.Title.fillna(..., inplace=True)`: an in-place call on a column accessor
# acts on an intermediate object, which emits a FutureWarning in pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled.
df['Title'] = df['Title'].fillna(" ")

In [5]:
# Sentiment is categorized as 1 and 2 and we will change it to 0 and 1.
# An explicit mapping plus a guard is used instead of subtracting 1, so that
# accidentally re-running this cell fails loudly rather than silently shifting
# the labels a second time (to -1 and 0).
LABEL_MAP = {1: 0, 2: 1}

raw_labels = df['Sentiment'].astype(int)
unexpected_labels = sorted(int(v) for v in set(raw_labels.unique()) - set(LABEL_MAP))
if unexpected_labels:
    raise ValueError(
        f"Expected Sentiment labels {sorted(LABEL_MAP)}, found {unexpected_labels}; "
        "has this cell already been run?"
    )

df['Sentiment'] = raw_labels.map(LABEL_MAP)

TextBlob is a popular Python library that can predict the sentiment of a text. In this notebook, we will use TextBlob as the baseline against which the two NLP models that we will be training are compared.

In [6]:
# Get the sentiment polarity of the title and the content.
# `TextBlob(text).sentiment` is a (polarity, subjectivity) pair; the polarity is
# read by name rather than by position so the intent is explicit.
# `tqdm` (from `tqdm.auto`) replaces the deprecated `tqdm_notebook` helper and
# picks the notebook widget or a console bar depending on the environment.
df['Title_Sentiment'] = [
    TextBlob(title).sentiment.polarity
    for title in tqdm(df['Title'], desc='Title polarity')
]
df['Content_Sentiment'] = [
    TextBlob(content).sentiment.polarity
    for content in tqdm(df['Content'], desc='Content polarity')
]

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [7]:
# Combine the title and content polarity scores into one summed score per review
title_polarity = df['Title_Sentiment']
content_polarity = df['Content_Sentiment']
df['Title_Content_Sent_Sum'] = title_polarity.add(content_polarity)

In [8]:
# Viewing our data: first rows, now including the title, content and summed
# polarity columns added above
df.head()

Unnamed: 0,Sentiment,Title,Content,Title_Sentiment,Content_Sentiment,Title_Content_Sent_Sum
0,1,Right on the money,We are using the this book to get 100+ certifi...,0.285714,0.3125,0.598214
1,1,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...,0.0,0.424167,0.424167
2,1,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...,0.0,0.607764,0.607764
3,0,buyer beware,There are companies selling Bosch knock-offs o...,0.0,0.013889,0.013889
4,1,Great for those cold winters,If you are looking to keep your water liquifie...,0.1,0.0,0.1


In [9]:
# Turn each polarity score into a binary prediction by thresholding at zero:
# scores >= 0 are labelled positive (1), scores < 0 negative (0).
# Note that a neutral score of exactly 0 therefore counts as positive.

score_to_prediction = {
    'Title_Sentiment': 'Title_Pred',
    'Content_Sentiment': 'Content_Pred',
    'Title_Content_Sent_Sum': 'Title_Content_Pred',
}

for score_col, pred_col in score_to_prediction.items():
    df[pred_col] = (df[score_col] >= 0).astype('int64')

df.head()

Unnamed: 0,Sentiment,Title,Content,Title_Sentiment,Content_Sentiment,Title_Content_Sent_Sum,Title_Pred,Content_Pred,Title_Content_Pred
0,1,Right on the money,We are using the this book to get 100+ certifi...,0.285714,0.3125,0.598214,1,1,1
1,1,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...,0.0,0.424167,0.424167,1,1,1
2,1,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...,0.0,0.607764,0.607764,1,1,1
3,0,buyer beware,There are companies selling Bosch knock-offs o...,0.0,0.013889,0.013889,1,1,1
4,1,Great for those cold winters,If you are looking to keep your water liquifie...,0.1,0.0,0.1,1,1,1


In [10]:
# Report the accuracy (share of rows where the prediction equals the true
# label) for each of the three prediction columns

report_rows = (
    ("Title Sentiment Score:\t\t", 'Title_Pred'),
    ("Content Sentiment Score:\t", 'Content_Pred'),
    ("Summed Sentiment Score:\t\t", 'Title_Content_Pred'),
)

for label, pred_col in report_rows:
    n_correct = (df['Sentiment'] == df[pred_col]).values.sum()
    print(f"{label}{n_correct / len(df)}")

Title Sentiment Score:		0.64843
Content Sentiment Score:	0.6552
Summed Sentiment Score:		0.71667


TextBlob scored best when we summed the sentiment polarities of the title and the content, reaching an accuracy of 71.67%, compared with 64.84% using the title alone and 65.52% using the content alone.