# Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("resources/sentimentdata.csv")
df.shape

(20, 2)

In [3]:
df.head(10)

Unnamed: 0,document,label
0,Pizza is great and I love pizza.,Positive
1,I hate burger and its bad to eat burger.,Negative
2,I hate dirty tables.,Negative
3,Burger is amazing and I love it more than anyt...,Positive
4,My boss is a monster and I hate him,Negative
5,The food was delivered late and I hate late de...,Negative
6,My wife love pizza and burger more than me,Positive
7,the table was bad and dirty and i hate this,Negative
8,Food was delicious and I love it,Positive
9,It great to have good food at good time,Positive


In [4]:
x = df.document
y = df.label

## Count Vectorization

In [5]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
dic = list(ENGLISH_STOP_WORDS)
dic.remove('not')

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(lowercase=True,stop_words='english')
vec.fit(x)

print(vec.get_feature_names())
print(len(vec.get_feature_names()))

['amazing', 'bad', 'boss', 'burger', 'delicious', 'delivered', 'delivery', 'dirty', 'eat', 'food', 'good', 'great', 'hate', 'icecream', 'juice', 'ketchup', 'late', 'love', 'monster', 'pizza', 'table', 'tables', 'time', 'wife']
24


In [7]:
x2 = vec.transform(x).toarray()
print(x2.shape)

(20, 24)


In [8]:
pd.DataFrame(x2,columns=vec.get_feature_names())

Unnamed: 0,amazing,bad,boss,burger,delicious,delivered,delivery,dirty,eat,food,...,juice,ketchup,late,love,monster,pizza,table,tables,time,wife
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,2,0,0,0,0
1,0,1,0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,1,1,0,0,1,...,0,0,2,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
7,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


## Apply ML

In [9]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(x2,y)

RandomForestClassifier()

In [10]:
newdoc = ["Pasta is great and i love pasta"]
model.predict(vec.transform(newdoc))

array(['Positive'], dtype=object)

In [11]:
newdoc = ["Pasta is bad and i hate pasta"]
model.predict(vec.transform(newdoc))

array(['Negative'], dtype=object)

In [21]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
dic = list(ENGLISH_STOP_WORDS)
dic.remove('not')

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(lowercase=True,stop_words=dic)
vec.fit(x)

print(vec.get_feature_names())
print(len(vec.get_feature_names()))

['amazing', 'bad', 'boss', 'burger', 'delicious', 'delivered', 'delivery', 'dirty', 'eat', 'food', 'good', 'great', 'hate', 'icecream', 'juice', 'ketchup', 'late', 'love', 'monster', 'not', 'pizza', 'table', 'tables', 'time', 'wife']
25


In [12]:
newdoc = ["Pasta is good"]
model.predict(vec.transform(newdoc))

array(['Positive'], dtype=object)

In [13]:
newdoc = ["Pasta is not good"]
model.predict(vec.transform(newdoc))

array(['Positive'], dtype=object)

In [14]:
newdoc = ["Pasta is bad"]
model.predict(vec.transform(newdoc))

array(['Negative'], dtype=object)

In [15]:
newdoc = ["Pasta is not bad"]
model.predict(vec.transform(newdoc))

array(['Negative'], dtype=object)

In [16]:
newdoc = ["Pasta is good"]
model.predict(vec.transform(newdoc))
model.predict_proba(vec.transform(newdoc))

array([[0.391, 0.609]])

In [17]:
newdoc = ["Today is monday and pasta is good"]
model.predict(vec.transform(newdoc))
model.predict_proba(vec.transform(newdoc))

array([[0.391, 0.609]])

In [18]:
newdoc = ["Today is monday and pasta is not good"]
print(model.predict(vec.transform(newdoc)))
model.predict_proba(vec.transform(newdoc))

['Positive']


array([[0.391, 0.609]])

In [19]:
newdoc = ["Today is not monday and pasta is good"]
print(model.predict(vec.transform(newdoc)))
model.predict_proba(vec.transform(newdoc))

['Positive']


array([[0.391, 0.609]])

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(lowercase=True,stop_words=dic,ngram_range=(1,2),
min_df=2,max_df=0.7)
vec.fit(x)

print(vec.get_feature_names())
print(len(vec.get_feature_names()))

['bad', 'burger', 'burger bad', 'dirty', 'food', 'good', 'great', 'great good', 'hate', 'icecream', 'icecream not', 'juice', 'ketchup', 'ketchup pizza', 'love', 'love pizza', 'not', 'not bad', 'not good', 'pizza', 'pizza great']
21
