# Sentiment Analysis of Comments using Naive Bayes and SVM

## Import Libraries

### Standard Libraries

In [1]:
import pandas as pd
import numpy as np
import re

### SKLearn Libraries

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

## Load Dataset

In [3]:
# Read the cleaned, labelled comments from a tab-separated file.
# No explicit encoding is passed, so pandas uses its default (UTF-8).
# NOTE(review): the path is absolute and machine-specific - adjust it before running elsewhere.
df = pd.read_csv('C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/cleaned/commentsCleaned_NV-SVM.tsv', sep='\t')
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0.1,Unnamed: 0,is_reply,comment_message,comment_published,comment_like_count,attachment_type,Sentiments
0,0,1,[],2019-06-18T00:08:26+0000,0,,Neutral
1,1,0,"['worst', 'card', 'ever', 'lawsuit', 'time']",2019-06-04T19:53:28+0000,6,,Negative
2,2,0,"['think', 'need', 'hear', 'year', 'think', 'bi...",2019-05-14T03:28:34+0000,0,,Neutral
3,3,0,"['long', 'take', 'get', 'refunded', 'money', '...",2019-04-20T20:44:07+0000,1,,Negative
4,4,0,"['really', 'recipe', 'heaven']",2019-03-15T22:37:37+0000,0,,Neutral


In [4]:
# Remove the 'Unnamed: 0' column (presumably a saved index from an earlier export - confirm);
# it carries no information needed for the analysis below.
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Summarise the frame: row count, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3226 entries, 0 to 3225
Data columns (total 6 columns):
is_reply              3226 non-null int64
comment_message       3226 non-null object
comment_published     3226 non-null object
comment_like_count    3226 non-null int64
attachment_type       9 non-null object
Sentiments            3226 non-null object
dtypes: int64(2), object(4)
memory usage: 151.3+ KB


In [6]:
# Keep only the comment text and its sentiment label, as an independent copy
# so later changes to df_sub never touch df.
df_sub = df[['comment_message', 'Sentiments']].copy()

In [7]:
# Preview the first rows of the two-column subset
df_sub.head()

Unnamed: 0,comment_message,Sentiments
0,[],Neutral
1,"['worst', 'card', 'ever', 'lawsuit', 'time']",Negative
2,"['think', 'need', 'hear', 'year', 'think', 'bi...",Neutral
3,"['long', 'take', 'get', 'refunded', 'money', '...",Negative
4,"['really', 'recipe', 'heaven']",Neutral


In [8]:
# Count missing values per column
df_sub.isnull().sum()

comment_message    0
Sentiments         0
dtype: int64

In [9]:
# Reassign the column names positionally: 'comment_message' is kept as-is and
# 'Sentiments' becomes lower-case 'sentiments'.
df_sub.columns = ['comment_message', 'sentiments']

In [10]:
# Preview again to confirm the new column names
df_sub.head()

Unnamed: 0,comment_message,sentiments
0,[],Neutral
1,"['worst', 'card', 'ever', 'lawsuit', 'time']",Negative
2,"['think', 'need', 'hear', 'year', 'think', 'bi...",Neutral
3,"['long', 'take', 'get', 'refunded', 'money', '...",Negative
4,"['really', 'recipe', 'heaven']",Neutral


In [11]:
# Count how many comments fall into each sentiment class
df_sub['sentiments'].value_counts()

Negative    1461
Neutral      885
Positive     880
Name: sentiments, dtype: int64

## Train Test Split

In [12]:
# Features (X) are the comment messages; targets (Y) are the sentiment labels
X, Y = df_sub['comment_message'], df_sub['sentiments']

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=20,
)

# Show the shapes in the order X_train, Y_train, X_test, Y_test
tuple(part.shape for part in (X_train, Y_train, X_test, Y_test))

((2258,), (2258,), (968,), (968,))

## Data Pre-processing

### LabelEncoder

In [13]:
# Encode the string sentiment labels as integers
Encoder = LabelEncoder()
# Learn the label -> integer mapping from the training labels only
Y_train = Encoder.fit_transform(Y_train)
# Reuse that same mapping for the test labels. Refitting on the test split
# could assign different integers if its set of classes differed from the
# training split; transform() instead raises on a label it has not seen.
Y_test = Encoder.transform(Y_test)

### Word Vectorization using TFIDF

In [14]:
# TF-IDF vectorizer with the vocabulary capped at 5000 terms
Tfidf_vect = TfidfVectorizer(max_features=5000)
# Learn the vocabulary and IDF weights from the training comments and vectorize them in one step
X_train_Tfidf = Tfidf_vect.fit_transform(X_train)
# Vectorize the test comments with the vocabulary learned from the training set
X_test_Tfidf = Tfidf_vect.transform(X_test)

### Word Vectorization using CountVectorizer

In [15]:
# Bag-of-words vectorizer with default settings
Count_vect = CountVectorizer()
# Learn the vocabulary from the training comments and count their terms in one step
X_train_Cvect = Count_vect.fit_transform(X_train)
# Count terms in the test comments using the vocabulary learned from the training set
X_test_Cvect = Count_vect.transform(X_test)

## Sentiment Analysis - Naive Bayes

### Using TFIDF

In [16]:
# Multinomial Naive Bayes with smoothing parameter alpha=0.1
Naive = MultinomialNB(alpha=0.1)
# Train on the TF-IDF features of the training comments
Naive.fit(X_train_Tfidf, Y_train)
# Predict sentiment labels for the held-out comments
Y_pred_TNB = Naive.predict(X_test_Tfidf)
# accuracy_score takes (y_true, y_pred); multiply by 100 to report a percentage
accuracy_NB_Tfidf = accuracy_score(Y_test, Y_pred_TNB)*100
print('Naive Bayes using TFIDF Accuracy Score -> ', accuracy_NB_Tfidf)

Naive Bayes using TFIDF Accuracy Score ->  74.79338842975206


### Using CountVectorizer

In [17]:
# Refit the same NB estimator on the count features; this replaces the
# TF-IDF model previously held in `Naive`.
Naive.fit(X_train_Cvect, Y_train)
# Predict sentiment labels for the held-out comments
Y_pred_CNB = Naive.predict(X_test_Cvect)
# accuracy_score takes (y_true, y_pred); multiply by 100 to report a percentage
accuracy_NB_Cvect = accuracy_score(Y_test, Y_pred_CNB)*100
print('Naive Bayes using CountVectorizer Accuracy Score -> ', accuracy_NB_Cvect)

Naive Bayes using CountVectorizer Accuracy Score ->  74.27685950413223


## Sentiment Analysis - SVM

### Using TFIDF

In [18]:
# Linear-kernel support vector classifier.
# degree and gamma only apply to non-linear kernels, so they have no effect here.
svm = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
# Train on the TF-IDF features of the training comments
svm.fit(X_train_Tfidf, Y_train)
# Predict sentiment labels for the held-out comments
Y_pred_TSVM = svm.predict(X_test_Tfidf)
# accuracy_score takes (y_true, y_pred); multiply by 100 to report a percentage
accuracy_SVM_Tfidf = accuracy_score(Y_test, Y_pred_TSVM)*100
print('SVM using TFIDF Accuracy Score -> ', accuracy_SVM_Tfidf)

SVM using TFIDF Accuracy Score ->  80.26859504132231


### Using CountVectorizer

In [19]:
# Refit the same SVC estimator on the count features; this replaces the
# TF-IDF model previously held in `svm`.
svm.fit(X_train_Cvect, Y_train)
# Predict sentiment labels for the held-out comments
y_pred_CSVM = svm.predict(X_test_Cvect)
# accuracy_score takes (y_true, y_pred); multiply by 100 to report a percentage
accuracy_SVM_Cvect = accuracy_score(Y_test, y_pred_CSVM)*100
print('SVM using CountVectorizer Accuracy Score -> ', accuracy_SVM_Cvect)

SVM using CountVectorizer Accuracy Score ->  80.99173553719008


## Create DataFrame for Accuracy Scores

In [20]:
# Accuracy (in percent) for each classifier / vectorizer pairing
accuracy_by_model = {
    'NB-CVect': accuracy_NB_Cvect,
    'NB-TFIDF': accuracy_NB_Tfidf,
    'SVM-CVect': accuracy_SVM_Cvect,
    'SVM-TFIDF': accuracy_SVM_Tfidf,
}

# Wrap every value in a one-element Series so the resulting frame has a
# single row labelled 'Accuracy Score' and one column per pairing.
score_dict = {
    label: pd.Series([accuracy], index=['Accuracy Score'])
    for label, accuracy in accuracy_by_model.items()
}

scores = pd.DataFrame(score_dict)

In [21]:
# Display the one-row accuracy table
scores.head()

Unnamed: 0,NB-CVect,NB-TFIDF,SVM-CVect,SVM-TFIDF
Accuracy Score,74.27686,74.793388,80.991736,80.268595


## Save scores as scores_NB_SVM.tsv

In [22]:
# Write the accuracy table to a tab-separated file; the row label is written as the first column.
# NOTE(review): the path is absolute and machine-specific - adjust it before running elsewhere.
scores.to_csv('C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/results/scores_NB_SVM.tsv', sep='\t')