# Sentiment Analysis of Comments using Naive Bayes and SVM

## Import Libraries

### Standard Libraries

In [1]:
import pandas as pd
import numpy as np
import re

### SKLearn Libraries

In [2]:
# Classification models
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

## Load Dataset

In [3]:
# Load the cleaned, labelled comments from a tab-separated file.
# No explicit encoding is passed, so pandas falls back to its default.
df = pd.read_csv(
    'commentsCleaned_NV-SVM.tsv',
    sep='\t',
)
# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,post_id,is_reply,comment_message,comment_published,comment_like_count,attachment_type,Sentiments
0,0,155027942462_10157280442467463,1,,2019-06-18T00:08:26+0000,0,,Neutral
1,1,155027942462_10157280442467463,0,worst card ever lawsuit time,2019-06-04T19:53:28+0000,6,,Negative
2,2,155027942462_10157236349992463,0,think need hear year think bitcoin impact lot ...,2019-05-14T03:28:34+0000,0,,Neutral
3,3,155027942462_10157174321432463,0,long take get refund monei paypal state refund...,2019-04-20T20:44:07+0000,1,,Negative
4,4,155027942462_10157096430022463,0,realli recip heaven,2019-03-15T22:37:37+0000,0,,Neutral


In [4]:
# Remove the 'Unnamed: 0' column, which is not needed for the analysis
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Summarize the frame: columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3226 entries, 0 to 3225
Data columns (total 7 columns):
post_id               3226 non-null object
is_reply              3226 non-null int64
comment_message       3173 non-null object
comment_published     3226 non-null object
comment_like_count    3226 non-null int64
attachment_type       9 non-null object
Sentiments            3226 non-null object
dtypes: int64(2), object(5)
memory usage: 176.5+ KB


In [6]:
# Keep only the columns needed for modelling. Selecting first and copying
# afterwards makes df_sub an independent frame, so the edits made to it in
# later cells neither touch df nor trigger SettingWithCopyWarning.
df_sub = df[['post_id', 'comment_message', 'Sentiments']].copy()

In [7]:
# Replace missing comments with the placeholder '-'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the chained in-place call
# operates on an intermediate object and no longer updates df_sub under
# pandas Copy-on-Write.
df_sub['comment_message'] = df_sub['comment_message'].fillna('-')

In [8]:
# Preview df_sub after filling the missing comments
df_sub.head()

Unnamed: 0,post_id,comment_message,Sentiments
0,155027942462_10157280442467463,-,Neutral
1,155027942462_10157280442467463,worst card ever lawsuit time,Negative
2,155027942462_10157236349992463,think need hear year think bitcoin impact lot ...,Neutral
3,155027942462_10157174321432463,long take get refund monei paypal state refund...,Negative
4,155027942462_10157096430022463,realli recip heaven,Neutral


In [9]:
# Count the remaining missing values in each column
df_sub.isna().sum()

post_id            0
comment_message    0
Sentiments         0
dtype: int64

In [10]:
# Lower-case the label column name: 'Sentiments' -> 'sentiments'
# ('post_id' and 'comment_message' keep their names)
df_sub = df_sub.rename(columns={'Sentiments': 'sentiments'})

In [11]:
# Preview df_sub again to confirm the renamed label column
df_sub.head()

Unnamed: 0,post_id,comment_message,sentiments
0,155027942462_10157280442467463,-,Neutral
1,155027942462_10157280442467463,worst card ever lawsuit time,Negative
2,155027942462_10157236349992463,think need hear year think bitcoin impact lot ...,Neutral
3,155027942462_10157174321432463,long take get refund monei paypal state refund...,Negative
4,155027942462_10157096430022463,realli recip heaven,Neutral


In [12]:
# Count how many comments carry each sentiment label (class balance)
df_sub['sentiments'].value_counts()

Negative    1461
Neutral      885
Positive     880
Name: sentiments, dtype: int64

## Train Test Split

In [13]:
# Features are the comment text; targets are the sentiment labels
X, Y = df_sub['comment_message'], df_sub['sentiments']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=20,
)

# Show the sizes of the four resulting pieces
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((2258,), (2258,), (968,), (968,))

## Data Pre-processing

### LabelEncoder

In [14]:
# Encode the string sentiment labels as integers
Encoder = LabelEncoder()
# Learn the label -> integer mapping from the training labels only
Y_train = Encoder.fit_transform(Y_train)
# Reuse that same mapping for the test labels. Re-fitting on Y_test could
# assign different integers to the classes if the test split happened to
# lack one of them, silently corrupting every accuracy score below.
Y_test = Encoder.transform(Y_test)

### Word Vectorization using TFIDF

In [15]:
# Build a TF-IDF vectorizer capped at 5000 features and fit it on the
# training comments only (fit returns the vectorizer itself)
Tfidf_vect = TfidfVectorizer(max_features=5000).fit(X_train)
# Project the training split into the fitted TF-IDF space
X_train_Tfidf = Tfidf_vect.transform(X_train)
# Project the test split with the same fitted vectorizer
X_test_Tfidf = Tfidf_vect.transform(X_test)

### Word Vectorization using CountVectorizer

In [16]:
# Build a bag-of-words vectorizer and fit it on the training comments only
# (fit returns the vectorizer itself)
Count_vect = CountVectorizer().fit(X_train)
# Turn the training split into term-count vectors
X_train_Cvect = Count_vect.transform(X_train)
# Turn the test split into term-count vectors with the same fitted vectorizer
X_test_Cvect = Count_vect.transform(X_test)

## Sentiment Analysis - Naive Bayes

### Using TFIDF

In [17]:
# Multinomial Naive Bayes with smoothing parameter alpha=0.1,
# trained on the TF-IDF features of the training split
Naive = MultinomialNB(alpha=0.1)
Naive.fit(X_train_Tfidf, Y_train)
# Classify the held-out comments
Y_pred_TNB = Naive.predict(X_test_Tfidf)
# Accuracy as a percentage; accuracy_score takes (y_true, y_pred)
accuracy_NB_Tfidf = 100 * accuracy_score(Y_test, Y_pred_TNB)
print('Naive Bayes using TFIDF Accuracy Score -> ', accuracy_NB_Tfidf)

Naive Bayes using TFIDF Accuracy Score ->  74.79338842975206


### Using CountVectorizer

In [18]:
# Re-train the same Naive Bayes estimator on the term-count features,
# then classify the held-out comments (fit returns the estimator)
Y_pred_CNB = Naive.fit(X_train_Cvect, Y_train).predict(X_test_Cvect)
# Accuracy as a percentage; accuracy_score takes (y_true, y_pred)
accuracy_NB_Cvect = 100 * accuracy_score(Y_test, Y_pred_CNB)
print('Naive Bayes using CountVectorizer Accuracy Score -> ', accuracy_NB_Cvect)

Naive Bayes using CountVectorizer Accuracy Score ->  74.07024793388429


## Sentiment Analysis - SVM

### Using TFIDF

In [19]:
# Linear-kernel support vector classifier trained on the TF-IDF features
# (fit returns the estimator itself)
svm = SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(X_train_Tfidf, Y_train)
# Classify the held-out comments
Y_pred_TSVM = svm.predict(X_test_Tfidf)
# Accuracy as a percentage; accuracy_score takes (y_true, y_pred)
accuracy_SVM_Tfidf = 100 * accuracy_score(Y_test, Y_pred_TSVM)
print('SVM using TFIDF Accuracy Score -> ', accuracy_SVM_Tfidf)

SVM using TFIDF Accuracy Score ->  81.50826446280992


### Using CountVectorizer

In [20]:
# Train a separate SVC (same hyper-parameters) on the term-count features.
# Re-fitting `svm` here would overwrite the TF-IDF model, and the prediction
# section further down passes TF-IDF features to `svm.predict`.
svm_cvect = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_cvect.fit(X_train_Cvect, Y_train)
# Predict the labels on the held-out comments
y_pred_CSVM = svm_cvect.predict(X_test_Cvect)
# Accuracy as a percentage; accuracy_score takes (y_true, y_pred)
accuracy_SVM_Cvect = accuracy_score(Y_test, y_pred_CSVM)*100
print('SVM using CountVectorizer Accuracy Score -> ', accuracy_SVM_Cvect)

SVM using CountVectorizer Accuracy Score ->  81.09504132231406


## Cross Validation

In [21]:
# 5-fold cross-validation of the SVM on the term-count training features;
# report the mean fold score as a percentage
cv_scores = cross_val_score(svm, X_train_Cvect, Y_train, cv=5)
print('Mean Cross validated score -> ', np.mean(cv_scores * 100))

Mean Cross validated score ->  78.60935766973563


## Create DataFrame for Accuracy Scores

In [22]:
# Collect the four test accuracies, one single-row Series per model/feature
# combination, each labelled 'Accuracy Score'
score_dict = {
    label: pd.Series([value], index=['Accuracy Score'])
    for label, value in (
        ('NB-CVect', accuracy_NB_Cvect),
        ('NB-TFIDF', accuracy_NB_Tfidf),
        ('SVM-CVect', accuracy_SVM_Cvect),
        ('SVM-TFIDF', accuracy_SVM_Tfidf),
    )
}

# One column per combination, one row holding the accuracy
scores = pd.DataFrame(score_dict)

In [23]:
# Display the accuracy table
scores.head()

Unnamed: 0,NB-CVect,NB-TFIDF,SVM-CVect,SVM-TFIDF
Accuracy Score,74.070248,74.793388,81.095041,81.508264


## Save scores as scores_NB_SVM.tsv

In [24]:
# Write the accuracy table to a tab-separated file
scores.to_csv('scores_NB_SVM.tsv', sep='\t')

-----------

# Predict Sentiment using SVM-TFIDF

In [25]:
# Load the comments to be classified; the file's 'Unnamed: 0' column is
# used as the frame's index
df_predict = pd.read_csv(
    'commentsCleaned_pred_NV-SVM.tsv',
    sep='\t',
    index_col='Unnamed: 0',
)
# Preview df_predict
df_predict.head()

Unnamed: 0,post_id,is_reply,comment_message,comment_published,comment_like_count,attachment_type
0,155027942462_10156535458487463,0,walmart monei card green dot effect card,2018-07-30T17:24:25+0000,0,
1,155027942462_10156535458487463,0,b,2018-07-28T15:28:23+0000,0,
2,155027942462_10156535458487463,0,p,2018-07-29T20:42:01+0000,0,
3,155027942462_10156535458487463,0,specif green dot card includ walmart monei car...,2018-07-25T09:01:58+0000,0,
4,155027942462_10156535458487463,0,b,2018-07-28T23:00:24+0000,0,


In [26]:
# Replace missing comments with the placeholder '-'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the chained in-place call
# operates on an intermediate object and no longer updates df_predict under
# pandas Copy-on-Write.
df_predict['comment_message'] = df_predict['comment_message'].fillna('-')

In [27]:
# Vectorize the new comments with the already-fitted TF-IDF vectorizer
# (transform only, so the vocabulary learned from X_train is reused)
X_new = Tfidf_vect.transform(df_predict['comment_message'])

In [28]:
# Keep only the post identifier and the comment text. The explicit copy makes
# df_predict an independent frame, so adding the prediction column in a later
# cell does not raise SettingWithCopyWarning.
df_predict = df_predict[['post_id', 'comment_message']].copy()

In [29]:
# Add a 'sentiments' column holding the class predicted by `svm` for each new comment
df_predict['sentiments'] = svm.predict(X_new)

In [30]:
def sentiment_type(pred):
    """Translate a numeric class code into its sentiment name.

    Mapping: 0 -> 'Negative', 1 -> 'Neutral', 2 -> 'Positive'.
    Any other value yields None.

    NOTE(review): the codes presumably mirror the integers produced by the
    LabelEncoder fitted earlier in the notebook -- confirm before reuse.
    """
    code_to_name = ((0, 'Negative'), (1, 'Neutral'), (2, 'Positive'))
    for code, name in code_to_name:
        if pred == code:
            return name
    return None

In [31]:
# Replace the numeric class codes with their sentiment names, element by element
df_predict['sentiments'] = df_predict['sentiments'].map(sentiment_type)

In [32]:
# Preview the predictions with readable sentiment names
df_predict.head()

Unnamed: 0,post_id,comment_message,sentiments
0,155027942462_10156535458487463,walmart monei card green dot effect card,Neutral
1,155027942462_10156535458487463,b,Neutral
2,155027942462_10156535458487463,p,Neutral
3,155027942462_10156535458487463,specif green dot card includ walmart monei car...,Neutral
4,155027942462_10156535458487463,b,Neutral


In [33]:
# Stack the labelled comments (df_sub) on top of the newly predicted ones
# (df_predict). ignore_index=True gives the combined frame a fresh 0..n-1
# index; without it the two frames' overlapping index labels would be
# duplicated and then written to the output file.
df_SVM = pd.concat([df_sub, df_predict], ignore_index=True)
df_SVM.head()

Unnamed: 0,post_id,comment_message,sentiments
0,155027942462_10157280442467463,-,Neutral
1,155027942462_10157280442467463,worst card ever lawsuit time,Negative
2,155027942462_10157236349992463,think need hear year think bitcoin impact lot ...,Neutral
3,155027942462_10157174321432463,long take get refund monei paypal state refund...,Negative
4,155027942462_10157096430022463,realli recip heaven,Neutral


## Save as commentsSVM.tsv

In [34]:
# Write the combined labelled and predicted comments to a tab-separated file
df_SVM.to_csv('commentsSVM.tsv', sep='\t')