In [226]:
#Import Libraries
import pandas as pd
import numpy as np

In [227]:
#Import additional libraries
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/jtmarquez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jtmarquez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jtmarquez/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jtmarquez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [228]:
#Read Labelled CSV File,  encoding is set to latin-1 because of the special characters.
df = pd.read_csv("comments_cleaned.csv")

In [229]:
#Inspect the Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18452 entries, 0 to 18451
Data columns (total 3 columns):
ID                 18452 non-null int64
comment_message    18452 non-null object
Sentiments         3008 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 432.5+ KB


In [230]:
#copy the dataframe first, then drop all columns except comment_message and sentiments

df_sub = df.copy()

In [231]:
#inspect copy

df_sub.head()

Unnamed: 0,ID,comment_message,Sentiments
0,0,[],1.0
1,1,"['worst', 'card', 'ever', 'lawsuit', 'time']",0.0
2,2,"['think', 'need', 'hear', 'year', 'think', 'bi...",
3,3,"['long', 'take', 'get', 'refunded', 'money', '...",0.0
4,10,"['really', 'recipe', 'heaven']",1.0


In [232]:
df_sub.columns = ["ID", "text","Sentiments"]

In [233]:
#Check for rows with NULL values for sentiments

df_sub=df_sub.dropna(axis=0, subset=('Sentiments', ))

In [234]:
#check dataset again
df_sub.head()

Unnamed: 0,ID,text,Sentiments
0,0,[],1.0
1,1,"['worst', 'card', 'ever', 'lawsuit', 'time']",0.0
3,3,"['long', 'take', 'get', 'refunded', 'money', '...",0.0
4,10,"['really', 'recipe', 'heaven']",1.0
5,12,"['hey', 'son', 'new', 'customer', 'card', 'dir...",0.0


In [235]:
#get the value counts for each sentiment
df_sub['Sentiments'].value_counts()

0.0    1461
2.0     880
1.0     667
Name: Sentiments, dtype: int64

In [237]:
#set random seed for reproducibility
np.random.seed(500)

<h1>Train Test Split</h1>

In [238]:
Train_X = df_sub['text']
Train_Y = df_sub['Sentiments']


Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Train_X,Train_Y,
                                                                    test_size=0.3)

<h1>Encoding</h1>

In [239]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [240]:
Train_X.head()

2646    ['moon', 'back', 'going', 'see', 'mom', 'heave...
2899    ['green', 'dot', 'health', 'card', 'switchauth...
2209    ['best', 'money', 'advice', 'ever', 'received'...
429     ['verify', 'sent', 'money', 'provide', 'info',...
3206    ['please', 'tell', 'u', 'got', 'help', 'son', ...
Name: text, dtype: object

<h1>Word Vectorization</h1>

In [241]:
#instantiate CountVectorizer

vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(Train_X)

<h1>TFIDF</h1>

In [242]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df_sub['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

<h1>NAIVE BAYES</h1>

In [243]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("TFIDF Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

TFIDF Naive Bayes Accuracy Score ->  71.42857142857143


<b>Naive Bayes Not using TFIDF</b>

In [244]:
clfrNB = naive_bayes.MultinomialNB(alpha=0.1)
clfrNB.fit(X_train_vectorized, Train_Y)

# predict the labels on validation dataset
y_pred = clfrNB.predict(vect.transform(Test_X))

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_pred, Test_Y)*100)

Naive Bayes Accuracy Score ->  76.0797342192691


<h1>SVM</h1>

In [245]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM_tf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM_tf.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM_tf = SVM_tf.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("TFIDF SVM Accuracy Score -> ",accuracy_score(predictions_SVM_tf, Test_Y)*100)

TFIDF SVM Accuracy Score ->  77.29789590254707


<b>SVM not using TFIDF</b>

In [246]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM_ = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_vectorized,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(vect.transform(Test_X))
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  76.30121816168328
