 # CRE Research Project for Sentiment Analysis Task



In [4]:
#Load in the dataset
import pandas
import pickle
with open("SentAnalysisData.csv", 'rb') as f:
    data=pickle.load(f)

In [5]:
#Get original size/shape of dataset. 
data.shape

(11327, 2)

In [11]:
#Output the data
data

Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...
...,...,...
3388,sadness,"My sweetheart left me, or rather we decided to..."
3389,sadness,"Well , it's too bad that we like different kin..."
3390,neutral,It sure is .
3391,sadness,He ’ s got laid off again . I do feel sorry fo...


In [13]:
#distribution of each of the class labels. 5 Emotions to train for. 
print(data.Emotion.value_counts())

joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: Emotion, dtype: int64


In [10]:
#split up the data for training, validation, and test
import numpy as np
from sklearn.model_selection import train_test_split

In [14]:
#split into train and test sets using data 80/20 split on train and test
       #leave alone minus preprocesssing
         #----#          #-----#
X_train, X_test, y_train, y_test = train_test_split(data.Text, data.Emotion, test_size = 0.20, random_state = 77)
#txt,    txt,   emotion, emotion

In [15]:
#working with a pd series
type(X_train)

pandas.core.series.Series

In [16]:
#---------------------------------------------------------------------------------
#Preprocessing of the data
#---------------------------------------------------------------------------------

In [17]:
import nltk
from nltk.tokenize import word_tokenize
word_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [18]:
#Removes numbers 
###X_train = X_train.str.replace('\d+', '')
###X_test = X_test.str.replace('\d+', '')

In [19]:
#Punctuation removal
#X_train = X_train.str.replace('[^\w\s]','')
#X_test = X_test.str.replace('[^\w\s]','')

In [20]:
#Preprocessing method #1
#Contraction reomoval: https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
import contractions
def remove_contractions_text(text):
    return [contractions.fix(w) for w in word_tokenizer.tokenize(text)]

X_train = X_train.apply(remove_contractions_text)
X_test = X_test.apply(remove_contractions_text)

X_train=X_train.str.join(" ")
X_test=X_test.str.join(" ")

In [21]:
#dont use bad lemmatization function
#lemmatazation: https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/
#lemmatazation: https://stackoverflow.com/questions/47557563/lemmatization-of-all-pandas-cells
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenizer.tokenize(text)]

#X_train = X_train.str.replace('[^\w\s]','')
X_train = X_train.apply(lemmatize_text)
X_test = X_test.apply(lemmatize_text)

X_train=X_train.str.join(" ")
X_test=X_test.str.join(" ")



In [22]:
#Preprocessing method #2
#Snowball stemmer to stem the words
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def stem_text(text):
    return [stemmer.stem(w) for w in word_tokenizer.tokenize(text)]

X_train = X_train.apply(stem_text)
X_test = X_test.apply(stem_text)

X_train=X_train.str.join(" ")
X_test=X_test.str.join(" ")

In [23]:
#Preprocessing method #3
#Remove stopwords with the help of the nltk library stopwords. Import words to compare against and remove.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
X_train = X_train.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
X_test= X_test.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [24]:
#Preprocessing method #4
#Removal of certain characters/symbols from the dataset. I skimmed over the data and rmoved some reoccuring symbols
#that were littered about in the dataset.
X_train = X_train.map(lambda x: x.strip('’#$%@'))
X_test = X_test.map(lambda x: x.strip('’#$%@'))


In [None]:
#Preprocessing method #5
#Convert to lowercase. 
X_train = X_train.str.lower()
X_test = X_test.str.lower()

In [25]:
#Validate that preprocessing methods worked as anticipated
X_train[6]

6     ! also finish write sale report bos . end , f...
6    find chosen collect norm chines aphasia (i con...
Name: Text, dtype: object

In [26]:
#Validate that preprocessing methods worked as anticipated
X_train[6]

6     ! also finish write sale report bos . end , f...
6    find chosen collect norm chines aphasia (i con...
Name: Text, dtype: object

In [27]:
#Validate that preprocessing methods did not change the size of the data
print(X_train.shape)
print(X_test.shape)

#From this point leave the test set alone until the very end.

(9061,)
(2266,)


In [28]:
#split into train and validation using train 60/20 split on train and validation
#This is for development and is often called the development set. 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 45)
#txt, txt, emotion, emotion

In [29]:
#Validate that the size of the data was changed accordingly
print(X_train.shape)
print(X_test.shape)

(6795,)
(2266,)


In [30]:
#use the validation set that was just created. 
print("class distribution of test set")
print(y_test.value_counts())  #distribution of test set

print("class distribution of train set")
print(y_train.value_counts())  #distribution of train set

print("class distribution of validation set")
print(y_val.value_counts())  #distribution of val set

class distribution of test set
joy        460
anger      458
neutral    451
sadness    450
fear       447
Name: Emotion, dtype: int64
class distribution of train set
joy        1396
sadness    1391
anger      1363
neutral    1352
fear       1293
Name: Emotion, dtype: int64
class distribution of validation set
sadness    476
joy        470
neutral    451
anger      438
fear       431
Name: Emotion, dtype: int64


In [31]:
#use TF-IDF: Numerical representaion, number of features increased from 300 to 1500. 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 1500, stop_words='english')
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [32]:
#print out words in vocabulary (aka dictionary)
print(vectorizer.vocabulary_)#much larger vocabulary because the features were increased within the above block.

{'birthday': 158, 'veri': 1431, 'close': 257, 'friend': 559, 'sent': 1179, 'huge': 680, 'pleas': 996, 'react': 1072, 'bad': 118, 'start': 1276, 'season': 1165, 'quarrel': 1059, 'younger': 1496, 'sister': 1216, 'becaus': 134, 'said': 1148, 'someth': 1240, 'boyfriend': 182, 'consequ': 290, 'look': 803, 'saw': 1153, 'thought': 1367, 'wa': 1438, 'tri': 1400, 'separ': 1180, 'misunderstood': 874, 'girl': 583, 'love': 811, 'accept': 18, 'propos': 1045, 'everyth': 461, 'went': 1460, 'want': 1443, 'previous': 1025, 'feel': 517, 'lower': 814, 'class': 249, 'dure': 420, 'summer': 1320, 'join': 725, 'research': 1110, 'group': 614, 'chines': 243, 'univers': 1421, 'student': 1303, 'knew': 742, 'lot': 809, 'year': 1490, 'later': 756, 'travel': 1395, 'china': 242, 'got': 598, 'die': 381, 'week': 1456, 'come': 271, 'togeth': 1380, 'funer': 566, 'bodi': 170, 'time': 1376, 'felt': 521, 'extrem': 490, 'sad': 1145, 'yes': 1492, 'like': 786, 'coffe': 264, 'bring': 188, 'famili': 504, 'explain': 487, 'make':

In [33]:
# #original KNN method for training: Gave ~39 precent accuracy w no preprocessing. 

# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors = 3, metric = 'euclidean')
# knn.fit(train_vectors, y_train)

In [34]:
#Use the Support Vector Machines classifier from sklearn. 
#use the Radial Basis Function for the kernel because this will provide best results becaice we habe 5 emotions to
#train for this prediction
from sklearn import svm
from sklearn.svm import SVC
clf = svm.SVC(kernel = 'rbf')
#higher number of features can cause a large chance for the dataset to be overfit. 
clf.fit(train_vectors, y_train)

SVC()

In [35]:
#prediction = knn.predict(test_vectors)
prediction = clf.predict(test_vectors) #get new preciction based on the SVM model

In [36]:
#import and print cassificaion report and accuracy
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction))

#Model Accuracy: how often is the classifier correct
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, prediction))

              precision    recall  f1-score   support

       anger       0.60      0.68      0.64       458
        fear       0.74      0.67      0.70       447
         joy       0.67      0.67      0.67       460
     neutral       0.70      0.67      0.68       451
     sadness       0.68      0.68      0.68       450

    accuracy                           0.67      2266
   macro avg       0.68      0.67      0.68      2266
weighted avg       0.68      0.67      0.68      2266

Accuracy: 0.6743159752868491
