# Importing the data set

The data set consists of the following two columns.

 •  EMOTION lists the emotions.

 •  TEXT features the number of corresponding sentences.

In [1]:
import pandas as pd 
data = pd.read_csv('ISEAR.csv')
data.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2.1,Unnamed: 2
0,joy,On days when I feel close to my partner and ot...,
1,fear,Every time I imagine that someone I love or I ...,
2,anger,When I had been obviously unjustly treated and...,
3,sadness,When I think about the short time that we live...,
4,disgust,At a gathering I found myself involuntarily si...,


In [2]:
data.columns = ['EMOTION', 'TEXT', 'Unnamed']
data = data.drop(columns=['Unnamed'])
data.head()

Unnamed: 0,EMOTION,TEXT
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


# Data Cleaning and Preprocessing

Data cleaning is important to obtain better features and accuracy. We can achieve this by doing text preprocessing steps on the data.

The preprocessing steps are as follows.

1. Lowercase

2. Remove special characters

3. Remove punctuation

4. Remove stop words

5. Correct spelling

6. Normalization

The following are the libraries to preprocess the text. NLTK is a predominant free source Python package for text preprocessing.

In [3]:
#Importing the libraries for building Emotion Classifier 

from nltk.corpus import stopwords

from nltk.stem.wordnet import WordNetLemmatizer

import string

from textblob.classifiers import NaiveBayesClassifier

from textblob import TextBlob

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer

from textblob import Word

from nltk.util import ngrams

import re

from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt

from sklearn.feature_extraction. text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation

import sklearn.feature_extraction.text as text

from sklearn.decomposition import NMF, LatentDirichletAllocation,TruncatedSVD

from sklearn import model_selection, preprocessing, linear_model,naive_bayes, metrics, svm

import xgboost

from sklearn import decomposition, ensemble

import pandas, numpy, textblob, string

import re

import nltk

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.metrics import mean_absolute_error

In [4]:
#Convert uppercase letters to lowercase.
data['TEXT'] = data['TEXT'].apply(lambda a: " ".join(a.lower() for a in a.split()))

In [5]:
#Remove white space and special characters.
data['TEXT'] = data['TEXT'].apply(lambda a: " ".join(a.replace('[^\w\s]','') for a in a.split()))

In [6]:
#Remove the stop words.
stop = stopwords.words('english')
data['TEXT'] = data['TEXT'].apply(lambda a: " ".join(a for a in a.split() if a not in stop))

In [7]:
#Correct Spelling
data['TEXT'] = data['TEXT'].apply(lambda a: str(TextBlob(a).correct()))

In [8]:
#Do Stemming
st = PorterStemmer()
data['TEXT'] = data['TEXT'].apply(lambda a: " ".join([st.stem (word) for word in a.split()]))

In [9]:
from spellchecker import SpellChecker
spell = SpellChecker()
data['TEXT'] = data['TEXT'].apply(lambda a: " ".join(spell.correction(word) if spell.correction(word) is not None else word for word in a.split()))

In [10]:
#After completing all the preprocessing steps, this is what the data looks like.
data.head()

Unnamed: 0,EMOTION,TEXT
0,joy,day feel close partner friends feel peace also...
1,fear,ever time imagine someone love could contact s...
2,anger,obvious unjustly treat possible lucid this
3,sadness,think short time live relax period life think ...
4,disgust,gather found involuntarily sit next two peopl ...


# Label Encoding

The target encoding is an approach to convert categorical value to numerical value.
There are seven categories in this data, and we must encode them to proceed further. We are using the label encoder function to encode these categories.

In [11]:
#data before encoding.
data['EMOTION'].value_counts()

EMOTION
joy        1092
sadness    1082
anger      1079
fear       1076
shame      1071
disgust    1066
guilt      1049
guit          1
Name: count, dtype: int64

In [12]:
#Labels encode the target variable
object = preprocessing. LabelEncoder()
data['EMOTION'] = object. fit_transform(data['EMOTION'])

In [13]:
# data after encoding.
data['EMOTION'].value_counts()

EMOTION
5    1092
6    1082
0    1079
2    1076
7    1071
1    1066
3    1049
4       1
Name: count, dtype: int64

# Train-Test Split

The data is split into two parts: one part trains the model, which is the training set, and the other part evaluates the model, which is the test set. The train_test_split library from sklearn.model_selection is imported to split the data frame into two parts.

In [14]:
from sklearn.model_selection import train_test_split

# Get the emotions with at least two instances
valid_emotions = data['EMOTION'].value_counts()[data['EMOTION'].value_counts() > 1].index.tolist()

# Filter the data to only include valid emotions
filtered_data = data[data['EMOTION'].isin(valid_emotions)]

# Split the filtered data into train and test sets
Xtrain, Xtest, Ytrain, Ytest = train_test_split(filtered_data['TEXT'], filtered_data['EMOTION'], stratify=filtered_data['EMOTION'], test_size=0.4, random_state=42)

Now that we have completed the train-test split step, the next step is to extract the features out of these texts. For this, we use two important methods.

# Feature Engineering

Feature engineering is the process of creating a new feature considering the domain context. Let's implement the count vectorizer and TF-IDF techniques to obtain the relevant features from the data sets.

In [15]:
cv = CountVectorizer()

cv.fit(data['TEXT'])

cv_xtrain = cv.transform(Xtrain)

cv_xtest = cv. transform(Xtest)

# word-level TF-IDF

tv = TfidfVectorizer()

tv.fit(data['TEXT'])

#Transform the training and validation data using TF-IDF object.

tv_xtrain = tv.transform(Xtrain)

tv_xtest =tv.transform(Xtest) 

Now let's get into one of the crucial steps to build the multiclass text classification model. We explore the different algorithms in this section.

# Model Building Phase

In this phase, we build different models using both count vectors and word-level TF-IDF as features, and then the model is finalized based on the accuracy level of the classifier.

Let's build a classifier function so that you we play around with the different algorithms.

In [16]:
def build(model_initializer, independent_variables_training, target, independent_variable_test):
    model_initializer.fit(independent_variables_training, target)
    modelPred = model_initializer.predict(independent_variable_test)
    return metrics.accuracy_score(modelPred, Ytest)

Let's use the preceding function and try various algorithms.

# Multinomial Naive Bayes

The multinomial naive Bayes algorithm essentially calculates the probability of each category using the Bayes theorem.

Let's build a naive Bayes model.

In [17]:
# The following uses naive Bayes generated with count vectors.
output = build(naive_bayes.MultinomialNB(), cv_xtrain, Ytrain, cv_xtest)
print(output)
# The following uses naive Bayes generated with word-level TF-IDF vectors.
output = build (naive_bayes. MultinomialNB (), tv_xtrain, Ytrain, tv_xtest) 
print(output)

0.5329341317365269
0.542581503659348


53.2% accuracy is obtained from count vectorizer features. 

54.2% accuracy is obtained from TD-IDF vectorizer features.

# Linear Classifier/Logistic Regression

The following builds a logistic regression model.

In [18]:
# for CV

output = build(linear_model.LogisticRegression(), cv_xtrain, Ytrain, cv_xtest) 
print(output)

# for TF-IDF

output = build(linear_model.LogisticRegression(), tv_xtrain, Ytrain,tv_xtest)
print(output)

0.5515635395874917
0.5671989354624085


# Support-Vector Machine
Let's build the SVM Model.

In [19]:
#for cv

output =build(svm. SVC(), cv_xtrain, Ytrain, cv_xtest)
print (output)

#for TF-IDF

output =build (svm.SVC(), tv_xtrain, Ytrain, tv_xtest)
print(output)

0.5199600798403193
0.5582168995342648


# Random Forest
The following builds a random forest model.

In [20]:
#for CV

output = build(ensemble. RandomForestClassifier(), cv_xtrain, Ytrain, cv_xtest) 
print(output)

#for TF-IDF

output = build(ensemble. RandomForestClassifier(), tv_xtrain, Ytrain,tv_xtest)

print(output)

0.5359281437125748
0.5508982035928144


# Model Evaluation and Comparison Summary

We tried a few different machine learning algorithms using both count vectorizers and TE-IDF vectorizers.Among the models mentioned, the Linear Classifier/Logistic Regression using TF-IDF vectorizer achieved the highest accuracy with 56.7%.