In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import string
import json
import spacy
sp = spacy.load('en_core_web_sm')
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input,Embedding,Dense,Flatten
from sklearn.metrics import accuracy_score,classification_report

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_path = './../datasets/Emotion_classification_ISEAR/corrected_isear-train.csv'
test_path = './../datasets/Emotion_classification_ISEAR/corrected_isear-test.csv'
data_train = pd.read_csv(train_path, names=['emotion', 'text', 'NaN'])   
data_test = pd.read_csv(test_path, names=['emotion', 'text', 'NaN']) 

In [3]:
data_train.head()

Unnamed: 0,emotion,text,NaN
0,joy,When I understood that I was admitted to the U...,
1,fear,I broke a window of a neighbouring house and I...,
2,joy,Got a big fish in fishing.,
3,fear,"Whenever I am alone in a dark room, walk alone...",
4,shame,I bought a possible answer to a homework probl...,


In [4]:
data_test.head()

Unnamed: 0,emotion,text,NaN
0,anger,Deliberately provoked by someone close. Angere...,
1,shame,In my 20s - and was going out to dinner to cel...,
2,guilt,When I noticed that my little sister was sulki...,
3,disgust,I got disgusted with a man who was beating a w...,
4,joy,When I passed the driving test (on the second ...,


In [5]:
data_train.shape

(5357, 3)

In [6]:
data_test.shape

(1148, 3)

In [7]:
data_train.drop(columns=['NaN'], axis=1, inplace=True)
data_test.drop(columns=['NaN'], axis=1, inplace=True)

In [8]:
def remove_puncts(data):
    new_data = re.sub(r'[^\w\s]', '', data)
    return new_data

In [9]:
def remove_nums(data):
    pattern = r'[0-9]'
    new_data = re.sub(pattern, '', data)
    return new_data

In [10]:
def clean_data(data):
    data['text'] = data['text'].str.lower()
    data['text'] = data.apply(lambda x: remove_nums(x['text']), axis=1)
    data['text'] = data.apply(lambda x: remove_puncts(x['text']), axis=1)
    return data

In [11]:
train = clean_data(data_train)
test = clean_data(data_test)

In [12]:
train.head()

Unnamed: 0,emotion,text
0,joy,when i understood that i was admitted to the u...
1,fear,i broke a window of a neighbouring house and i...
2,joy,got a big fish in fishing
3,fear,whenever i am alone in a dark room walk alone ...
4,shame,i bought a possible answer to a homework probl...


In [13]:
test.head()

Unnamed: 0,emotion,text
0,anger,deliberately provoked by someone close angered...
1,shame,in my s and was going out to dinner to celebr...
2,guilt,when i noticed that my little sister was sulki...
3,disgust,i got disgusted with a man who was beating a w...
4,joy,when i passed the driving test on the second try


In [14]:
def remove_stop_words(data):
    all_stopwords = sp.Defaults.stop_words
    tokens = data.split(" ")
    tokens_filtered= [word for word in tokens if not word in all_stopwords]
    return (" ").join(tokens_filtered)

In [15]:
def lemmatize(data):
    new_string = ''
    doc = sp(data)
    for token in doc:
        new_string= new_string +" "+ str(token.lemma_)
    return new_string

In [16]:
train['text'] = train.apply(lambda x: remove_stop_words(x['text']), axis=1)
train['text'] = train.apply(lambda x: lemmatize(x['text']), axis=1)

In [17]:
def remove_PRON(data):
    pattern = r'-PRON-'
    new_data = re.sub(pattern, '', data)
    return new_data

In [18]:
train['text'] = train.apply(lambda x: remove_PRON(x['text']), axis=1)

In [19]:
train.head()

Unnamed: 0,emotion,text
0,joy,understood admit university
1,fear,broke window neighbouring house fear mothers ...
2,joy,get big fish fishing
3,fear,dark room walk street sleep room night partly...
4,shame,buy possible answer homework problem complete...


In [20]:
test.head()

Unnamed: 0,emotion,text
0,anger,deliberately provoked by someone close angered...
1,shame,in my s and was going out to dinner to celebr...
2,guilt,when i noticed that my little sister was sulki...
3,disgust,i got disgusted with a man who was beating a w...
4,joy,when i passed the driving test on the second try


In [34]:
x_train = train['text'].values
y_train = train['emotion'].values
x_test = test['text'].values
y_test = test['emotion'].values

In [35]:
len(x_train)

5357

In [36]:
len(y_train)

5357

In [37]:
len(x_test)

1148

In [38]:
len(y_test)

1148

In [39]:
type(x_train[0])

str

In [40]:
x_train.shape

(5357,)

In [41]:
type(x_train)

numpy.ndarray

In [42]:
x_train

array([' understood admit university',
       ' broke window neighbouring house fear mothers judgement action',
       ' get big fish fishing', ..., ' final exam year   natural science',
       ' drunk man bump want grip fortunately able break loose',
       ' good friend grow apart find run friend'], dtype=object)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit(x_train)
X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)

In [45]:
X_train.shape, X_test.shape

((5357, 6115), (1148, 6115))

In [46]:
X_test

<1148x6115 sparse matrix of type '<class 'numpy.float64'>'
	with 8438 stored elements in Compressed Sparse Row format>

# Decision Trees

In [47]:
from sklearn.tree import DecisionTreeClassifier

In [48]:
model_decisin_tree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1, max_depth=5)

In [49]:
model_decisin_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [50]:
pred_decision_tree = model_decisin_tree.predict(X_test)

In [51]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [52]:
accuracy_score(pred_decision_tree.reshape(-1), y_test)

0.23693379790940766

In [53]:
print(classification_report(pred_decision_tree.reshape(-1), y_test))

              precision    recall  f1-score   support

       anger       0.16      0.82      0.27        34
     disgust       0.05      0.80      0.09        10
        fear       0.00      0.00      0.00         0
       guilt       0.16      0.93      0.27        27
         joy       1.00      0.16      0.27      1026
     sadness       0.15      0.92      0.25        24
       shame       0.16      1.00      0.28        27

    accuracy                           0.24      1148
   macro avg       0.24      0.66      0.20      1148
weighted avg       0.91      0.24      0.27      1148



  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [54]:
from sklearn.svm import LinearSVC

In [55]:
model_svc = LinearSVC(loss='squared_hinge')

In [56]:
model_svc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [57]:
pred_svc = model_svc.predict(X_test)

In [58]:
print(classification_report(pred_svc.reshape(-1), y_test))

              precision    recall  f1-score   support

       anger       0.32      0.41      0.36       138
     disgust       0.55      0.50      0.52       191
        fear       0.57      0.58      0.57       161
       guilt       0.47      0.41      0.44       176
         joy       0.56      0.49      0.52       185
     sadness       0.50      0.53      0.52       143
       shame       0.41      0.44      0.42       154

    accuracy                           0.48      1148
   macro avg       0.48      0.48      0.48      1148
weighted avg       0.49      0.48      0.48      1148



In [59]:
accuracy_score(pred_svc.reshape(-1), y_test)

0.4817073170731707

# Logistic regression

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
model_log_reg = LogisticRegression(penalty='l2')

In [62]:
model_log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
pred_log_reg = model_log_reg.predict(X_test)

In [64]:
print(classification_report(pred_log_reg.reshape(-1), y_test))

              precision    recall  f1-score   support

       anger       0.41      0.47      0.44       155
     disgust       0.60      0.49      0.54       212
        fear       0.62      0.60      0.61       168
       guilt       0.49      0.43      0.46       175
         joy       0.57      0.53      0.55       172
     sadness       0.48      0.60      0.53       122
       shame       0.44      0.51      0.47       144

    accuracy                           0.52      1148
   macro avg       0.52      0.52      0.52      1148
weighted avg       0.52      0.52      0.52      1148

