In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Importing dataset

emo = pd.read_csv(r"C:\Users\Priya\Downloads\emotion_nlp.csv")

In [3]:
emo.head()

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
emo.shape

(17999, 2)

In [5]:
#Checking nulls and Data Cleaning

emo.isnull().sum()

Text        0
Emotions    0
dtype: int64

In [6]:
emo = emo.rename(columns={'Text':'X','Emotions':'Y'})

In [7]:
emo.head()

Unnamed: 0,X,Y
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [8]:
emo.X = emo.X.str.lower()

In [9]:
emo.head()

Unnamed: 0,X,Y
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [10]:
emo.Y.value_counts()

joy         6056
sadness     5247
anger       2434
fear        2161
love        1463
surprise     638
Name: Y, dtype: int64

In [11]:
emo.Y.replace({'joy':1, 'sadness':0,'anger':2,'fear':3,'love':4,'surprise':5}, inplace=True)

In [12]:
#Importing nltk library

import nltk
from nltk.corpus import stopwords

In [17]:
l1 = stopwords.words('english')

In [18]:
#for punctuation marks

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
def text_process(mess):            ### creating a function
    """                                                        ## a docstring
    1. remove the punctuation
    2. remove the stopwords
    3. return the list of clean textwords
    
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    
    return [ word for word in nopunc.split() if word not in l1]

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(emo['X'])

In [22]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(emo['X'])

In [23]:
bow_transformer.vocabulary_

{'didnt': 3834,
 'feel': 5205,
 'humiliated': 6786,
 'go': 5997,
 'feeling': 5209,
 'hopeless': 6687,
 'damned': 3392,
 'hopeful': 6685,
 'around': 726,
 'someone': 13096,
 'cares': 2062,
 'awake': 974,
 'im': 6920,
 'grabbing': 6058,
 'minute': 8934,
 'post': 10731,
 'greedy': 6120,
 'wrong': 15877,
 'ever': 4830,
 'nostalgic': 9576,
 'fireplace': 5342,
 'know': 7832,
 'still': 13463,
 'property': 11032,
 'grouchy': 6160,
 'ive': 7442,
 'little': 8241,
 'burdened': 1870,
 'lately': 7950,
 'wasnt': 15482,
 'sure': 13811,
 'taking': 13975,
 'milligrams': 8890,
 'times': 14361,
 'recommended': 11455,
 'amount': 484,
 'fallen': 5093,
 'asleep': 798,
 'lot': 8351,
 'faster': 5140,
 'also': 429,
 'like': 8164,
 'funny': 5747,
 'confused': 2837,
 'life': 8137,
 'teenager': 14091,
 'jaded': 7458,
 'year': 15934,
 'old': 9767,
 'man': 8527,
 'petronas': 10378,
 'years': 15942,
 'performed': 10304,
 'well': 15573,
 'made': 8463,
 'huge': 6763,
 'profit': 10985,
 'romantic': 11990,
 'make': 8501

In [24]:
tdm = bow_transformer.transform(emo.X)

In [25]:
tdm.shape

(17999, 16035)

In [26]:
type(tdm)

scipy.sparse._csr.csr_matrix

In [27]:
#Splitting data into train and test

from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(tdm, emo.Y, test_size=.2)

In [28]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_x, train_y)

In [29]:
pred = nb.predict(test_x)
pred

array([3, 0, 1, ..., 1, 0, 0], dtype=int64)

In [30]:
from sklearn.metrics import confusion_matrix, classification_report

In [31]:
tab = confusion_matrix(test_y, pred)
tab

array([[1012,   41,   12,    2,    1,    0],
       [  59, 1099,    7,    4,    6,    1],
       [ 106,   54,  307,    7,    0,    0],
       [  96,   78,   24,  280,    2,    2],
       [  38,  146,    7,    3,   98,    0],
       [  30,   48,    1,   14,    1,   14]], dtype=int64)

In [32]:
print(classification_report(test_y, pred))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84      1068
           1       0.75      0.93      0.83      1176
           2       0.86      0.65      0.74       474
           3       0.90      0.58      0.71       482
           4       0.91      0.34      0.49       292
           5       0.82      0.13      0.22       108

    accuracy                           0.78      3600
   macro avg       0.83      0.60      0.64      3600
weighted avg       0.80      0.78      0.76      3600



In [33]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)

In [34]:
pred_dt = dt.predict(test_x)

In [35]:
tab_dt = confusion_matrix(test_y, pred_dt)
tab_dt

array([[ 971,   17,   42,   22,    7,    9],
       [  43, 1036,   14,   13,   58,   12],
       [  33,    9,  420,    7,    5,    0],
       [  19,    5,   25,  407,    1,   25],
       [   5,   58,    2,    2,  225,    0],
       [   2,    7,    0,   14,    0,   85]], dtype=int64)

In [36]:
print(classification_report(test_y,pred_dt))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1068
           1       0.92      0.88      0.90      1176
           2       0.83      0.89      0.86       474
           3       0.88      0.84      0.86       482
           4       0.76      0.77      0.77       292
           5       0.65      0.79      0.71       108

    accuracy                           0.87      3600
   macro avg       0.82      0.85      0.83      3600
weighted avg       0.88      0.87      0.87      3600



In [37]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
pred_lr = lr.predict(test_x)

In [39]:
print(classification_report(test_y,pred_lr))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1068
           1       0.90      0.93      0.92      1176
           2       0.87      0.87      0.87       474
           3       0.91      0.84      0.87       482
           4       0.83      0.74      0.78       292
           5       0.78      0.72      0.75       108

    accuracy                           0.89      3600
   macro avg       0.87      0.84      0.85      3600
weighted avg       0.89      0.89      0.89      3600



In [40]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(train_x, train_y)

In [41]:
pred_svm = svm.predict(test_x)

In [42]:
tab_svm = confusion_matrix(test_y, pred_svm)
tab_svm

array([[ 986,   57,   15,    8,    2,    0],
       [  28, 1118,    5,    6,   15,    4],
       [  53,   79,  336,    3,    3,    0],
       [  33,   75,   26,  346,    1,    1],
       [  15,  132,    1,    0,  144,    0],
       [   9,   27,    1,   20,    0,   51]], dtype=int64)

In [43]:
print(classification_report(test_y,pred_svm))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      1068
           1       0.75      0.95      0.84      1176
           2       0.88      0.71      0.78       474
           3       0.90      0.72      0.80       482
           4       0.87      0.49      0.63       292
           5       0.91      0.47      0.62       108

    accuracy                           0.83      3600
   macro avg       0.87      0.71      0.76      3600
weighted avg       0.84      0.83      0.82      3600

