<a href="https://colab.research.google.com/github/rajiul123/emotion_classification_in_nlp/blob/main/emotion_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import spacy

In [2]:
# Read the labelled comments into a DataFrame and preview the first rows.
DATA_PATH = "Emotion_classify_Data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [3]:
# Class balance: how many comments carry each emotion label.
df["Emotion"].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
anger,2000
joy,2000
fear,1937


In [4]:
# Summary statistics for the columns of df.
df.describe()

Unnamed: 0,Comment,Emotion
count,5937,5937
unique,5934,3
top,i feel pretty tortured because i work a job an...,anger
freq,2,2000


In [5]:
# Encode each emotion label as an integer class id for the classifiers.
emotion_to_id = {"joy": 0, "fear": 1, "anger": 2}
df["emotion_num"] = df["Emotion"].map(emotion_to_id)
df.head()

Unnamed: 0,Comment,Emotion,emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1


**Building Model Without Text Preprocessing**

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw comments for testing. The split is stratified on the
# numeric label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Comment"],
    df["emotion_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df["emotion_num"],
)

In [7]:
# Print the shapes of the train and test splits (features and labels).
print(f"X_train shape: {X_train.shape} | y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape} | y_test shape: {y_test.shape}")

X_train shape: (4749,) | y_train shape: (4749,)
X_test shape: (1188,) | y_test shape: (1188,)


In [8]:
# importing necessary modules
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [10]:
# RandomForest with CountVectorizer trigrams

# ngram_range=(3, 3) keeps only 3-word sequences as features.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook rebuilds the same model and prints the same report.
clf_rf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf_rf.fit(X_train, y_train)
y_pred = clf_rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.27      0.37       400
           1       0.37      0.79      0.51       388
           2       0.53      0.23      0.32       400

    accuracy                           0.43      1188
   macro avg       0.49      0.43      0.40      1188
weighted avg       0.49      0.43      0.40      1188



# Observation
* Precision, recall and F1-score are all very low with trigram-only features
* Overall accuracy is also very low (0.43)

In [11]:
# MultinomialNB with CountVectorizer unigrams and bigrams

# Steps are listed first, then wrapped in a Pipeline so the vectorizer is
# fitted on the training comments only.
nb_steps = [
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("nb", MultinomialNB()),
]
clf_nb = Pipeline(steps=nb_steps)

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = clf_nb.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       400
           1       0.87      0.83      0.85       388
           2       0.83      0.88      0.85       400

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



# Observation
* Precision, recall and F1-score have improved compared to the previous method
* Accuracy has also increased to a satisfactory level (0.86)
* But there is still room for improvement

In [12]:
# RandomForest with CountVectorizer unigrams and bigrams

# ngram_range=(1, 2) uses single words and 2-word sequences as features.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook rebuilds the same model and prints the same report.
clf_rf_2 = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf_rf_2.fit(X_train, y_train)
y_pred = clf_rf_2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.90       400
           1       0.96      0.89      0.92       388
           2       0.93      0.86      0.90       400

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188



# Observation
* With unigrams and bigrams, precision, recall and F1-score improve further
* Accuracy also improves, to 0.90

In [13]:
# RandomForest with TF-IDF vectorizer

# TfidfVectorizer is used with its default settings.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook rebuilds the same model and prints the same report.
clf_rf_3 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf_rf_3.fit(X_train, y_train)
y_pred = clf_rf_3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       400
           1       0.91      0.90      0.91       388
           2       0.93      0.86      0.90       400

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188



# Observation
* With the TF-IDF vectorizer, precision, recall and F1-score are roughly the same as with the unigram + bigram CountVectorizer (macro F1 is 0.90 in both)
* Accuracy is unchanged from the previous method (0.90)

**Building Model after Text Preprocessing**

In [14]:
# Load spaCy's "en_core_web_sm" pipeline; preprocess() below runs each text through it.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
  """Return `text` with stop words and punctuation dropped and the rest lemmatized.

  The text is run through the module-level spaCy pipeline `nlp`; the lemmas of
  the tokens that survive the filter are joined with single spaces.
  """
  kept_lemmas = [
    tok.lemma_
    for tok in nlp(text)
    if not (tok.is_stop or tok.is_punct)
  ]
  return " ".join(kept_lemmas)

In [15]:
# Clean every comment with preprocess() and store the result in a new column.
# NOTE: the column name is spelled "peprocessed_comment"; the later split cell
# reads it under that exact name.
cleaned_comments = df["Comment"].apply(preprocess)
df["peprocessed_comment"] = cleaned_comments
df.head()

Unnamed: 0,Comment,Emotion,emotion_num,peprocessed_comment
0,i seriously hate one subject to death but now ...,fear,1,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,2,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,1,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,0,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,1,feel suspicious outside like rapture happen


In [16]:
# Same 80/20 stratified split and seed as before, now on the preprocessed text.
X_train_pr, X_test_pr, y_train_pr, y_test_pr = train_test_split(
    df["peprocessed_comment"],
    df["emotion_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df["emotion_num"],
)
print(f"Preprocessed X_train shape: {X_train_pr.shape} | Preprocessed y_train shape: {y_train_pr.shape}")
print(f"Preprocessed X_test shape: {X_test_pr.shape} | Preprocessed y_test shape: {y_test_pr.shape}")

Preprocessed X_train shape: (4749,) | Preprocessed y_train shape: (4749,)
Preprocessed X_test shape: (1188,) | Preprocessed y_test shape: (1188,)


In [17]:
# RandomForest with CountVectorizer unigrams and bigrams (preprocessed text)

# ngram_range=(1, 2) uses single words and 2-word sequences as features.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook rebuilds the same model and prints the same report.
clf_rf_pr = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf_rf_pr.fit(X_train_pr, y_train_pr)
y_pred = clf_rf_pr.predict(X_test_pr)
print(classification_report(y_test_pr, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       400
           1       0.94      0.90      0.92       388
           2       0.91      0.94      0.93       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



# Observation
* After text preprocessing, precision, recall and F1-score have improved further
* Accuracy is now 93%

In [18]:
# RandomForest with TF-IDF vectorizer (preprocessed text)

# TfidfVectorizer is used with its default settings.
# The forest is seeded (same seed as the train/test split) so that re-running
# the notebook rebuilds the same model and prints the same report.
clf_rf_pr_tf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=2022))
])

clf_rf_pr_tf.fit(X_train_pr, y_train_pr)
y_pred = clf_rf_pr_tf.predict(X_test_pr)
print(classification_report(y_test_pr, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       400
           1       0.94      0.93      0.93       388
           2       0.94      0.91      0.92       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



# Observation
* After text preprocessing, precision, recall and F1-score remain almost the same as with CountVectorizer and Random Forest
* Accuracy is again 93%
* Thus we can conclude that Random Forest has performed better than Naive Bayes
* CountVectorizer and TF-IDF have both performed well