In [1]:
import pandas as pd
import numpy as np
import spacy


In [10]:
data = pd.read_csv("Emotion_classify_Data.csv")
data.shape
data[:5]

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [3]:
data.Emotion.value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [12]:
data['emotion_coding']=data['Emotion'].map({'joy':0,'fear':1,'anger':2})

data.head(5)

Unnamed: 0,Comment,Emotion,emotion_coding
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1


# Modelling without Pre-processing Text data

In [15]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(data['Comment'],data['emotion_coding'],test_size=0.25)

In [16]:
x_train.shape

(4452,)

In [18]:
x_test.shape

(1485,)

Attempt 1 :

using the sklearn pipeline module create a classification pipeline to classify the data.

Note:

using CountVectorizer with only trigrams.
use RandomForest as the classifier.
print the classification report.

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(3,3),stop_words='english')),
    ('rf',RandomForestClassifier())
])
clf.fit(x_train,y_train)
clf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       490
           1       0.93      0.91      0.92       484
           2       0.94      0.90      0.92       511

    accuracy                           0.92      1485
   macro avg       0.92      0.92      0.92      1485
weighted avg       0.92      0.92      0.92      1485



In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('rf',RandomForestClassifier())
])

In [22]:
clf.fit(x_train,y_train)


Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('rf', RandomForestClassifier())])

In [23]:
y_pred=clf.predict(x_test)

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       490
           1       0.93      0.91      0.92       484
           2       0.94      0.90      0.92       511

    accuracy                           0.92      1485
   macro avg       0.92      0.92      0.92      1485
weighted avg       0.92      0.92      0.92      1485



In [31]:
from sklearn.naive_bayes import MultinomialNB
clf1 = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ("mnb",MultinomialNB())
])
clf.fit(x_train,y_train)

ypred=clf.predict(x_test)
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.48      0.06      0.11       490
           1       0.74      0.13      0.22       484
           2       0.36      0.94      0.52       511

    accuracy                           0.39      1485
   macro avg       0.52      0.38      0.28      1485
weighted avg       0.52      0.39      0.29      1485



In [33]:
from sklearn.ensemble import RandomForestClassifier

clf1= Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('rf',RandomForestClassifier())
    
    
])

clf1.fit(x_train,y_train)

ypred1= clf1.predict(x_test)

print(classification_report(y_test,ypred1))



              precision    recall  f1-score   support

           0       0.83      0.95      0.88       490
           1       0.94      0.86      0.90       484
           2       0.93      0.87      0.90       511

    accuracy                           0.89      1485
   macro avg       0.90      0.89      0.89      1485
weighted avg       0.90      0.89      0.89      1485



In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
clf2=Pipeline([
    ('vectorizer',TfidfVectorizer()),
    ('rf',RandomForestClassifier())
])

clf2.fit(x_train,y_train)

ypred2 = clf2.predict(x_test)

print(classification_report(y_test,ypred2))


              precision    recall  f1-score   support

           0       0.88      0.94      0.91       490
           1       0.93      0.89      0.91       484
           2       0.93      0.90      0.91       511

    accuracy                           0.91      1485
   macro avg       0.91      0.91      0.91      1485
weighted avg       0.91      0.91      0.91      1485



# Use text pre-processing to remove stop words, punctuations and apply lemmatization

In [41]:
import spacy
nlp=spacy.load("en_core_web_sm")


def preprocess(text):
    doc= nlp(text)

    filter_tokens=[]
    for word in doc:
        if word.is_stop or word.is_punct:
            continue
        filter_tokens.append(word.lemma_)

    return " ".join(filter_tokens)

In [42]:
data['Comment_preprocess']=data['Comment'].apply(preprocess)

In [48]:
data[:5]

Unnamed: 0,Comment,Emotion,emotion_coding,Comment_preprocess
0,i seriously hate one subject to death but now ...,fear,1,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,2,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,1,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,0,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,1,feel suspicious outside like rapture happen


# build model with prepocessed text

In [52]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(data['Comment_preprocess'],data['emotion_coding'],test_size=0.2)
clf3=Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,2))),
    ('rf',RandomForestClassifier())
])

clf3.fit(x_train,y_train)

ypred3 = clf3.predict(x_test)

print(classification_report(y_test,ypred3))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       399
           1       0.94      0.90      0.92       387
           2       0.92      0.92      0.92       402

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [53]:
x_train,x_test,y_train,y_test=train_test_split(data['Comment_preprocess'],data['emotion_coding'],test_size=0.2)
clf4=Pipeline([
    ('vectorizer',TfidfVectorizer()),
    ('rf',RandomForestClassifier())
])

clf4.fit(x_train,y_train)

ypred4 = clf4.predict(x_test)

print(classification_report(y_test,ypred4))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       395
           1       0.96      0.92      0.93       400
           2       0.91      0.92      0.92       393

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

