In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('Emotion_classify_Data.csv')
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


In [8]:
df.Emotion.value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [9]:
df['Emotion_label'] = df.Emotion.map({
    'anger' : 0,
    'joy' : 1,
    'fear' : 2
})

In [10]:
df

Unnamed: 0,Comment,Emotion,Emotion_label
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2
...,...,...,...
5932,i begun to feel distressed for you,fear,2
5933,i left feeling annoyed and angry thinking that...,anger,0
5934,i were to ever get married i d have everything...,joy,1
5935,i feel reluctant in applying there because i w...,fear,2


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
v = CountVectorizer(ngram_range=(3,3))
v.fit(df.Comment)
transformed = v.transform(df.Comment)
transformed.toarray()[:1]

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
v.vocabulary_

{'seriously hate one': 55759,
 'hate one subject': 28713,
 'one subject to': 48830,
 'subject to death': 59887,
 'to death but': 68397,
 'death but now': 15715,
 'but now feel': 11870,
 'now feel reluctant': 46286,
 'feel reluctant to': 21587,
 'reluctant to drop': 53924,
 'to drop it': 68489,
 'im so full': 32599,
 'so full of': 57401,
 'full of life': 26176,
 'of life feel': 47017,
 'life feel appalled': 38486,
 'sit here to': 56782,
 'here to write': 30382,
 'to write start': 70315,
 'write start to': 78200,
 'start to dig': 58987,
 'to dig out': 68421,
 'dig out my': 16397,
 'out my feelings': 49931,
 'my feelings and': 43670,
 'feelings and think': 23927,
 'and think that': 5413,
 'think that am': 66391,
 'that am afraid': 61178,
 'am afraid to': 2214,
 'afraid to accept': 1089,
 'to accept the': 67889,
 'accept the possibility': 674,
 'the possibility that': 64270,
 'possibility that he': 51817,
 'that he might': 61641,
 'he might not': 29663,
 'might not make': 42134,
 'not make

In [18]:
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(df.Comment, df.Emotion_label, test_size=0.2, random_state=2023, stratify=df.Emotion_label)

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [28]:
clf = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range = (3, 3))),
    ('rand_forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.24      0.33       400
           1       0.59      0.23      0.33       400
           2       0.37      0.83      0.52       388

    accuracy                           0.43      1188
   macro avg       0.50      0.43      0.39      1188
weighted avg       0.50      0.43      0.39      1188



In [29]:
clf = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range = (1,2))),
    ('multi_nb', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       400
           1       0.89      0.83      0.86       400
           2       0.86      0.89      0.87       388

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



In [30]:
clf = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range = (1,2))),
    ('rand_forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90       400
           1       0.84      0.96      0.90       400
           2       0.94      0.85      0.90       388

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
clf = Pipeline([
    ('count_vectorizer', TfidfVectorizer()),
    ('rand_forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91       400
           1       0.87      0.94      0.90       400
           2       0.92      0.89      0.91       388

    accuracy                           0.90      1188
   macro avg       0.91      0.90      0.90      1188
weighted avg       0.91      0.90      0.90      1188



In [39]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    doc = nlp(text)
    
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return ' '.join(filtered_tokens)    

In [40]:
df['preprocessed_comment'] = df.Comment.apply(preprocess)

In [41]:
df

Unnamed: 0,Comment,Emotion,Emotion_label,preprocessed_comment
0,i seriously hate one subject to death but now ...,fear,2,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,0,m life feel appal
2,i sit here to write i start to dig out my feel...,fear,2,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,1,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,2,feel suspicious outside like rapture happen
...,...,...,...,...
5932,i begun to feel distressed for you,fear,2,begin feel distressed
5933,i left feeling annoyed and angry thinking that...,anger,0,leave feel annoyed angry thinking center stupi...
5934,i were to ever get married i d have everything...,joy,1,married d ready offer ve get club perfect good...
5935,i feel reluctant in applying there because i w...,fear,2,feel reluctant apply want able find company kn...


In [43]:
X_train, X_test, y_train, y_test = tts(df.preprocessed_comment, df.Emotion_label, test_size=0.2, random_state=2022, stratify=df.Emotion_label)

In [44]:
clf = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range = (1,2))),
    ('rand_forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       400
           1       0.93      0.95      0.94       400
           2       0.95      0.90      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [46]:
clf = Pipeline([
    ('count_vectorizer', TfidfVectorizer()),
    ('rand_forest', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       400
           1       0.93      0.94      0.94       400
           2       0.91      0.93      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

