In [1]:
import pandas as pd
# Load the labelled comments and take a first look at size and contents.
data = pd.read_csv("Emotion_classify_Data.csv")
print(data.shape)  # (n_rows, n_columns)

# Last expression of the cell: rendered as a table by the notebook.
data.head()

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [2]:
# Class balance: how many comments carry each emotion label.
data["Emotion"].value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [3]:
# Encode the string labels as integers for the classifiers:
# joy -> 0, fear -> 1, anger -> 2.
data["Emotion_num"] = data["Emotion"].map({"joy": 0, "fear": 1, "anger": 2})

In [4]:
# Check that the Emotion_num column now sits next to Emotion.
data.head()

Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw comments for evaluation. Stratifying on the label
# keeps the class proportions the same in both partitions, and the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data["Comment"],
    data["Emotion_num"],
    stratify=data["Emotion_num"],
    test_size=0.2,
    random_state=2022,
)

In [6]:
# Size of the training portion of the comments.
X_train.shape

(4749,)

In [7]:
# Training labels: length must match X_train.
y_train.shape

(4749,)

In [8]:
# NOTE(review): this repeats the X_train.shape check from an earlier cell;
# y_test.shape may have been intended here -- confirm.
X_train.shape

(4749,)

In [9]:
# Size of the held-out test portion of the comments.
X_test.shape

(1188,)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Experiment 1: bag of tri-grams -> random forest, trained on the raw comments.
clf = Pipeline([
    # ngram_range=(3, 3) keeps only tri-grams (no uni- or bi-grams).
    ('vectorizer_tri_grams', CountVectorizer(ngram_range=(3, 3))),
    # Seeded so the report is reproducible; the output recorded under this
    # cell predates the seed and may differ slightly.
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.26      0.35       400
           1       0.37      0.80      0.50       388
           2       0.54      0.22      0.31       400

    accuracy                           0.42      1188
   macro avg       0.49      0.42      0.39      1188
weighted avg       0.49      0.42      0.39      1188



In [11]:
from sklearn.naive_bayes import MultinomialNB

# Experiment 2: uni-grams + bi-grams -> multinomial naive Bayes, raw comments.
clf = Pipeline([
    # The step was previously labelled 'vectorizer_tri_grams' although
    # ngram_range=(1, 2) produces uni- and bi-grams; the label now matches.
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('multi_nb', MultinomialNB()),
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       400
           1       0.87      0.83      0.85       388
           2       0.83      0.88      0.85       400

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



In [12]:
# Experiment 3: uni-grams + bi-grams -> random forest, raw comments.
clf = Pipeline([
    # Step labels corrected: the vectorizer was labelled 'Random Forest' and
    # the forest 'multi_nb', which made the pipeline repr misleading.
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    # Seeded so the report is reproducible; the output recorded under this
    # cell predates the seed and may differ slightly.
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.95      0.90       400
           1       0.94      0.88      0.91       388
           2       0.92      0.86      0.89       400

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Experiment 4: TF-IDF weighted terms -> random forest, raw comments.
clf = Pipeline([
    # TfidfVectorizer is used with its defaults here (no ngram_range is set).
    ('Tf-Idf-Vectorizer', TfidfVectorizer()),
    # Label corrected from 'multi_nb': this step is a random forest.
    # Seeded so the report is reproducible; the output recorded under this
    # cell predates the seed and may differ slightly.
    ('random_forest', RandomForestClassifier(random_state=2022)),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       400
           1       0.91      0.90      0.91       388
           2       0.93      0.85      0.89       400

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



In [14]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once at module level; preprocess()
# reuses this object for every comment instead of reloading the model.
nlp = spacy.load("en_core_web_sm") 
def preprocess(text):
    """Return ``text`` reduced to the space-joined lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or as punctuation are dropped; every remaining
    token contributes its lemma, in original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [15]:
# Store the stop-word-free, lemmatised version of every comment in a new
# column; the original Comment column is left untouched.
data["preprocessed_comments"] = data["Comment"].apply(preprocess)

In [16]:
# Compare the raw Comment text with the new preprocessed_comments column.
data.head()

Unnamed: 0,Comment,Emotion,Emotion_num,preprocessed_comments
0,i seriously hate one subject to death but now ...,fear,1,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,2,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,1,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,0,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,1,feel suspicious outside like rapture happen


In [23]:
# Re-split using the preprocessed text as features. The test fraction, seed
# and stratification match the split of the raw comments, so only the input
# text differs between the two sets of experiments.
X_train, X_test, y_train, y_test = train_test_split(
    data["preprocessed_comments"],
    data["Emotion_num"],
    stratify=data["Emotion_num"],
    test_size=0.2,
    random_state=2022,
)

In [24]:
# Size of the training portion of the preprocessed comments.
X_train.shape

(4749,)

In [25]:
# Training labels: length must match X_train.
y_train.shape

(4749,)

In [26]:
# Peek at the first five preprocessed training comments.
X_train[:5]

4574    feel highly intimidate fluster not form word e...
2534              remember feel bit confused question say
5492                           bed look mirror feel brave
5393    stop feel mad machine steal money choose inste...
4311    ve hear thing happen singapore get feel irrita...
Name: preprocessed_comments, dtype: object

In [27]:
# Labels for the same first five rows; the index values should line up
# with those shown for X_train[:5].
y_train[:5]

4574    1
2534    1
5492    0
5393    2
4311    2
Name: Emotion_num, dtype: int64

In [29]:
# Size of the held-out test portion of the preprocessed comments.
X_test.shape

(1188,)

In [30]:
# Test labels: length must match X_test.
y_test.shape

(1188,)

In [32]:
# Experiment 5: uni-grams + bi-grams -> random forest, preprocessed comments.
clf = Pipeline(
    [
        # Step name fixed: it previously carried a trailing space
        # ("CountVectorizer "), which is easy to miss when looking the
        # step up by name.
        ("CountVectorizer", CountVectorizer(ngram_range=(1, 2))),
        # Seeded so the report is reproducible; the output recorded under
        # this cell predates the seed and may differ slightly.
        ("RandomForestClassifier", RandomForestClassifier(random_state=2022)),
    ]
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       400
           1       0.94      0.91      0.93       388
           2       0.92      0.94      0.93       400

    accuracy                           0.94      1188
   macro avg       0.94      0.93      0.94      1188
weighted avg       0.94      0.94      0.94      1188



In [33]:
# Experiment 6: TF-IDF weighted terms -> random forest, preprocessed comments.
clf = Pipeline(
    [
        ("tf-idf-vectorizer", TfidfVectorizer()),
        # Seeded so the report is reproducible; the output recorded under
        # this cell predates the seed and may differ slightly.
        ("RandomForestClassifier", RandomForestClassifier(random_state=2022)),
    ]
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       400
           1       0.93      0.92      0.92       388
           2       0.94      0.92      0.93       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

