In [68]:
import pandas as pd

In [69]:
# Each file holds one "<comment>;<emotion>" record per line and has no header row,
# so the column names are supplied explicitly.
train, test = (
    pd.read_csv(path, sep=';', header=None, names=['comment', 'emotion'])
    for path in ('train.txt', 'test.txt')
)
# Stack both parts into a single frame with a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True)

In [70]:
# Rows per emotion label in the combined data -- the classes are imbalanced.
df['emotion'].value_counts()

joy         6057
sadness     5247
anger       2434
fear        2161
love        1463
surprise     638
Name: emotion, dtype: int64

For now, remove the `surprise` class: it is by far the smallest one (638 samples).

In [71]:
def remove_surprise_emotion(df, target_emotion):
    """Return the rows of ``df`` whose 'emotion' value differs from ``target_emotion``.

    Despite the function name, any label can be passed as ``target_emotion``.
    The input frame is not modified and the surviving rows keep their
    original index labels.
    """
    keep_mask = df['emotion'] != target_emotion
    return df.loc[keep_mask]

In [72]:
# Drop every row labelled 'surprise'; `df` itself is left unchanged.
df_filtered = remove_surprise_emotion(df, target_emotion='surprise')

In [73]:
# Rows per emotion label after filtering -- 'surprise' should no longer appear.
df_filtered['emotion'].value_counts()

joy        6057
sadness    5247
anger      2434
fear       2161
love       1463
Name: emotion, dtype: int64

In [74]:
# Preview the filtered frame (columns: comment, emotion).
df_filtered

Unnamed: 0,comment,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
17995,i just keep feeling like someone is being unki...,anger
17996,im feeling a little cranky negative after this...,anger
17997,i feel that i am useful to my people and that ...,joy
17998,im feeling more comfortable with derby i feel ...,joy


In [75]:
# Encode each emotion label as an integer class id in a new 'emotion_num' column.
# `assign` builds a new frame, so the object returned by the filtering step is
# never written to.
df_filtered = df_filtered.assign(
    emotion_num=df_filtered['emotion'].map(
        {'joy': 0, 'fear': 1, 'anger': 2, 'sadness': 3, 'love': 4}
    )
)

df_filtered

Unnamed: 0,comment,emotion,emotion_num
0,i didnt feel humiliated,sadness,3
1,i can go from feeling so hopeless to so damned...,sadness,3
2,im grabbing a minute to post i feel greedy wrong,anger,2
3,i am ever feeling nostalgic about the fireplac...,love,4
4,i am feeling grouchy,anger,2
...,...,...,...
17995,i just keep feeling like someone is being unki...,anger,2
17996,im feeling a little cranky negative after this...,anger,2
17997,i feel that i am useful to my people and that ...,joy,0
17998,im feeling more comfortable with derby i feel ...,joy,0


Baseline: models trained on the raw comments (no text pre-processing)

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw comments for evaluation. Stratifying on the label keeps
# the class proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered['comment'],
    df_filtered['emotion_num'],
    test_size=0.2,
    stratify=df_filtered['emotion_num'],
    random_state=2022,
)

In [77]:
# Sizes of the training and test parts, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(13889,)
(3473,)


In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; this single instance is passed as the
# 'vectorizer_tfidf' step of the TF-IDF pipelines built in the cells below.
v = TfidfVectorizer()
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# TF-IDF features -> random forest, trained on the raw (unprocessed) comments.
clf = Pipeline([
     ('vectorizer_tfidf',v),
     # Seed the forest (same seed as the train/test split) so the report below
     # can be reproduced on a fresh run; an unseeded forest gives slightly
     # different scores every time the cell is executed.
     ('Random Forest', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out comments.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1212
           1       0.89      0.83      0.86       432
           2       0.86      0.77      0.81       487
           3       0.91      0.87      0.89      1049
           4       0.85      0.67      0.75       293

    accuracy                           0.86      3473
   macro avg       0.86      0.81      0.83      3473
weighted avg       0.86      0.86      0.85      3473



In [80]:
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features -> multinomial naive Bayes, trained on the raw comments.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', v),
    ('Multi NB', MultinomialNB()),
])

# `fit` returns the fitted pipeline itself, so training and prediction are chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out comments.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.98      0.73      1212
           1       1.00      0.11      0.20       432
           2       0.95      0.15      0.27       487
           3       0.72      0.90      0.80      1049
           4       1.00      0.02      0.05       293

    accuracy                           0.65      3473
   macro avg       0.85      0.43      0.41      3473
weighted avg       0.76      0.65      0.56      3473



Count vectorizer (tri-grams only) + Random Forest


In [81]:
# 1. build the pipeline: tri-gram counts -> random forest
clf = Pipeline([
    # ngram_range=(3, 3) keeps only three-word sequences as features.
    ('vectorizer_tri_grams', CountVectorizer(ngram_range = (3, 3))),
    # Seeded (same seed as the train/test split) so the report is reproducible.
    ('random_forest', (RandomForestClassifier(random_state=2022)))
])

# 2. fit with X_train and y_train
clf.fit(X_train, y_train)


# 3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)


# 4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.64      0.56      1212
           1       0.24      0.51      0.32       432
           2       0.43      0.25      0.32       487
           3       0.57      0.34      0.43      1049
           4       0.53      0.10      0.16       293

    accuracy                           0.43      3473
   macro avg       0.45      0.37      0.36      3473
weighted avg       0.48      0.43      0.42      3473



After preprocessing (stop words and punctuation removed, tokens lemmatized)

In [82]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm"; `preprocess` below calls it on each text.
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Return ``text`` reduced to the lemmas of its informative tokens.

    The text is run through the module-level ``nlp`` pipeline. Tokens flagged
    as stop words or punctuation are dropped, and the lemmas of the remaining
    tokens are joined with single spaces, in their original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [83]:
# Run `preprocess` on every comment and store the result in a new column.
# `preprocess` invokes the spaCy pipeline once per row.
df_filtered['preprocessed_comment'] = df_filtered['comment'].map(preprocess)

In [84]:
# Show the first rows to compare each raw comment with its preprocessed version.
df_filtered.head()

Unnamed: 0,comment,emotion,emotion_num,preprocessed_comment
0,i didnt feel humiliated,sadness,3,not feel humiliate
1,i can go from feeling so hopeless to so damned...,sadness,3,feel hopeless damned hopeful care awake
2,im grabbing a minute to post i feel greedy wrong,anger,2,m grab minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,4,feel nostalgic fireplace know property
4,i am feeling grouchy,anger,2,feel grouchy


In [85]:
# Same 80/20 stratified split and seed as for the raw text, but this time the
# features are the preprocessed comments.
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered['preprocessed_comment'],
    df_filtered['emotion_num'],
    test_size=0.2,
    stratify=df_filtered['emotion_num'],
    random_state=2022,
)

CountVectorizer (uni-grams + bi-grams) + Random Forest

In [86]:
# 1. build the pipeline: uni-gram + bi-gram counts -> random forest
clf = Pipeline([
    # NOTE: the step is called 'bi_grams', but ngram_range=(1, 2) counts single
    # words as well as two-word sequences.
    ('vectorizer_bi_grams', CountVectorizer(ngram_range = (1, 2))),
    # Seeded (same seed as the train/test split) so the report is reproducible.
    ('random_forest', (RandomForestClassifier(random_state=2022)))
])

# 2. fit with X_train and y_train
clf.fit(X_train, y_train)


# 3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)


# 4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90      1212
           1       0.94      0.88      0.91       432
           2       0.86      0.86      0.86       487
           3       0.88      0.93      0.91      1049
           4       0.76      0.73      0.74       293

    accuracy                           0.89      3473
   macro avg       0.87      0.86      0.86      3473
weighted avg       0.89      0.89      0.89      3473



TF-IDF + Random Forest Classifier

In [87]:
# TF-IDF features -> random forest, trained on the preprocessed comments.
clf = Pipeline([
     # `v` is the TfidfVectorizer created earlier with default settings
     # (no ngram_range is set on it).
     ('vectorizer_tfidf',v),
     # Seeded (same seed as the train/test split) so the report is reproducible.
     ('Random Forest', RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out comments.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1212
           1       0.89      0.87      0.88       432
           2       0.85      0.86      0.86       487
           3       0.91      0.88      0.90      1049
           4       0.80      0.67      0.73       293

    accuracy                           0.87      3473
   macro avg       0.86      0.84      0.85      3473
weighted avg       0.87      0.87      0.87      3473

