<a href="https://colab.research.google.com/github/robitussin/CMSCSNLP/blob/main/emotiondetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[?25l[K     |█▉                              | 10 kB 18.4 MB/s eta 0:00:01[K     |███▊                            | 20 kB 8.1 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 5.5 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 5.2 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 4.0 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 4.7 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 5.1 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 4.8 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 5.4 MB/s eta 0:00:01[K     |████████████████████▌           | 112 kB 5.4 MB/s eta 0:00:01[K     |██████████████████████▍         | 122 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████▎       | 133 kB 5.4 MB/s eta 0:00:01[K     |██████████

Import all dependencies and a list of tagalog stop words

In [None]:
import pandas as pd
import numpy as np
import re, emoji, os, random
import nltk as nltk
nltk.download('punkt')
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score

# Tagalog stop word list 
stop_words = ['akin','aking','ako','alin','am','amin','aming','ang','ano','anumang','apat','at','atin','ating','ay','bababa','bago','bakit','bawat','bilang','dahil',
             'dalawa','dapat','din','dito','doon','gagawin','gayunman','ginagawa','ginawa','ginawang','gumawa','gusto','habang','hanggang','hindi','huwag','iba','ibaba',
             'ibabaw','ibig','ikaw','ilagay','ilalim','ilan','inyong','isa','isang','itaas','ito','iyo','iyon','iyong','ka','kahit','kailangan','kailanman','kami','kanila',
             'kanilang','kanino','kanya','kanyang','kapag','kapwa','karamihan','katiyakan','katulad','kaya','kaysa','ko','kong','kulang','kumuha','kung','laban','lahat','lamang',
             'likod','lima','maaari','maaaring','maging','mahusay','makita','marami','marapat','masyado','may','mayroon','mga','minsan','mismo','mula','muli','na','nabanggit','naging',
             'nagkaroon','nais','nakita','namin','napaka','narito','nasaan','ng','ngayon','ni','nila','nilang','nito','niya','niyang','noon','o','pa','paano','pababa','paggawa','pagitan',
             'pagkakaroon','pagkatapos','palabas','pamamagitan','panahon','pangalawa','para','paraan','pareho','pataas','pero','pumunta','pumupunta','sa','saan','sabi','sabihin','sarili','sila','sino','siya','tatlo','tayo','tulad','tungkol','una','walang']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Load Data Set from github repository

In [None]:
url = 'https://github.com/robitussin/emotiondetection/blob/main/datasets/mergedset.xlsx?raw=true'
dataset = pd.read_excel(url)

Utility functions

1.   A function for text cleaning to remove whitespaces and non-alphabetical characters
2.   A function to insert punctuation marks for data augmentation
3.   A function to count the number of word per comment



In [None]:
# Remove any other characters other than alphabetical characters
# Remove white spaces
def cleaner(text):
    text = re.sub('[^a-zA-Z]', '', str(text))
    text = re.sub(' +', ' ', str(text))

    cleaned_text = text.strip()
    return cleaned_text

def word_count_per_doc(text):
	tokenized = word_tokenize(cleaner(text))
	return len(tokenized)
 
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
PUNC_RATIO = 0.3

# Data Augmentation Technique
def insert_punctuation_marks(sentence, punc_ratio=PUNC_RATIO):
	words = sentence.split(' ')
	new_line = []
	q = random.randint(1, int(punc_ratio * len(words) + 1))
	qs = random.sample(range(0, len(words)), q)

	for j, word in enumerate(words):
		if j in qs:
			new_line.append(PUNCTUATIONS[random.randint(0, len(PUNCTUATIONS)-1)])
			new_line.append(word)
		else:
			new_line.append(word)
	new_line = ' '.join(new_line)
	return new_line

Cleaning Process


1. Duplicates were removed
2. Rows with NULL values were removed
3. All comments were converted to lower text
4. Relabeled emotions
5. Renamed column names


In [None]:
# Remove duplicates
dataset = dataset.drop_duplicates(subset=['COMMENTS'])

# Remove rows with NULL value
dataset = dataset.dropna().reset_index(drop=True)

# Convert all text to lower case
dataset = dataset.apply(lambda x: x.astype(str).str.lower())

# Re label misspelled and similar labels
dataset["MAJORITY"].replace({"sad": "sadness", "0": "none", "digust": "disgust"}, inplace=True)

dataset = dataset.rename(columns={"COMMENTS": "comments", "MAJORITY": "label"})

print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17771 entries, 0 to 17770
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  17771 non-null  object
 1   label     17771 non-null  object
dtypes: object(2)
memory usage: 277.8+ KB
None


Get total number of rows for each label/emotion

In [None]:
class_none, class_sadness, class_anger, class_joy, class_fear, class_surprise, class_disgust = dataset.label.value_counts()

In [None]:
dataset_none = dataset.loc[dataset['label'] == "none"]
dataset_sadness = dataset.loc[dataset['label'] == "sadness"]
dataset_anger = dataset.loc[dataset['label'] == "anger"]
dataset_joy = dataset.loc[dataset['label'] == "joy"]
dataset_fear = dataset.loc[dataset['label'] == "fear"]
dataset_surprise = dataset.loc[dataset['label'] == "surprise"]
dataset_disgust = dataset.loc[dataset['label'] == "disgust"]

Check the number of records for each class/emotion. Choose to either under sample (downsize) or over sample (augment) data.

In [None]:
print(dataset['label'].value_counts())

none        4974
sadness     4749
anger       4168
joy         3344
fear         214
surprise     194
disgust      128
Name: label, dtype: int64


Important Note: only choose to under sample or over sample when doing an experiment. Do not run both undersample and oversample cells

UnderSample
- Get the label/emotion with the least number of rows in the data set. Downsize all other classes/emotions to make them all have an equal number of rows.

In [None]:
sampleCount = class_disgust;

dataset_none_under = dataset_none.sample(sampleCount)
dataset_sadness_under = dataset_sadness.sample(sampleCount)
dataset_anger_under = dataset_anger.sample(sampleCount)
dataset_joy_under = dataset_joy.sample(sampleCount)
dataset_fear_under = dataset_fear.sample(sampleCount)
dataset_surprise_under = dataset_surprise.sample(sampleCount)

balanced_dataset = pd.concat([dataset_disgust, dataset_none_under, dataset_sadness_under, dataset_anger_under, dataset_joy_under, dataset_fear_under, dataset_surprise_under], ignore_index=True)

print(balanced_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896 entries, 0 to 895
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  896 non-null    object
 1   label     896 non-null    object
dtypes: object(2)
memory usage: 14.1+ KB
None


OverSample
- Get the label/emotion with the largest number of rows in the dataset. Augment all other classes/emotions to make them all have an equal amount of rows. 

In [None]:
#sampleCount = class_none;

# Value hardcoded to prevent exceeding RAM usage
sampleCount = 3000; 

dataset_disgust_over = dataset_disgust.sample(sampleCount, replace=True)
dataset_sadness_over  = dataset_sadness.sample(sampleCount, replace=True)
dataset_anger_over  = dataset_anger.sample(sampleCount, replace=True)
dataset_joy_over  = dataset_joy.sample(sampleCount, replace=True)
dataset_fear_over  = dataset_fear.sample(sampleCount, replace=True)
dataset_surprise_over  = dataset_surprise.sample(sampleCount, replace=True)

balanced_dataset = pd.concat([dataset_none, dataset_disgust_over, dataset_sadness_over, dataset_anger_over, dataset_joy_over, dataset_fear_over, dataset_surprise_over], ignore_index=True)

print(balanced_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22974 entries, 0 to 22973
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  22974 non-null  object
 1   label     22974 non-null  object
dtypes: object(2)
memory usage: 359.1+ KB
None


Shuffle data set and insert punctuation marks for data augmentation

In [None]:
balanced_dataset = shuffle(balanced_dataset)
dataset_X = balanced_dataset[['comments']]
dataset_X = dataset_X['comments'].apply(insert_punctuation_marks).to_frame()
dataset_y = balanced_dataset['label']

Split data set to train and test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataset_X, dataset_y, test_size=0.20, random_state=1)

Split data set to test and validation

In [None]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.50, random_state=1)

In [None]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)

y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

Traditional Features

1. Feature to check angry emojis
2. Feature to check sad emojis
3. Feature to check joyful emojis
4. Feature to check emojis showing disgust
5. Feature to check emojis showing fear
6. Feature to check emojis showing surprise
7. Word Frequency feature
8. Vowel count feature
9. Consonant count feature

In [None]:
def check_angry_emojis(comment):
   emojis = ''.join(character for character in comment if character in emoji.UNICODE_EMOJI['en'])

   line = ["🖕", "💩", "😤", "😡", "😠", "🤬"]
   for character in emojis:
	   if character in line:
		   return 1
   return 0

def check_sad_emojis(comment):
   emojis = ''.join(character for character in comment if character in emoji.UNICODE_EMOJI['en'])

   line = ["😟", "🙁	", "☹", "😡", "😰", "😥", "😢", "😭", "😓", "💔"]
   for character in emojis:
	   if character in line:
		   return 1
   return 0

def check_joy_emojis(comment):
   emojis = ''.join(character for character in comment if character in emoji.UNICODE_EMOJI['en'])

   line = ["😀", "😃", "😄", "😁", "😆", "🤣", "😂", "🙂", "😊", "😇", "🥰", "😍", "🤗", "❤"]
   for character in emojis:
	   if character in line:
		   return 1
   return 0

def check_disgust_emojis(comment):
   emojis = ''.join(character for character in comment if character in emoji.UNICODE_EMOJI['en'])

   line = ["🤢", "🤮"]
   for character in emojis:
	   if character in line:
		   return 1
   return 0

def check_fear_emojis(comment):
   emojis = ''.join(character for character in comment if character in emoji.UNICODE_EMOJI['en'])

   line = ["😨", "😰", "😱"]
   for character in emojis:
	   if character in line:
		   return 1
   return 0

def check_surprise_emojis(comment):
   emojis = ''.join(character for character in comment if character in emoji.UNICODE_EMOJI['en'])

   line = ["😮", "😯", "😲", "😳"]
   for character in emojis:
	   if character in line:
		   return 1
   return 0

def wordFrequency(sentences):
	sentences = list(sentences)
	sentences = [word_tokenize(sentence) for sentence in sentences]
	for i in range(len(sentences)):
			sentences[i] = [word for word in sentences[i] if word not in stop_words]
	return sentences

def vowel_count(text):
	syllable_counts = 0
	for char in text:
		if char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'A' or char == 'E' or char == 'I' or char == 'O' or char == 'U':
			syllable_counts += 1
	return syllable_counts


def consonant_count(article):
    article = article.lower()
    total_consonant = 0

    for i in article:
        if i == 'b' or i == 'c' or i == 'd' or i == 'f' or i == 'g' \
                or i == 'h' or i == 'j' or i == 'k' or i == 'l' \
                or i == 'm' or i == 'n' or i == 'p' or i == 'q' \
                or i == 'r' or i == 's' or i == 't' or i == 'v' \
                or i == 'w' or i == 'x' or i == 'y' or i == 'z':
            total_consonant = total_consonant + 1;

    return total_consonant

Ortography Features

1. Consonant cluster feature



In [None]:
def get_consonant_cluster(text):
    cleaned = cleaner(text)
    word_count = word_count_per_doc(text)

    pattern = "([bcdfghjklmnpqrstvwxyz]{1}[bcdfghjklmnpqrstvwxyz]{1}[bcdfghjklmnpqrstvwxyz]*)"
    matches = len(re.findall(pattern, cleaned))

    result = 0;
    if word_count > 0:
        matches / word_count

    return result

Morphological Features

1. Auxiliary verb ratio
2. Lexical Density feature




In [None]:
def aux_verb_ratio(text):
    splitted = re.split('[?.]+', text)
    splitted = [i for i in splitted if i]   #removes empty strings in list

    word_count = word_count_per_doc(text)

    verb_counter = 0
    aux_verbs = 0
    for i in splitted:
        i = i.strip()
        tagged_text = pos_tagger.tag(word_tokenize(i))
        for x in tagged_text:
            if '|' not in x[0]:
                pos = x[1].split('|')[1]
                #print(pos)
                if pos[:2] == 'VB':
                    verb_counter += 1
                if pos == 'VBS':
                    aux_verbs += 1

    if word_count == 0:
        return 0

    return (aux_verbs/word_count)

def lexical_density(text):
    splitted = re.split('[?.]+', text)
    splitted = [i for i in splitted if i]   #removes empty strings in list

    lexical_item_counter = 0
    for i in splitted:
        i = i.strip()
        tagged_text = pos_tagger.tag(word_tokenize(i))
        for x in tagged_text:
            if '|' not in x[0]:
                pos = x[1].split('|')[1]
                if pos[:2] == 'VB' or pos[:2] == 'NN' or pos[:2] == 'JJ' or pos[:2] == 'RB':
                    lexical_item_counter += 1

    word_count = word_count_per_doc(text)
    print("Word Count:",word_count)
    if word_count == 0:
        return 0
    return (lexical_item_counter/word_count_per_doc(text))

Feature Extraction of Training Set. Get 10 features



In [None]:
vectorizer = CountVectorizer()
vectorizer.fit_transform(X_train['comments'])

# Feature 1 - Word Frequency
X_f1 = X_train['comments'].apply(wordFrequency)
X_f1 = vectorizer.transform(X_train['comments'])
X_f1 = pd.DataFrame(X_f1.toarray())

# Feature 2 - Emojis(Sad)
X_f2 = X_train['comments'].apply(check_sad_emojis)

# Feature 3 - Emojis(Angry)
X_f3 = X_train['comments'].apply(check_angry_emojis)

# Feature 4 - Emojis(Joy)
X_f4 = X_train['comments'].apply(check_joy_emojis)

# Feature 5 - Emojis(Disgust)
X_f5 = X_train['comments'].apply(check_disgust_emojis)

# Feature 6 - Emojis(Fear)
X_f6 = X_train['comments'].apply(check_fear_emojis)

# Feature 7 - Emojis(Surprise)
X_f7 = X_train['comments'].apply(check_surprise_emojis)

# Feature 8 - Vowel Count
X_f8 = X_train['comments'].apply(vowel_count)

# Feature 9 - Consonant Count
X_f9 = X_train['comments'].apply(consonant_count)

# Feature 10 - Consonant Cluster
X_f10 = X_train['comments'].apply(get_consonant_cluster)

# Concatenate all features
collected_features_train = pd.concat([X_f1, X_f2, X_f3, X_f4, X_f5, X_f6, X_f7, X_f8, X_f9, X_f10], axis=1)
collected_features_train = collected_features_train.to_numpy();

Feature Extraction of Validation Set

In [None]:
# Feature 1 - Word Frequency
X_f1 = X_val['comments'].apply(wordFrequency)
X_f1 = vectorizer.transform(X_val['comments'])
X_f1 = pd.DataFrame(X_f1.toarray())

# Feature 2 - Emojis(Sad)
X_f2 = X_val['comments'].apply(check_sad_emojis)

# Feature 3 - Emojis(Angry)
X_f3 = X_val['comments'].apply(check_angry_emojis)

# Feature 4 - Emojis(Joy)
X_f4 = X_val['comments'].apply(check_joy_emojis)

# Feature 5 - Emojis(Disgust)
X_f5 = X_val['comments'].apply(check_disgust_emojis)

# Feature 6 - Emojis(Fear)
X_f6 = X_val['comments'].apply(check_fear_emojis)

# Feature 7 - Emojis(Surprise)
X_f7 = X_val['comments'].apply(check_surprise_emojis)

# Feature 8 - Vowel Count
X_f8 = X_val['comments'].apply(vowel_count)

# Feature 9 - Consonant Count
X_f9 = X_val['comments'].apply(consonant_count)

# Feature 10 - Consonant Cluster
X_f10 = X_val['comments'].apply(get_consonant_cluster)

# Concatenate all features
collected_features_val = pd.concat([X_f1, X_f2, X_f3, X_f4, X_f5, X_f6, X_f7, X_f8, X_f9, X_f10], axis=1)
collected_features_val = collected_features_val.to_numpy();

Convert to Array

In [None]:
y_train = y_train.to_numpy();
y_train = np.squeeze(y_train)

K Nearest Neighbor (Validation data)

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors = 5)

Fit model and predict using KNN

In [None]:
# K Nearest Neighbor
knn_clf.fit(collected_features_train,y_train)

y_pred = knn_clf.predict(collected_features_val)

print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

              precision    recall  f1-score   support

       anger       0.42      0.51      0.46       296
     disgust       0.88      1.00      0.93       302
        fear       0.74      1.00      0.85       308
         joy       0.45      0.41      0.43       297
        none       0.42      0.31      0.35       485
     sadness       0.47      0.26      0.33       312
    surprise       0.77      1.00      0.87       298

    accuracy                           0.61      2298
   macro avg       0.59      0.64      0.60      2298
weighted avg       0.58      0.61      0.58      2298

[[151   7  15  20  67  25  11]
 [  0 302   0   0   0   0   0]
 [  0   0 308   0   0   0   0]
 [ 44   7  29 122  59  25  11]
 [104  22  37  96 148  40  38]
 [ 58   7  25  35  79  80  28]
 [  0   0   0   0   0   0 298]]


Multinomial Naive Bayes (Validation data)

In [None]:
mnb_clf = MultinomialNB(alpha=1.0)

Fit model and predict using multinomail naive bayes

In [None]:
# Multinomial Naive Bayes
mnb_clf.fit(collected_features_train, y_train)

y_pred = mnb_clf.predict(collected_features_val)

print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

              precision    recall  f1-score   support

       anger       0.63      0.81      0.71       296
     disgust       0.94      0.98      0.96       302
        fear       0.91      0.94      0.93       308
         joy       0.67      0.74      0.70       297
        none       0.71      0.54      0.61       485
     sadness       0.73      0.67      0.70       312
    surprise       0.91      0.92      0.92       298

    accuracy                           0.78      2298
   macro avg       0.79      0.80      0.79      2298
weighted avg       0.78      0.78      0.78      2298

[[239   4   5   3  33  11   1]
 [  7 295   0   0   0   0   0]
 [  2   2 289   1   3  11   0]
 [ 11   3   3 221  37   9  13]
 [ 65   7  13  83 263  42  12]
 [ 51   2   3  14  33 208   1]
 [  2   1   3  10   3   4 275]]


Decision Tree (Validation data)

In [None]:
dt_clf = DecisionTreeClassifier()

Fit model and predct using decision tree

In [None]:
# Decision Tree
dt_clf.fit(collected_features_train, y_train)

y_pred = dt_clf.predict(collected_features_val)

print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))


              precision    recall  f1-score   support

       anger       0.71      0.68      0.70       296
     disgust       0.99      1.00      0.99       302
        fear       0.94      1.00      0.97       308
         joy       0.71      0.73      0.72       297
        none       0.63      0.60      0.61       485
     sadness       0.74      0.72      0.73       312
    surprise       0.95      1.00      0.98       298

    accuracy                           0.80      2298
   macro avg       0.81      0.82      0.81      2298
weighted avg       0.80      0.80      0.80      2298

[[202   0   5   7  59  22   1]
 [  0 302   0   0   0   0   0]
 [  0   0 308   0   0   0   0]
 [  4   1   4 217  59   9   3]
 [ 58   3   9  66 290  49  10]
 [ 20   0   2  14  51 224   1]
 [  0   0   0   0   0   0 298]]


Predict emotion/label using test data

In [None]:
# Feature 1 - Word Frequency
X_f1 = X_test['comments'].apply(wordFrequency)
X_f1 = vectorizer.transform(X_test['comments'])
X_f1 = pd.DataFrame(X_f1.toarray())

# Feature 2 - Emojis(Sad)
X_f2 = X_test['comments'].apply(check_sad_emojis)

# Feature 3 - Emojis(Angry)
X_f3 = X_test['comments'].apply(check_angry_emojis)

# Feature 4 - Emojis(Joy)
X_f4 = X_test['comments'].apply(check_joy_emojis)

# Feature 5 - Emojis(Disgust)
X_f5 = X_test['comments'].apply(check_disgust_emojis)

# Feature 6 - Emojis(Fear)
X_f6 = X_test['comments'].apply(check_fear_emojis)

# Feature 7 - Emojis(Surprise)
X_f7 = X_test['comments'].apply(check_surprise_emojis)

# Feature 8 - Vowel Count
X_f8 = X_test['comments'].apply(vowel_count)

# Feature 9 - Consonant Count
X_f9 = X_test['comments'].apply(consonant_count)

# Feature 10 - Consonant Cluster
X_f10 = X_test['comments'].apply(get_consonant_cluster)

# Concatenate all features
collected_features_test = pd.concat([X_f1, X_f2, X_f3, X_f4, X_f5, X_f6, X_f7, X_f8, X_f9, X_f10], axis=1)
collected_features_test = collected_features_test.to_numpy();

K Nearest Neighbor (Test data)

In [None]:
y_pred = knn_clf.predict(collected_features_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.43      0.47      0.45       301
     disgust       0.87      1.00      0.93       299
        fear       0.74      1.00      0.85       273
         joy       0.45      0.43      0.44       295
        none       0.46      0.34      0.39       509
     sadness       0.50      0.30      0.38       306
    surprise       0.76      1.00      0.87       314

    accuracy                           0.62      2297
   macro avg       0.60      0.65      0.61      2297
weighted avg       0.59      0.62      0.59      2297

[[142   5  20  25  64  27  18]
 [  0 299   0   0   0   0   0]
 [  0   0 273   0   0   0   0]
 [ 40  11  14 126  67  14  23]
 [ 96  23  40  93 171  51  35]
 [ 53   7  23  36  72  93  22]
 [  0   0   0   0   0   0 314]]


Multinomial Naive Bayes  (Test data)

In [None]:
y_pred = mnb_clf.predict(collected_features_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.65      0.82      0.72       301
     disgust       0.91      0.98      0.94       299
        fear       0.89      0.94      0.91       273
         joy       0.70      0.71      0.71       295
        none       0.74      0.59      0.65       509
     sadness       0.71      0.64      0.67       306
    surprise       0.91      0.94      0.93       314

    accuracy                           0.78      2297
   macro avg       0.79      0.80      0.79      2297
weighted avg       0.78      0.78      0.78      2297

[[247   4   6   2  29  11   2]
 [  5 294   0   0   0   0   0]
 [  1   0 256   1   2  13   0]
 [ 13   3   5 209  40  16   9]
 [ 70  15  14  60 299  38  13]
 [ 42   6   7  15  34 197   5]
 [  3   2   0  10   0   3 296]]


Decision Tree  (Test data)

In [None]:
# Decision Trees
y_pred = dt_clf.predict(collected_features_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.68      0.71      0.69       301
     disgust       0.99      1.00      0.99       299
        fear       0.94      1.00      0.97       273
         joy       0.71      0.75      0.73       295
        none       0.67      0.61      0.64       509
     sadness       0.74      0.67      0.70       306
    surprise       0.94      1.00      0.97       314

    accuracy                           0.80      2297
   macro avg       0.81      0.82      0.81      2297
weighted avg       0.80      0.80      0.80      2297

[[213   1   2  11  58  16   0]
 [  0 299   0   0   0   0   0]
 [  0   0 273   0   0   0   0]
 [ 12   0   3 221  47   8   4]
 [ 60   3   8  65 311  49  13]
 [ 27   0   5  14  51 206   3]
 [  0   0   0   0   0   0 314]]
