# Random Forest - Implementation

In [61]:
import numpy as np
import pandas as pd
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os

## Read data

In [62]:
file_path = os.path.join("Data", "track-a.csv")
dataframe = pd.read_csv(file_path)
dataframe.head()

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_train_track_a_00001,"Colorado, middle of nowhere.",0,1,0,0,1
1,eng_train_track_a_00002,This involved swimming a pretty large lake tha...,0,1,0,0,0
2,eng_train_track_a_00003,It was one of my most shameful experiences.,0,1,0,1,0
3,eng_train_track_a_00004,"After all, I had vegetables coming out my ears...",0,0,0,0,0
4,eng_train_track_a_00005,Then the screaming started.,0,1,0,1,1


## Data Pre-processing

### Stop word removal

In [63]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

dataframe['text'] = dataframe['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tushar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lemmatization

In [64]:
word_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
# TODO: Lot of punctuations (remove)

def lemmatize_text(txt):
    out = ""
    for word in word_tokenizer.tokenize(txt):
        out = out + lemmatizer.lemmatize(word) + " "
    return out

dataframe["text"] = dataframe.text.apply(lemmatize_text)



### Train-Test Split

In [65]:
from sklearn.feature_extraction.text import CountVectorizer 

X_texts = dataframe['text']
vectorizer = CountVectorizer()
X_texts_vec = vectorizer.fit_transform(X_texts)


emotions = ["anger", "fear", "joy", "sadness", "surprise"]
Y_emotions = dataframe[emotions]


X_train_text, X_test_text, Y_train_labels, Y_test_labels = train_test_split(X_texts_vec, Y_emotions, test_size = 0.1)

## Random Forest Classification

In [66]:
from sklearn.ensemble import RandomForestClassifier


In [67]:
model = RandomForestClassifier(n_estimators=10,max_features=4,random_state=101)

In [68]:
model.fit(X_train_text,Y_train_labels)

0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [69]:
predictions = model.predict(X_test_text)

### Evaluation

In [70]:
from sklearn.metrics import multilabel_confusion_matrix

In [71]:
confusion_mat = multilabel_confusion_matrix(Y_test_labels,predictions)

In [72]:
def accuracy(tp, tn, fp, fn):
    return (tp + tn) / (tp + tn + fp + fn)


def precision(tp, fp):
    return tp / (tp + fp)


def recall(tp, fn):
    return tp / (tp + fn)

def f1_score(tp,fp,fn):
    precision_val = precision(tp,fp)
    recall_val = recall(tp,fn)
    return 2 * (precision_val * recall_val) / (precision_val + recall_val)

In [73]:
def print_eval(title, accuracy, precision, recall, f1_score):
    print(f"{title}\n")
    print(f"accuracy: {round(accuracy,2)}")
    print(f"precision: {round(precision,2)}")
    print(f"recall: {round(recall,2)}")
    print(f"f1 Score: {round(f1_score,2)}")
    print("=======\n")


def present_data(log_level):
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1_score = 0

    for i in range(0, len(confusion_mat)):
        tp, fp = confusion_mat[i][0]
        fn, tn = confusion_mat[i][1]

        accuracy_val = accuracy(tp, tn, fp, fn)
        precision_val = precision(tp, fp)
        recall_val = recall(tp, fn)
        f1_score_val = f1_score(tp, fp, fn)

        total_accuracy += accuracy_val
        total_precision += precision_val
        total_recall += recall_val
        total_f1_score += f1_score_val

        if(log_level == "emotions"):
            print_eval(
            f"Emotion: {emotions[i]}",
                accuracy_val,
                precision_val,
                recall_val,
                f1_score_val,
            )

    avg_accuracy = total_accuracy / len(confusion_mat)
    avg_precision = total_precision / len(confusion_mat)
    avg_recall = total_recall / len(confusion_mat)
    avg_f1_score = total_f1_score / len(confusion_mat)

    if(log_level=="macro"):
        print_eval("Macro Average:", avg_accuracy, avg_precision, avg_recall, avg_f1_score)

In [74]:
present_data("emotions")

Emotion: anger

accuracy: 0.88
precision: 1.0
recall: 0.88
f1 Score: 0.94

Emotion: fear

accuracy: 0.56
precision: 0.46
recall: 0.42
f1 Score: 0.44

Emotion: joy

accuracy: 0.78
precision: 1.0
recall: 0.79
f1 Score: 0.88

Emotion: sadness

accuracy: 0.65
precision: 0.95
recall: 0.64
f1 Score: 0.77

Emotion: surprise

accuracy: 0.7
precision: 0.93
recall: 0.72
f1 Score: 0.82



In [75]:
present_data("macro")

Macro Average:

accuracy: 0.71
precision: 0.87
recall: 0.69
f1 Score: 0.77

