In [1]:
import pandas as pd
import numpy as np
import os
import joblib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

## Read Data

In [3]:
def read_data(filename, type = "csv"):
    if(type == "csv"):
        data = pd.read_csv(filename)  
        data_df = pd.DataFrame(data)   
        return data_df
    
    elif (type == "excel"):
        data = pd.read_excel(filename)
        data_df = pd.DataFrame(data)
        return data_df

In [6]:
datapath = os.path.join("Data", "track-a.csv")
track_a = read_data(datapath)

## Train Test Split

In [7]:
x = track_a.iloc[:, 1]
y = track_a.iloc[:, 2:]
test_size = 0.20

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state= 42)

## Training Model

In [9]:
def multinomialNB_classifier(X_train, y_train): 
    model = make_pipeline(
        TfidfVectorizer(),
        OneVsRestClassifier(MultinomialNB())
    )
    model.fit(X_train, y_train)
    return model

In [10]:
model_NB = multinomialNB_classifier(X_train, y_train)

In [11]:
joblib.dump(model_NB, "savedModel/nbmodel.joblib")

['savedModel/nbmodel.joblib']

In [12]:
y_pred = model_NB.predict(X_test)

In [13]:
def calculate_results(y_test, y_pred) : 
    result_dict = {}
    result_dict["accuracy"] = accuracy_score(y_test, y_pred)

    result_dict[ "f1_micro" ] = f1_score(y_test, y_pred, average='micro')
    result_dict ["f1_macro"] = f1_score(y_test, y_pred, average='macro')

    result_dict ["report"] = classification_report(y_test, y_pred, target_names=["anger","fear","joy","sadness","surprise"], zero_division = 0)

    return result_dict

In [14]:
def accuracy_per_label(y_test, y_pred):
    accuracies = []
    labels = ["anger", "fear", "joy", "sadness", "surprise"] 
    for i in range(5):
        accuracies.append( accuracy_score(y_test.iloc[:,i], y_pred[:,i]))
        
        print(F"Accuracy {labels[i]}: {accuracies[i] }")
    return accuracies

In [15]:
results = calculate_results(y_test, y_pred)

In [16]:
accuracies = accuracy_per_label(y_test, y_pred)

Accuracy anger: 0.8700361010830325
Accuracy fear: 0.6389891696750902
Accuracy joy: 0.7942238267148014
Accuracy sadness: 0.7003610108303249
Accuracy surprise: 0.6823104693140795


In [17]:
accuracy = accuracy_score(y_test,y_pred)
print(accuracy)
print( np.mean(accuracies))

0.1624548736462094
0.7371841155234657


In [18]:
print(results ["f1_micro"])
print(results["f1_macro"])

0.4716981132075472
0.1712754540588825


In [19]:
print(results["report"])

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        72
        fear       0.63      0.96      0.76       330
         joy       1.00      0.01      0.02       115
     sadness       0.57      0.02      0.05       167
    surprise       1.00      0.02      0.03       179

   micro avg       0.63      0.38      0.47       863
   macro avg       0.64      0.20      0.17       863
weighted avg       0.69      0.38      0.31       863
 samples avg       0.57      0.35      0.41       863



## Trying Logistic Regression

In [20]:
clf = MultiOutputClassifier(LogisticRegression(solver='liblinear'))
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
clf.fit(X_train_tfidf, y_train)
y_pred_lr = clf.predict(X_test_tfidf)

In [21]:

accuracies_lr = accuracy_per_label(y_test, y_pred_lr)

Accuracy anger: 0.8700361010830325
Accuracy fear: 0.644404332129964
Accuracy joy: 0.8014440433212996
Accuracy sadness: 0.723826714801444
Accuracy surprise: 0.7346570397111913


In [22]:
accuracy_score(y_test,y_pred_lr)
print(accuracy)
print( np.mean(accuracies_lr))

0.1624548736462094
0.7548736462093864


In [23]:
results_lr = calculate_results(y_test,y_pred_lr)
print(results_lr["f1_micro"])
print(results_lr["f1_macro"])


0.5061818181818182
0.2894000023683524
