# Dataset preparation
First, the dataset needs to be loaded and modified to only include the labels we want. This is done by merging each unchosen label into its most closely related chosen label. Entries whose text has multiple labels are handled by choosing the label that has the lowest representation in the dataset so far.

In [None]:
from data.dataset import EmotionsDataset

# Build the three dataset splits in one pass; the order matches the names on the left.
train_dataset, valid_dataset, test_dataset = (
    EmotionsDataset(split=split_name) for split_name in ("train", "valid", "test")
)


In [None]:
# Show the first training entry, then the size of each split (one per line).
print(train_dataset[0])
print(*map(len, (train_dataset, valid_dataset, test_dataset)), sep="\n")

# Evaluation Methods
## Confusion Matrix
A confusion matrix is used to visualise the performance of the classifier; it helps us see which labels the classifiers make the most mistakes on.
The dataset is dominated by neutral tags, which means a raw confusion matrix turns out dark for all values apart from neutral-neutral. To solve this, my confusion matrix can display the log values (log2 of each count plus one) instead.

In [None]:
from typing import Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

from data.dataset import label_names, chosen_labels

def get_label_names():
    """Return the display names of the chosen labels, followed by the name of label 27."""
    selected = [label_names[label_id] for label_id in chosen_labels]
    return selected + [label_names[27]]

def display_confusion_matrix(preds, y, scale="linear", save_name: Optional[Tuple[str, str]] = None):
    """Plot a confusion matrix as an annotated heatmap and optionally save it.

    Rows are indexed by the predicted label and columns by the true label.

    Args:
        preds: Iterable of predicted integer label indices.
        y: Iterable of true integer label indices, aligned with ``preds``.
        scale: ``"log"`` plots ``log2(count + 1)`` so that one dominant cell
            does not wash out the rest of the heatmap; any other value plots
            the raw counts. The value is also used (first letter upper-cased)
            in the figure title.
        save_name: Optional ``(directory, file_stem)`` pair. When given, the
            figure is written to ``./figures/<directory>/<file_stem>.png``.
    """
    import os

    chosen_label_names = get_label_names()
    # Size the matrix from the label list so it always matches the axis labels.
    num_labels = len(chosen_label_names)
    confusion_matrix = np.zeros((num_labels, num_labels), dtype=np.int32)

    for pred, label in zip(preds, y):
        confusion_matrix[pred, label] += 1

    if scale == "log":
        # +1 keeps empty cells at log2(1) == 0 instead of -inf.
        confusion_matrix = np.log2(confusion_matrix + 1)

    heatmap_confusion_matrix = pd.DataFrame(
        confusion_matrix, index=chosen_label_names, columns=chosen_label_names
    )

    ax = plt.axes()
    sb.heatmap(heatmap_confusion_matrix, annot=True, ax=ax)
    scale_title = scale[0].upper() + scale[1:]
    ax.set_title(f"{scale_title} Confusion Matrix")
    ax.set_xlabel("True label")
    ax.set_ylabel("Predicted label")

    if save_name is not None:
        output_dir = f"./figures/{save_name[0]}"
        # savefig does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(f"{output_dir}/{save_name[1]}.png")

    plt.show()


## Precision, Recall, F1 Score
Precision, recall and F1 score are used to keep track of the performance of each experiment on an individual label. Recall tracks how often a label is correctly guessed out of all the instances of that label, and precision tracks how often a guess of a label is correct out of all the guesses of that label. The F1 score is the harmonic mean of the two.

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def get_scores(labels, predictions, kernel_type = "linear", num_labels = 14):
    """Compute per-label precision, recall and F1, save them as CSV and return them.

    Args:
        labels: True label indices.
        predictions: Predicted label indices, aligned with ``labels``.
        kernel_type: Name used in the output path
            ``figures/<kernel_type>_kernel_metrics.csv``.
        num_labels: Number of label indices (``0 .. num_labels - 1``) to report.

    Returns:
        A DataFrame with one row per label index and the columns
        ``label``, ``precision``, ``recall`` and ``f1``.
    """
    import os

    label_ids = list(range(num_labels))
    # Passing `labels=` explicitly guarantees one score per label index, even
    # for a class that appears in neither the true labels nor the predictions;
    # otherwise the arrays can be shorter than `label_ids` and the DataFrame
    # construction below fails.
    prec, recall, f1, _support = precision_recall_fscore_support(
        labels, predictions, labels=label_ids
    )

    results = pd.DataFrame(
        {"label": label_ids, "precision": prec, "recall": recall, "f1": f1}
    )

    output_path = f"figures/{kernel_type}_kernel_metrics.csv"
    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    results.to_csv(output_path, index=False)

    return results

# Experiment 1: Different SVM kernel types
This experiment looks at the different kernel options used to build a support vector machine. The kernels covered are: linear, polynomial, radial basis function and sigmoid.
## Vectorisation using TFIDF 
The input text needs to be vectorised before it can be used for training a support vector machine. First, the text is lower-cased and tokenised; stop words and non-alphabetic tokens are then removed, and the remaining words are lemmatised.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download, pos_tag
# Fetch every NLTK resource used by the text preprocessing, in the original order.
for _resource in ('punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger', 'stopwords'):
    download(_resource)

from collections import defaultdict

def lem_text(text):
    """Lower-case, tokenise, filter and lemmatise each entry of ``text``.

    Args:
        text: Iterable of strings, one per dataset entry.

    Returns:
        A new list with one string per entry: the ``str()`` of the list of
        lemmas that are alphabetic and not English stop words.
    """
    # Build the stop-word set and the lemmatiser once, rather than querying
    # the stop-word corpus (and scanning its list) for every single token.
    stop_words = set(stopwords.words('english'))
    lemmer = WordNetLemmatizer()

    # Map the first letter of a POS tag to a WordNet POS; anything else is a noun.
    word_tags = defaultdict(lambda: wordnet.NOUN)
    word_tags['J'] = wordnet.ADJ
    word_tags['V'] = wordnet.VERB
    word_tags['R'] = wordnet.ADV

    lemmatised = []
    for entry in text:
        tokens = word_tokenize(entry.lower())
        # Tag the full token sequence before filtering so the tagger sees
        # the tokens in context.
        lemmas = [
            lemmer.lemmatize(word, word_tags[tag[0]])
            for word, tag in pos_tag(tokens)
            if word not in stop_words and word.isalpha()
        ]
        # Kept as the list's string representation to match the existing
        # output format consumed downstream.
        lemmatised.append(str(lemmas))

    return lemmatised

Once the text of all datasets has been lemmatised, the training data is used to extract the most important word tfidf features. These features are used to convert all dataset text entries into vectors.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate dataset objects for the TF-IDF experiments; their `.x` attribute is
# overwritten below, first with lemmatised text and then with TF-IDF vectors.
train_dataset_tfidf = EmotionsDataset(split="train")
test_dataset_tfidf = EmotionsDataset(split="test")

# Replace the raw text with its lemmatised form.
train_dataset_tfidf.x = lem_text(train_dataset_tfidf.x)
test_dataset_tfidf.x = lem_text(test_dataset_tfidf.x)

# Fit the vectoriser on the training text only, keeping at most 1000 features.
tfidf_vectoriser = TfidfVectorizer(max_features=1000)
tfidf_vectoriser.fit(train_dataset_tfidf.x)

# Convert both splits with the vectoriser fitted on the training split.
train_dataset_tfidf.x = tfidf_vectoriser.transform(train_dataset_tfidf.x)
test_dataset_tfidf.x = tfidf_vectoriser.transform(test_dataset_tfidf.x)

## SVM kernel results
The performance of the SVM is evaluated with accuracy, recall, precision and f1 score

### Linear Kernel

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score
# Sub-directory of ./figures that the kernel-experiment figures are saved into.
kernel_dir = "kernel_experiement"

# Train an SVM with a linear kernel on the TF-IDF training vectors and
# predict labels for the test split.
svm_linear = svm.SVC(kernel='linear')
svm_linear.fit(train_dataset_tfidf.x, train_dataset_tfidf.y)
preds = svm_linear.predict(test_dataset_tfidf.x)

# Plot and save the confusion matrix twice: raw counts, then log-scaled.
display_confusion_matrix(preds, test_dataset_tfidf.y, save_name=(kernel_dir, "confusion_matrix_no_scaling"))
display_confusion_matrix(preds, test_dataset_tfidf.y, scale="log", save_name=(kernel_dir, 'linear_matrix'))

# Test accuracy as a percentage.
acc = accuracy_score(test_dataset_tfidf.y, preds)*100

# Per-label precision/recall/F1; get_scores also writes them to a CSV file.
results = get_scores(test_dataset_tfidf.y, preds, kernel_type="linear")
print(results)
print("Testing Accuracy: ", acc)

### Polynomial Kernel

In [None]:
# Train an SVM with a polynomial kernel on the TF-IDF training vectors and
# predict labels for the test split.
svm_poly = svm.SVC(kernel='poly')
svm_poly.fit(train_dataset_tfidf.x, train_dataset_tfidf.y)
preds = svm_poly.predict(test_dataset_tfidf.x)

# Plot and save the log-scaled confusion matrix.
display_confusion_matrix(preds, test_dataset_tfidf.y, scale="log", save_name=(kernel_dir, 'poly_matrix'))

# Test accuracy as a percentage.
acc = accuracy_score(test_dataset_tfidf.y, preds)*100

# Per-label precision/recall/F1; get_scores computes them itself and also
# writes them to a CSV file, so no separate metrics call is needed here.
results = get_scores(test_dataset_tfidf.y, preds, kernel_type="poly")

print(results)
print("Testing Accuracy: ", acc)

### Sigmoid Kernel

In [None]:
# Train an SVM with a sigmoid kernel on the TF-IDF training vectors and
# predict labels for the test split.
svm_sig = svm.SVC(kernel='sigmoid')
svm_sig.fit(train_dataset_tfidf.x, train_dataset_tfidf.y)
preds = svm_sig.predict(test_dataset_tfidf.x)

# Plot and save the log-scaled confusion matrix.
display_confusion_matrix(preds, test_dataset_tfidf.y, scale="log", save_name=(kernel_dir, 'sigmoid_matrix'))

# Test accuracy as a percentage.
acc = accuracy_score(test_dataset_tfidf.y, preds)*100

# Per-label precision/recall/F1; get_scores also writes them to a CSV file.
results = get_scores(test_dataset_tfidf.y, preds, kernel_type="sigmoid")

print(results)
print("Testing Accuracy: ", acc)

### Radial Basis Function Kernel

In [None]:
# Train an SVM with a radial basis function kernel on the TF-IDF training
# vectors and predict labels for the test split.
svm_rbf = svm.SVC(kernel='rbf')
svm_rbf.fit(train_dataset_tfidf.x, train_dataset_tfidf.y)
preds = svm_rbf.predict(test_dataset_tfidf.x)

# Plot and save the log-scaled confusion matrix.
display_confusion_matrix(preds, test_dataset_tfidf.y, scale="log", save_name=(kernel_dir, 'rbf_matrix'))

# Test accuracy as a percentage.
acc = accuracy_score(test_dataset_tfidf.y, preds)*100

# Per-label precision/recall/F1; get_scores also writes them to a CSV file.
results = get_scores(test_dataset_tfidf.y, preds, kernel_type="rbf")

print(results)
print("Testing Accuracy: ", acc)

# Experiment 2: SVM Hyper-parameter tuning
I will tune the hyper-parameters of SVM while keeping the kernel the same. I will use the radial basis function kernel as it has provided the best results in experiment 1.
## Grid Search
The hyper-parameters will be searched using a grid search technique. I will select a set of discrete values for each variable that I am changing. The variables I will focus on will be: the regularisation parameter, gamma value and stopping tolerance.

In [None]:
from itertools import product

# Candidate values for the regularisation parameter (C), the RBF gamma and
# the stopping tolerance.
c_values = [0.5, 0.75, 1, 1.25, 1.5]
g_values = [0.5, 0.75, 1, 1.25, 1.5]
t_values = [1e-2, 1e-3, 1e-4]

# Sub-directory of ./figures that the grid-search figures are saved into.
grid_search_dir = 'grid_search_experiment'

# NOTE(review): the hyper-parameters are selected using the test split, so the
# reported accuracy is optimistic; consider scoring on the validation split.
for c_value, g_value, t_value in product(c_values, g_values, t_values):
    test_name = f'{c_value}_{g_value}_{t_value}'

    # Apply the hyper-parameters of this grid point to the model.
    svm_rbf = svm.SVC(kernel='rbf', C=c_value, gamma=g_value, tol=t_value)
    svm_rbf.fit(train_dataset_tfidf.x, train_dataset_tfidf.y)

    preds = svm_rbf.predict(test_dataset_tfidf.x)

    display_confusion_matrix(preds, test_dataset_tfidf.y, scale="log", save_name=(grid_search_dir, f'{test_name}_matrix'))

    # Test accuracy as a percentage.
    acc = accuracy_score(test_dataset_tfidf.y, preds)*100

    # Give each grid point its own metrics file so runs do not overwrite
    # one another (or the metrics file of the plain RBF kernel run).
    results = get_scores(test_dataset_tfidf.y, preds, kernel_type=f"rbf_{test_name}")

    print(results)
    print(f"Testing Accuracy ({test_name}): ", acc)
            