### Support Vector Machines
Using a support vector machine to classify data based on the speech act

In [66]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer

Generating the sentences and labels from the Excel sheet

In [67]:
# from preprocessing import get_sentences_labels

# file_path = "data/divided_data.xlsx"

# sentences, labels = get_sentences_labels(file_path)

In [68]:
from label_regex import label_csv

# Label the transcript, then split the resulting (sentence, label) pairs
# into two parallel lists.
labeled_sentences = label_csv("data/MainExpTranscriptFullSMAQPER.csv")

# Materialise once so the pairs are only iterated a single time.
pairs = list(labeled_sentences)
sentences = [text for text, _ in pairs]
labels = [tag for _, tag in pairs]

Data has been written to data/labeled_sentences.csv


## Preprocessing
Vectorising the data

In [69]:
# Keep the raw labels under `y`; they are encoded in a later cell.
y = labels

# Bag-of-words features: learn the vocabulary and build the count matrix in one pass.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

Encode the labels

In [70]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it to the same labels.
label_encoder = LabelEncoder().fit(labels)
y_encoded = label_encoder.transform(labels)

Convert to tensors

In [71]:
# Densify the count matrix and cast both features and encoded labels to float32 tensors.
# NOTE: this cell rebinds `X`, so it cannot be re-run without re-running the vectorising cell.
X = torch.from_numpy(X.toarray()).float()
y = torch.from_numpy(y_encoded).float()

Defining the SVM model using PyTorch

In [72]:
class SVM(nn.Module):
    def __init__(self):
        super(SVM, self).__init__()
        self.linear = nn.Linear(X.shape[1], 1)

    def forward(self, x):
        return self.linear(x)

Define the SVM model

In [73]:
model = SVM()

Define loss function and optimiser

In [74]:
criterion = nn.HingeEmbeddingLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

Train the model

In [75]:
epochs = 300
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs.squeeze(), y)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

Epoch [10/300], Loss: 0.4341365694999695
Epoch [20/300], Loss: 0.3295084536075592
Epoch [30/300], Loss: 0.2248803824186325
Epoch [40/300], Loss: 0.12025228142738342
Epoch [50/300], Loss: 0.01562420278787613
Epoch [60/300], Loss: -0.08900386095046997
Epoch [70/300], Loss: -0.19363200664520264
Epoch [80/300], Loss: -0.2982602119445801
Epoch [90/300], Loss: -0.40288838744163513
Epoch [100/300], Loss: -0.5075165629386902
Epoch [110/300], Loss: -0.6121448278427124
Epoch [120/300], Loss: -0.716773271560669
Epoch [130/300], Loss: -0.8214015364646912
Epoch [140/300], Loss: -0.9260299205780029
Epoch [150/300], Loss: -1.03065824508667
Epoch [160/300], Loss: -1.135286569595337
Epoch [170/300], Loss: -1.239914894104004
Epoch [180/300], Loss: -1.344543218612671
Epoch [190/300], Loss: -1.449171543121338
Epoch [200/300], Loss: -1.5537998676300049
Epoch [210/300], Loss: -1.6584281921386719
Epoch [220/300], Loss: -1.7630561590194702
Epoch [230/300], Loss: -1.867684245109558
Epoch [240/300], Loss: -1.97

## Evaluation

In [76]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert model predictions to labels
def predictions_to_labels(predictions):
    return torch.round(torch.sigmoid(predictions)).detach().numpy()

# Make predictions
with torch.no_grad():
    predicted_labels = predictions_to_labels(model(X))

# Convert ground truth labels to numpy array
true_labels = y.detach().numpy()

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Accuracy: 0.06352992194275803
Precision: 0.009075703134679718
Recall: 0.14285714285714285
F1-score: 0.01706713266346294


In [77]:
# from sklearn.metrics import classification_report

# print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Decoding and evaluating with classification report

In [78]:
def decode_labels(encoded_labels):
    """Map encoded class indices back to their original string labels.

    Args:
        encoded_labels: Class indices as a tensor, NumPy array or sequence.
            Floating-point inputs are accepted as long as they hold
            whole-number values; they are truncated to integers.

    Returns:
        Array of original labels as produced by ``label_encoder.inverse_transform``.
    """
    # inverse_transform indexes `classes_` with the given values, so they must be
    # integers; normalise tensors / float arrays to a flat int64 NumPy array first.
    indices = torch.as_tensor(encoded_labels).detach().cpu().long().reshape(-1).numpy()
    return label_encoder.inverse_transform(indices)