### Support Vector Machines
Using a support vector machine to classify data based on the speech act

In [73]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

Generating the sentences and labels from the Excel sheet

In [74]:
raw_data = pd.read_csv("data/labeled_sentences.csv")

sentences = raw_data["Sentence"] 
labels = raw_data["Label"]

Separating the data into training and test data

In [75]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2)

## Preprocessing
Vectorising based on the Tf-idf values in the data set

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

Selecting the Linear Support Vector Classification model

In [77]:
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')

In [78]:
classifier_linear.fit(train_vectors, y_train)

Evaluating Results

In [79]:
prediction_linear = classifier_linear.predict(test_vectors)

In [80]:
# results
report = classification_report(y_test, prediction_linear)
print(f"Accuracy: {accuracy_score(y_test, prediction_linear):.2f}")
print(report)

Accuracy: 0.35
                         precision    recall  f1-score   support

           Action words       0.36      0.20      0.26        60
              Buildings       0.26      0.24      0.25        76
         Communications       0.89      0.93      0.91        90
             Directions       0.39      0.23      0.29        30
             Fire words       0.42      0.39      0.41       165
      Hills and Forests       0.27      0.29      0.28        42
Intel (from newspapers)       0.06      0.12      0.08        17
        Named Locations       0.19      0.24      0.21       144
        Reasoning words       0.18      0.19      0.18        43
           Rescue words       0.25      0.29      0.27        66
                  Woods       0.30      0.27      0.29        62

               accuracy                           0.35       795
              macro avg       0.33      0.31      0.31       795
           weighted avg       0.36      0.35      0.35       795
