In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

raw_data = pd.read_excel("../data/adjusted-labels-multiclass.xlsx")

raw_data.dropna(subset=['Sentence'], inplace=True)  # Get rid of anything NaN

sentences = raw_data["Sentence"]
labels = raw_data.drop(columns=["Sentence"])

In [82]:
vectorizer = TfidfVectorizer()

sentences_tfidf = vectorizer.fit_transform(sentences)

In [83]:
svm_classifier = SVC(kernel="linear")
multi_label_svm = MultiOutputClassifier(svm_classifier)

In [84]:
from sklearn.model_selection import KFold

In [91]:
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences_tfidf, labels, test_size=0.2, random_state=26)

accuracy_scores = cross_val_score(multi_label_svm, train_sentences, train_labels, cv=kf, scoring='accuracy')

print("Using ", n_folds, " Folds")
print("Mean Accuracy across the folds during training: ", accuracy_scores.mean())
# Step 6: Test the Model
y_pred = multi_label_svm.fit(train_sentences, train_labels).predict(test_sentences)

# Calculate metrics
accuracy = accuracy_score(test_labels, y_pred)
micro_precision = precision_score(test_labels, y_pred, average='micro')
macro_precision = precision_score(test_labels, y_pred, average='macro')
micro_recall = recall_score(test_labels, y_pred, average='micro')
macro_recall = recall_score(test_labels, y_pred, average='macro')

print("Micro Precision:", micro_precision)
print("Macro Precision:", macro_precision)
print("Micro Recall:", micro_recall)
print("Macro Recall:", macro_recall)

# Classification Report
print("\nClassification Report:")
print("Overall Accuracy:", accuracy)
print(classification_report(test_labels, y_pred, zero_division=0))

Using  10  Folds
Mean Accuracy across the folds during training:  0.7524289405684754
Micro Precision: 0.9712820512820513
Macro Precision: 0.9916549094180673
Micro Recall: 0.887535145267104
Macro Recall: 0.7682288371817312

Classification Report:
Overall Accuracy: 0.8014842300556586
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       447
           1       1.00      0.40      0.57        15
           2       1.00      0.62      0.77        24
           3       1.00      0.70      0.83        61
           4       0.98      0.87      0.92        54
           5       1.00      0.70      0.83        27
           6       0.98      0.85      0.91       131
           7       1.00      0.93      0.96       133
           8       1.00      0.83      0.90        63
           9       1.00      0.67      0.80        58
          10       1.00      0.87      0.93        54

   micro avg       0.97      0.89      0.93      1067
   macro avg 