<a href="https://colab.research.google.com/github/samservo09/thesis-svm-tele-triage/blob/main/src/svm/svm-simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SVM Simulation

## Data Preparation

In [1]:
# add required libraries
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
!pip install -q nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# set random seed to reproduce the same result every time
np.random.seed(500)

In [4]:
! wget https://raw.githubusercontent.com/samservo09/thesis-svm-tele-triage/refs/heads/main/data/5k_Synthetic_Reddit_User_Data.csv

--2024-11-16 02:12:25--  https://raw.githubusercontent.com/samservo09/thesis-svm-tele-triage/refs/heads/main/data/5k_Synthetic_Reddit_User_Data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17881543 (17M) [text/plain]
Saving to: ‘5k_Synthetic_Reddit_User_Data.csv’


2024-11-16 02:12:27 (266 MB/s) - ‘5k_Synthetic_Reddit_User_Data.csv’ saved [17881543/17881543]



In [5]:
# import the corpus
Corpus = pd.read_csv("/content/5k_Synthetic_Reddit_User_Data.csv", encoding='latin-1')

In [6]:
# rename columns into small letters
Corpus.columns = Corpus.columns.str.lower()

In [7]:
Corpus.shape

(5000, 3)

## Data Preprocessing

### Tokenization & Word Stemming/Lemmatization

In [8]:
# Step - a : Remove blank rows if any.
Corpus['post'].dropna(inplace=True)

# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['post'] = [entry.lower() for entry in Corpus['post']]

# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['post']= [word_tokenize(entry) for entry in Corpus['post']]

# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['post']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

In [9]:
!pip install -q tabulate

import pandas as pd
from tabulate import tabulate

# display a few rows of the preprocessed text
print(tabulate(Corpus.head(), headers='keys', tablefmt='psql'))

+----+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Split dataset

In [10]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3, random_state=42)

### Encoding

In [11]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

### Word Vectorization

In [12]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [13]:
# see the vocabulary that it has learned from the corpus
print(Tfidf_vect.vocabulary_)



In [14]:
# vectorized data
print(Train_X_Tfidf)

  (0, 59)	0.09287161170798777
  (0, 168)	0.16180385398367234
  (0, 354)	0.17030325231689283
  (0, 706)	0.08948710569438668
  (0, 757)	0.19933645514804882
  (0, 912)	0.07729478560315822
  (0, 1213)	0.20446575659640018
  (0, 1465)	0.10959001148857897
  (0, 1642)	0.16672027685795582
  (0, 1715)	0.21572803083148448
  (0, 1786)	0.06967074695918085
  (0, 1807)	0.1288021954968585
  (0, 1863)	0.182646993662753
  (0, 1869)	0.12934293327436538
  (0, 1887)	0.06244392085939267
  (0, 1956)	0.1058676050685217
  (0, 1996)	0.08102204508246691
  (0, 2141)	0.2505644388736621
  (0, 2193)	0.05772463825738558
  (0, 2210)	0.12309942919291367
  (0, 2388)	0.14609358785757973
  (0, 2494)	0.12134758949098735
  (0, 2512)	0.10944913823489581
  (0, 2532)	0.20027174020982091
  (0, 2549)	0.060374616044504746
  :	:
  (3499, 942)	0.15714668619540115
  (3499, 1114)	0.170071285668632
  (3499, 1726)	0.14398727255443622
  (3499, 1855)	0.2831465382999318
  (3499, 1862)	0.06397994579086172
  (3499, 1993)	0.1065972178906785


## Multi-class Classification (One vs Rest)

In [15]:
!pip install -q scikit-learn

In [16]:
# import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay
from mlxtend.plotting import plot_decision_regions

In [17]:
# create an instance of the SVM model
svm = SVC(probability=True, random_state=42)

In [18]:
# make SVM an OvR classifier
ovr_classifier = OneVsRestClassifier(svm)

In [19]:
# fit the data to the OvR classifier
ovr_classifier = ovr_classifier.fit(Train_X_Tfidf, y_train)

## Evaluate Results

In [20]:
# accuracy score
from sklearn.metrics import accuracy_score

# Predict the labels for the test set
y_pred = ovr_classifier.predict(Test_X_Tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.18666666666666668


In [21]:
# precision, recall, f1 score
from sklearn.metrics import classification_report
import warnings
# Suppress the specific UndefinedMetricWarning
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Generate a classification report with precision, recall, and F1-score
print(classification_report(y_test, y_pred, target_names=Encoder.classes_))

              precision    recall  f1-score   support

     Attempt       0.20      0.19      0.19       325
    Behavior       0.19      0.19      0.19       292
    Ideation       0.19      0.20      0.19       282
   Indicator       0.18      0.16      0.17       326
  Supportive       0.17      0.20      0.18       275

    accuracy                           0.19      1500
   macro avg       0.19      0.19      0.19      1500
weighted avg       0.19      0.19      0.19      1500



In [22]:
# Generate precision, recall, f1 score with different averaging methods
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
# Suppress the specific UndefinedMetricWarning
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

precision_macro = precision_score(y_test, y_pred, average='macro')
recall_macro = recall_score(y_test, y_pred, average='macro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print(f'Macro Precision: {precision_macro}')
print(f'Macro Recall: {recall_macro}')
print(f'Macro F1-score: {f1_macro}')

precision_weighted = precision_score(y_test, y_pred, average='weighted')
recall_weighted = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print('\n')
print(f'Weighted Precision: {precision_weighted}')
print(f'Weighted Recall: {recall_weighted}')
print(f'Weighted F1-score: {f1_weighted}')

Macro Precision: 0.18694997022529486
Macro Recall: 0.1874189432267869
Macro F1-score: 0.18673341026458418


Weighted Precision: 0.18730235994144193
Weighted Recall: 0.18666666666666668
Weighted F1-score: 0.18654163122128076


In [23]:
from sklearn.metrics import roc_auc_score

# Compute ROC-AUC score using probabilities
roc_auc = roc_auc_score(y_test, ovr_classifier.predict_proba(Test_X_Tfidf), multi_class='ovr')
print(f'ROC-AUC: {roc_auc}')

ROC-AUC: 0.49967567229552057


## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear', 'rbf', 'poly'],
    'estimator__gamma': [0.1, 1, 'scale', 'auto']
}

grid = GridSearchCV(ovr_classifier, param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(Train_X_Tfidf, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=linear;, score=0.187 total time= 2.2min
[CV 2/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=linear;, score=0.211 total time= 2.1min
[CV 3/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=linear;, score=0.210 total time= 2.1min
[CV 4/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=linear;, score=0.199 total time= 2.3min
[CV 5/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=linear;, score=0.194 total time= 2.1min
[CV 1/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf;, score=0.193 total time= 2.2min
[CV 2/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf;, score=0.190 total time= 2.1min
[CV 3/5] END estimator__C=0.1, estimator__gamma=0.1, estimator__kernel=rbf;, score=0.210 total time= 2.2min
[CV 4/5] END estimator__C=0.1, estimator__gamma=0.1, estima

In [None]:
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

In [None]:
grid_predictions = grid.predict(Test_X_Tfidf)

# print classification report
print(classification_report(y_test, grid_predictions))

### ROC curve and AUC

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize the labels
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
n_classes = y_test_bin.shape[1]

# Get probabilities for baseline SVM
y_score_baseline = ovr_classifier.predict_proba(Test_X_Tfidf)

# Get probabilities for tuned SVM
y_score_tuned = grid.predict_proba(Test_X_Tfidf)

In [None]:
# Compute ROC curve and ROC area for each class
fpr_baseline = dict()
tpr_baseline = dict()
roc_auc_baseline = dict()
fpr_tuned = dict()
tpr_tuned = dict()
roc_auc_tuned = dict()

for i in range(n_classes):
    fpr_baseline[i], tpr_baseline[i], _ = roc_curve(y_test_bin[:, i], y_score_baseline[:, i])
    roc_auc_baseline[i] = auc(fpr_baseline[i], tpr_baseline[i])
    fpr_tuned[i], tpr_tuned[i], _ = roc_curve(y_test_bin[:, i], y_score_tuned[:, i])
    roc_auc_tuned[i] = auc(fpr_tuned[i], tpr_tuned[i])

In [None]:
# Plot ROC curves for baseline SVM
plt.figure()
for i in range(n_classes):
    plt.plot(fpr_baseline[i], tpr_baseline[i],
             label='Baseline SVM - Class {0} (area = {1:0.2f})'
                   ''.format(i, roc_auc_baseline[i]))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC - ROC Curve (Baseline SVM)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Plot ROC curves for tuned SVM
plt.figure()
for i in range(n_classes):
    plt.plot(fpr_tuned[i], tpr_tuned[i],
             label='Tuned SVM - Class {0} (area = {1:0.2f})'
                   ''.format(i, roc_auc_tuned[i]))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC - ROC Curve (Tuned SVM)')
plt.legend(loc="lower right")
plt.show()

### Confusion Matrix

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=Encoder.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Generate confusion matrix for tuned SVM
cm_tuned = confusion_matrix(y_test, grid_predictions)

# Display confusion matrix
disp_tuned = ConfusionMatrixDisplay(confusion_matrix=cm_tuned, display_labels=Encoder.classes_)
disp_tuned.plot(cmap=plt.cm.Blues)
plt.show()