# Supervised baseline: TubeSpam

## Preamble

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier # Linear SVM.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

## Read data

In [2]:
# Read data files.
df_train = pd.read_csv("../data/TubeSpam/TubeSpam_train.csv")
df_val = pd.read_csv("../data/TubeSpam/TubeSpam_val.csv")
df_test = pd.read_csv("../data/TubeSpam/TubeSpam_test.csv")

# Explore data.
print(df_train.info())
display(df_train.head())

print(df_val.info())
display(df_val.head())

print(df_test.info())
display(df_test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407 entries, 0 to 1406
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Comment ID  1407 non-null   object
 1   Author      1407 non-null   object
 2   Date        1229 non-null   object
 3   Text        1407 non-null   object
 4   Label       1407 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 55.1+ KB
None


Unnamed: 0,Comment ID,Author,Date,Text,Label
0,z12afdtaypuat3t5s04ccv4g4xe1elvbcdw,Lindsay Wofford,2014-11-07T13:12:45,What is he saying?!?!?!?!?!?!?!?$? ﻿,0
1,z130drwgasjgevh0n234ht3gfrmugfguz,Guren - MKII,2014-10-01T19:53:07.387000,everyday i&#39;m subscribe﻿,1
2,z13fwnbh5qusx1olr23bcfgjbxiljjv4u04,Flynn Rider,2014-08-20T19:04:01,Watch Maroon 5's latest 2nd single from V (It ...,1
3,z12qzzbpbzn4c5zwu04cir5gmzfozhy4cjk,lebanonwarior1,2014-11-06T18:07:21,Song name??﻿,0
4,z12cvbyjjmuwvxivx223tpujulrwwdt5j04,ampai gmuer,2015-01-27T13:23:56.061000,Check out this playlist on YouTube:👿👳👳👳👳👳﻿,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Comment ID  157 non-null    object
 1   Author      157 non-null    object
 2   Date        138 non-null    object
 3   Text        157 non-null    object
 4   Label       157 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 6.3+ KB
None


Unnamed: 0,Comment ID,Author,Date,Text,Label
0,z13fehihaneyzh3wp04cfxrwkyfkwvgoex00k,pratik patel,2015-05-23T05:28:49.504000,"mindblowing dance.,.,.superbbb song﻿",0
1,LneaDw26bFt7AvG8x15igOZvrWApak5Zh5-PPag9YTs,themagicmangotree,,Check out my channel for funny skits! Thanks!,1
2,z13vc32olxm2znkjz04cd35gnozjebqhlig,TheLegitBroz,2014-11-08T00:26:35,The Funny Thing Is That this song was made in ...,0
3,z13ohniidsqhcjs1k22ti544qybadb25b04,reesekupp24,2015-05-14T14:10:58.987000,Check out this video on YouTube:﻿,1
4,z124inzqgoyeh33uw23iibficv2kuf2nx,anthony Jennings,2014-11-07T23:26:04,"People Who Say That ""This Song Is Too Old Now,...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Comment ID  392 non-null    object
 1   Author      392 non-null    object
 2   Date        344 non-null    object
 3   Text        392 non-null    object
 4   Label       392 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 15.4+ KB
None


Unnamed: 0,Comment ID,Author,Date,Text,Label
0,z13mxjrbuyv5slha204cjdwg0wauylyap3w,Peter Wilkes,2014-09-15T15:48:57,"really want this video to get 1 billion views,...",0
1,z120uhiw0ubhcnxvi23uw5oorpuncvso204,tombraiderxXx12,2014-09-07T18:38:10,I'm not a big fan of the song but this video i...,0
2,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
3,z13wtr2yssezhdaqa04cjvbody3qwjhpwk00k,Kirill Nazarethian,2014-11-07T22:59:37,2:05. Hahahahah ﻿,0
4,z12wuz2qqnawe50js04cejpzosrzdr0r1k40k,Dana Matich,2014-11-08T03:32:55,Hey guys! Check this out: Kollektivet - Don't ...,1


## Process data

In [3]:
# Build dictionary of features and transform documents to feature vectors.
# Value of a word in the vocabulary is its frequency in the whole training corpus.
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train["Text"])
print(X_train_counts.shape)
print(X_train_counts)

# Regularize via “Term Frequency times Inverse Document Frequency.”
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1407, 3604)
  (0, 3450)	1
  (0, 1790)	1
  (0, 1596)	1
  (0, 2770)	1
  (1, 1232)	1
  (1, 170)	1
  (1, 3041)	1
  (2, 3418)	2
  (2, 2096)	1
  (2, 1926)	1
  (2, 148)	1
  (2, 2879)	1
  (2, 1415)	1
  (2, 1795)	1
  (2, 3413)	1
  (2, 416)	1
  (2, 3564)	1
  (2, 3538)	1
  (2, 3574)	1
  (2, 845)	1
  (2, 3233)	1
  (3, 2935)	1
  (3, 2248)	1
  (4, 3574)	1
  (4, 781)	1
  :	:
  (1404, 2594)	1
  (1405, 781)	1
  (1405, 2388)	1
  (1405, 1618)	1
  (1405, 1538)	1
  (1405, 2231)	2
  (1405, 1501)	1
  (1405, 2086)	1
  (1405, 2708)	1
  (1405, 1292)	1
  (1405, 3157)	1
  (1406, 1790)	1
  (1406, 1795)	1
  (1406, 781)	1
  (1406, 2354)	2
  (1406, 3156)	1
  (1406, 3206)	1
  (1406, 3364)	1
  (1406, 489)	1
  (1406, 719)	1
  (1406, 1845)	1
  (1406, 3407)	1
  (1406, 2939)	1
  (1406, 2235)	1
  (1406, 3265)	1


(1407, 3604)

In [4]:
%%capture
'''
# Regularize via “Term Frequency.”
tf_transformer = TfidfTransformer(use_idf = False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf.shape)
print(X_train_tf)
'''

In [5]:
# Call transform only, as featurizers have already been fit to training data.
X_test_counts = count_vect.transform(df_test["Text"])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_tfidf.shape)
print(X_test_tfidf)

(392, 3604)
  (0, 3519)	0.3229561514173239
  (0, 3406)	0.3548912774734669
  (0, 3364)	0.2829763333881671
  (0, 3353)	0.20597503446501694
  (0, 3206)	0.1908626931503863
  (0, 3179)	0.15624298231280379
  (0, 2634)	0.3333573581967677
  (0, 1462)	0.3097042084914961
  (0, 615)	0.3391752187524297
  (0, 564)	0.2887941939438291
  (0, 420)	0.4253516283064763
  (1, 3353)	0.20833860122925563
  (1, 3179)	0.1580358729954015
  (1, 3156)	0.18694398976645737
  (1, 2935)	0.21848310744124433
  (1, 2330)	0.23863955623869001
  (1, 2306)	0.3283142492164623
  (1, 1790)	0.21961709151570477
  (1, 1287)	0.47479474602022353
  (1, 703)	0.29523703130167506
  (1, 608)	0.44498555545242796
  (1, 527)	0.354060609994544
  (2, 3413)	0.22208941066041948
  (2, 3271)	0.31437930814694454
  (2, 3179)	0.09845981137332324
  :	:
  (389, 2313)	0.18201680397553371
  (389, 2292)	0.2316140887542192
  (389, 2267)	0.15723683729759952
  (389, 2239)	0.09852322831959157
  (389, 1845)	0.1377819716011469
  (389, 1678)	0.14598042720773824

In [6]:
# Extract labels.
y_train = df_train["Label"]
y_test = df_test["Label"]

print(len(y_train))
print(y_train.value_counts(normalize = True))
print(y_train.head())

print(len(y_test))
print(y_test.value_counts(normalize = True))
print(y_test.head())

1407
1    0.513859
0    0.486141
Name: Label, dtype: float64
0    0
1    1
2    1
3    0
4    1
Name: Label, dtype: int64
392
1    0.512755
0    0.487245
Name: Label, dtype: float64
0    0
1    0
2    0
3    0
4    1
Name: Label, dtype: int64


## Train

In [7]:
# Train a multinomial Naive Bayes model.
mnb = MultinomialNB().fit(X_train_tfidf, y_train)

In [8]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-3, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm = svm.fit(X_train_tfidf, y_train)

## Test

In [9]:
# Predict on test set.
y_mnb = mnb.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- NAIVE BAYES ---\n")
print(metrics.classification_report(y_test, y_mnb))

# MNB performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_mnb = metrics.confusion_matrix(y_test, y_mnb)
acc_mnb = metrics.accuracy_score(y_test, y_mnb)
f1_mnb = metrics.f1_score(y_test, y_mnb, zero_division = 0)
precision_mnb = metrics.precision_score(y_test, y_mnb, zero_division = 0)
recall_mnb = metrics.recall_score(y_test, y_mnb, zero_division = 0)
roc_mnb = metrics.roc_auc_score(y_test, y_mnb)
        
print("\n--- NAIVE BAYES ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_mnb.ravel())
print("F1             =", f1_mnb)
print("Accuracy       =", acc_mnb)
print("Precision      =", precision_mnb)
print("Recall         =", recall_mnb)
print("ROC AUC        =", roc_mnb)
print("---------------------------------------------\n")


--- NAIVE BAYES ---

              precision    recall  f1-score   support

           0       0.95      0.88      0.92       191
           1       0.89      0.96      0.93       201

    accuracy                           0.92       392
   macro avg       0.92      0.92      0.92       392
weighted avg       0.92      0.92      0.92       392


--- NAIVE BAYES ---


---------------------------------------------
tn, fp, fn, tp = [168  23   8 193]
F1             = 0.9256594724220624
Accuracy       = 0.9209183673469388
Precision      = 0.8935185185185185
Recall         = 0.9601990049751243
ROC AUC        = 0.9198900784037926
---------------------------------------------



In [10]:
# Predict on test set.
y_svm = svm.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm = metrics.confusion_matrix(y_test, y_svm)
acc_svm = metrics.accuracy_score(y_test, y_svm)
f1_svm = metrics.f1_score(y_test, y_svm, zero_division = 0)
precision_svm = metrics.precision_score(y_test, y_svm, zero_division = 0)
recall_svm = metrics.recall_score(y_test, y_svm, zero_division = 0)
roc_svm = metrics.roc_auc_score(y_test, y_svm)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm.ravel())
print("F1             =", f1_svm)
print("Accuracy       =", acc_svm)
print("Precision      =", precision_svm)
print("Recall         =", recall_svm)
print("ROC AUC        =", roc_svm)
print("---------------------------------------------\n")


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       191
           1       0.95      0.94      0.95       201

    accuracy                           0.95       392
   macro avg       0.95      0.95      0.95       392
weighted avg       0.95      0.95      0.95       392


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [182   9  12 189]
F1             = 0.9473684210526316
Accuracy       = 0.9464285714285714
Precision      = 0.9545454545454546
Recall         = 0.9402985074626866
ROC AUC        = 0.9465890443072595
---------------------------------------------



## Hyperparameter tuning

In [11]:
# Init search space for SVM.
parameters = {'alpha': [0.0, 0.01, 0.1, 0.5, 1.0, 10.0]}

# Tune SVM with 5-fold cross-validation.
mnb_tuned = GridSearchCV(MultinomialNB(), 
                         parameters,
                         cv = 5, 
                         n_jobs = -1)

# Fit tuned model.
mnb_tuned = mnb_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_mnb_tuned = mnb_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- TUNED NAIVE BAYES ---\n")
print(metrics.classification_report(y_test, y_mnb_tuned))

# MNB tuned performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_mnb_tuned = metrics.confusion_matrix(y_test, y_mnb_tuned)
acc_mnb_tuned = metrics.accuracy_score(y_test, y_mnb_tuned)
f1_mnb_tuned = metrics.f1_score(y_test, y_mnb_tuned, zero_division = 0)
precision_mnb_tuned = metrics.precision_score(y_test, y_mnb_tuned, zero_division = 0)
recall_mnb_tuned = metrics.recall_score(y_test, y_mnb_tuned, zero_division = 0)
roc_mnb_tuned = metrics.roc_auc_score(y_test, y_mnb_tuned)
        
print("\n--- TUNED NAIVE BAYES ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_mnb_tuned.ravel())
print("F1             =", f1_mnb_tuned)
print("Accuracy       =", acc_mnb_tuned)
print("Precision      =", precision_mnb_tuned)
print("Recall         =", recall_mnb_tuned)
print("ROC AUC        =", roc_mnb_tuned)
print("---------------------------------------------\n")


--- TUNED NAIVE BAYES ---

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       191
           1       0.89      0.97      0.93       201

    accuracy                           0.92       392
   macro avg       0.93      0.92      0.92       392
weighted avg       0.93      0.92      0.92       392


--- TUNED NAIVE BAYES ---


---------------------------------------------
tn, fp, fn, tp = [168  23   7 194]
F1             = 0.9282296650717703
Accuracy       = 0.923469387755102
Precision      = 0.8940092165898618
Recall         = 0.9651741293532339
ROC AUC        = 0.9223776405928474
---------------------------------------------



In [12]:
'''
# Init search space for SVM.
parameters = {'clf__alpha': (1e-1, 1e-2, 1e-3)}

# Tune SVM with 5-fold cross-validation.
svm_tuned = SGDClassifier(loss = "hinge", 
                          penalty = "l2",
                          random_state = 42,
                          max_iter = 5, 
                          tol = None)
svm_tuned = GridSearchCV(svm_tuned, 
                         parameters,
                         cv = 5, 
                         n_jobs = -1)

# Fit tuned model.
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- TUNED LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))
'''

'\n# Init search space for SVM.\nparameters = {\'clf__alpha\': (1e-1, 1e-2, 1e-3)}\n\n# Tune SVM with 5-fold cross-validation.\nsvm_tuned = SGDClassifier(loss = "hinge", \n                          penalty = "l2",\n                          random_state = 42,\n                          max_iter = 5, \n                          tol = None)\nsvm_tuned = GridSearchCV(svm_tuned, \n                         parameters,\n                         cv = 5, \n                         n_jobs = -1)\n\n# Fit tuned model.\nsvm_tuned = svm_tuned.fit(X_train_tfidf, y_train)\n\n# Predict on test set.\ny_svm_tuned = svm_tuned.predict(X_test_tfidf)\n\n# Evaluate performance.\nprint("\n--- TUNED LINEAR SUPPORT VECTOR MACHINE ---\n")\nprint(metrics.classification_report(y_test, y_svm_tuned))\n'

In [13]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-2, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       191
           1       0.95      0.91      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93      0.93       392
weighted avg       0.93      0.93      0.93       392


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [182   9  18 183]
F1             = 0.931297709923664
Accuracy       = 0.9311224489795918
Precision      = 0.953125
Recall         = 0.9104477611940298
ROC AUC        = 0.9316636711729313
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       191
           1       0.95      0.91      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93      0.93  

In [14]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-1, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.95      0.59      0.72       191
           1       0.71      0.97      0.82       201

    accuracy                           0.78       392
   macro avg       0.83      0.78      0.77       392
weighted avg       0.83      0.78      0.77       392


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [112  79   6 195]
F1             = 0.8210526315789474
Accuracy       = 0.7831632653061225
Precision      = 0.7116788321167883
Recall         = 0.9701492537313433
ROC AUC        = 0.7782683441431586
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.95      0.59      0.72       191
           1       0.71      0.97      0.82       201

    accuracy                           0.78       392
   macro avg       0.83      0.78 

In [15]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-4, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       191
           1       0.92      0.95      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93      0.93       392
weighted avg       0.93      0.93      0.93       392


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [174  17  11 190]
F1             = 0.9313725490196079
Accuracy       = 0.9285714285714286
Precision      = 0.9178743961352657
Recall         = 0.945273631840796
ROC AUC        = 0.9281341981193508
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       191
           1       0.92      0.95      0.93       201

    accuracy                           0.93       392
   macro avg       0.93      0.93  

## Optimal model

In [16]:
# BEST PERFORMING MODEL = linear SVM with below hyperparameters.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_opt = SGDClassifier(loss = "hinge", 
                        penalty = "l2",
                        alpha = 1e-3, 
                        random_state = 42,
                        max_iter = 5, 
                        tol = None)

