# Supervised baseline: TubeSpam

## Preamble

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier # Linear SVM.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

## Read data

In [2]:
# Read data files.
df_train = pd.read_csv("../../data/Spouse/spouse_train.csv")
df_val = pd.read_csv("../../data/Spouse/spouse_val.csv")
df_test = pd.read_csv("../../data/Spouse/spouse_test.csv")

# Explore data.
print(df_train.info())
display(df_train.head())

print(df_val.info())
display(df_val.head())

print(df_test.info())
display(df_test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3858 entries, 0 to 3857
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   person1_word_idx      3858 non-null   object
 1   person2_word_idx      3858 non-null   object
 2   sentence              3858 non-null   object
 3   tokens                3858 non-null   object
 4   person1_right_tokens  3858 non-null   object
 5   person2_right_tokens  3858 non-null   object
 6   between_tokens        3858 non-null   object
 7   Label                 3858 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 241.2+ KB
None


Unnamed: 0,person1_word_idx,person2_word_idx,sentence,tokens,person1_right_tokens,person2_right_tokens,between_tokens,Label
0,"(1, 2)","(7, 8)","With Dellen Millard, 30, and Mark Smich, 27, i...","['With', 'Dellen', 'Millard', ',', '30', ',', ...","[',', '30', ',', 'and']","[',', '27', ',', 'in']","[',', '30', ',', 'and']",0
1,"(0, 0)","(9, 10)","John is in Ukraine, where he met President Pet...","['John', 'is', 'in', 'Ukraine', ',', 'where', ...","['is', 'in', 'Ukraine', ',']","['and', 'called', 'for', 'the']","['is', 'in', 'Ukraine', ',', 'where', 'he', 'm...",0
2,"(5, 5)","(68, 68)",1 Serena Williams considers sister Venus 'the ...,"['1', 'Serena', 'Williams', 'considers', 'sist...","['', 'the', 'best', 'player']","['.', '', '*']","['', 'the', 'best', 'player', 'in', 'the', 'to...",0
3,"(12, 13)","(15, 16)","Her rules: Chrissy Teigen, right, posed next t...","['Her', 'rules', ':', 'Chrissy', 'Teigen', ','...","['and', 'Ashley', 'Tisdale', ',']","[',', 'left', ',', 'at']",['and'],0
4,"(44, 44)","(75, 75)",Devoted mum: Katie goes on to insist that she ...,"['Devoted', 'mum', ':', 'Katie', 'goes', 'on',...","['says', 'her', 'hectic', 'schedule']","['quipped', 'back', ':', '']","['says', 'her', 'hectic', 'schedule', 'and', '...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   person1_word_idx      553 non-null    object
 1   person2_word_idx      553 non-null    object
 2   sentence              553 non-null    object
 3   tokens                553 non-null    object
 4   person1_right_tokens  553 non-null    object
 5   person2_right_tokens  553 non-null    object
 6   between_tokens        553 non-null    object
 7   Label                 553 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 34.7+ KB
None


Unnamed: 0,person1_word_idx,person2_word_idx,sentence,tokens,person1_right_tokens,person2_right_tokens,between_tokens,Label
0,"(0, 0)","(14, 14)",Zac said of the Dawson's Creek alum: 'When I f...,"['Zac', 'said', 'of', 'the', 'Dawson', 's', 'C...","['said', 'of', 'the', 'Dawson']","['and', 'her', 'rapport', 'on']","['said', 'of', 'the', 'Dawson', 's', 'Creek', ...",0
1,"(43, 44)","(55, 56)",Fast moving couple: Morena Baccarin's estrange...,"['Fast', 'moving', 'couple', ':', 'Morena', 'B...","['', 'At', 'the', 'centre']","[',', 'pictured', 'in', 'New']","['', 'At', 'the', 'centre', ':', 'Her', 'Gotha...",0
2,"(1, 1)","(12, 12)","Meanwhile Marie and her partner Emi, as well a...","['Meanwhile', 'Marie', 'and', 'her', 'partner'...","['and', 'her', 'partner', 'Emi']","['and', 'Vanessa', 'from', 'Sydney']","['and', 'her', 'partner', 'Emi', ',', 'as', 'w...",0
3,"(0, 0)","(10, 11)","Judi and Wayne Richardson (ME), parents of Dar...","['Judi', 'and', 'Wayne', 'Richardson', '(', 'M...","['and', 'Wayne', 'Richardson', '(']","[',', 'who', 'was', 'shot']","['and', 'Wayne', 'Richardson', '(', 'ME', ')',...",0
4,"(20, 20)","(58, 58)",Expecting: The Hotplate star Marie Yokoyama is...,"['Expecting', ':', 'The', 'Hotplate', 'star', ...","[',', 'pictured', 'with', 'her']","[',', 'who', 'is', 'now']","[',', 'pictured', 'with', 'her', 'work', 'part...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1101 entries, 0 to 1100
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   person1_word_idx      1101 non-null   object
 1   person2_word_idx      1101 non-null   object
 2   sentence              1101 non-null   object
 3   tokens                1101 non-null   object
 4   person1_right_tokens  1101 non-null   object
 5   person2_right_tokens  1101 non-null   object
 6   between_tokens        1101 non-null   object
 7   Label                 1101 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 68.9+ KB
None


Unnamed: 0,person1_word_idx,person2_word_idx,sentence,tokens,person1_right_tokens,person2_right_tokens,between_tokens,Label
0,"(2, 3)","(6, 7)",Coordinated: Sam Burgess and fiancé Phoebe Hoo...,"['Coordinated', ':', 'Sam', 'Burgess', 'and', ...","['and', 'fiancé', 'Phoebe', 'Hooke']","[',', 'who', 'have', 'now']","['and', 'fiancé']",1
1,"(1, 1)","(5, 5)","After Betty's death, Robert filed forms to the...","['After', 'Betty', 's', 'death', ',', 'Robert'...","['s', 'death', ',', 'Robert']","['filed', 'forms', 'to', 'the']","['s', 'death', ',']",0
2,"(23, 24)","(43, 47)","Ceawlin Thynn, 41, said the row resulted in hi...","['Ceawlin', 'Thynn', ',', '41', ',', 'said', '...","[',', '29', '(', 'pictured']","['pictured', 'on', 'the', 'grounds']","[',', '29', '(', 'pictured', 'above', ')', ','...",0
3,"(9, 11)","(24, 24)",Source: News unlimited - 1 day ago Sheena Bor...,"['Source', ':', 'News', 'unlimited', '-', '1',...","['case', ':', '14', 'days']","[',', 'Sept', '07', '(']","['case', ':', '14', 'days', 'judicial', 'custo...",0
4,"(1, 4)","(6, 6)",' Loved-up: Sofía and Joe Manganiello are...,"['', '', 'Loved', '-', 'up', ':', 'Sofía', 'an...","[':', 'Sofía', 'and', 'Joe']","['and', 'Joe', 'Manganiello', '']",[':'],0


## Process data

In [3]:
# Build dictionary of features and transform documents to feature vectors.
# Value of a word in the vocabulary is its frequency in the whole training corpus.
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train["sentence"])
print(X_train_counts.shape)
print(X_train_counts)

# Regularize via “Term Frequency times Inverse Document Frequency.”
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3858, 10979)
  (0, 10790)	1
  (0, 2834)	1
  (0, 6490)	1
  (0, 154)	1
  (0, 646)	1
  (0, 6229)	1
  (0, 9164)	1
  (0, 143)	1
  (0, 5053)	2
  (0, 9910)	3
  (0, 7817)	1
  (0, 1463)	1
  (0, 3982)	1
  (0, 7753)	1
  (0, 7836)	1
  (0, 482)	1
  (0, 7033)	1
  (0, 9918)	1
  (0, 3891)	1
  (0, 2810)	1
  (0, 6691)	1
  (0, 10169)	1
  (0, 5492)	1
  (0, 5322)	1
  (0, 10246)	1
  :	:
  (3857, 279)	1
  (3857, 4802)	1
  (3857, 9938)	1
  (3857, 4647)	1
  (3857, 10043)	1
  (3857, 6665)	2
  (3857, 9948)	1
  (3857, 10673)	1
  (3857, 3968)	1
  (3857, 6207)	1
  (3857, 3813)	1
  (3857, 6246)	1
  (3857, 6668)	1
  (3857, 7750)	1
  (3857, 117)	1
  (3857, 9923)	1
  (3857, 7587)	1
  (3857, 7372)	2
  (3857, 5453)	1
  (3857, 2718)	2
  (3857, 3234)	1
  (3857, 3238)	1
  (3857, 7160)	1
  (3857, 9873)	1
  (3857, 6468)	1


(3858, 10979)

In [4]:
# Call transform only, as featurizers have already been fit to training data.
X_test_counts = count_vect.transform(df_test["sentence"])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_tfidf.shape)
print(X_test_tfidf)

(1101, 10979)
  (0, 10828)	0.1026099438506754
  (0, 10726)	0.05070541563899101
  (0, 10722)	0.10135966630424763
  (0, 10063)	0.09265420721337225
  (0, 10054)	0.13545109451982273
  (0, 10034)	0.1240751959076648
  (0, 9910)	0.14892683727632133
  (0, 9618)	0.11647145069527888
  (0, 9565)	0.1574210630720089
  (0, 9476)	0.15295865458125757
  (0, 9397)	0.07326144597667783
  (0, 9346)	0.1493126013644262
  (0, 9332)	0.11293320327690753
  (0, 8997)	0.14929294245214494
  (0, 8992)	0.1435595562274054
  (0, 8641)	0.1323684004492796
  (0, 8528)	0.14120413965684353
  (0, 8181)	0.3150171585632369
  (0, 7538)	0.09606636387867862
  (0, 7516)	0.1574210630720089
  (0, 7035)	0.0799011815717665
  (0, 7033)	0.0330519940442177
  (0, 6961)	0.08279480917255869
  (0, 6652)	0.11850689360467614
  (0, 6286)	0.1323684004492796
  :	:
  (1099, 1307)	0.1635262627846521
  (1099, 1040)	0.14761029778948762
  (1099, 879)	0.04895063552587649
  (1099, 826)	0.09433125275646147
  (1099, 803)	0.08861197890849595
  (1099, 646)	

In [5]:
# Extract labels.
y_train = df_train["Label"]
y_test = df_test["Label"]

print(len(y_train))
print(y_train.value_counts(normalize = True))
print(y_train.head())

print(len(y_test))
print(y_test.value_counts(normalize = True))
print(y_test.head())

3858
0    0.925868
1    0.074132
Name: Label, dtype: float64
0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64
1101
0    0.926431
1    0.073569
Name: Label, dtype: float64
0    1
1    0
2    0
3    0
4    0
Name: Label, dtype: int64


## Train

In [6]:
# Train a multinomial Naive Bayes model.
mnb = MultinomialNB().fit(X_train_tfidf, y_train)

In [7]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-3, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm = svm.fit(X_train_tfidf, y_train)

## Test

In [8]:
# Predict on test set.
y_mnb = mnb.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- NAIVE BAYES ---\n")
print(metrics.classification_report(y_test, y_mnb))

# MNB performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_mnb = metrics.confusion_matrix(y_test, y_mnb)
acc_mnb = metrics.accuracy_score(y_test, y_mnb)
f1_mnb = metrics.f1_score(y_test, y_mnb, zero_division = 0)
precision_mnb = metrics.precision_score(y_test, y_mnb, zero_division = 0)
recall_mnb = metrics.recall_score(y_test, y_mnb, zero_division = 0)
roc_mnb = metrics.roc_auc_score(y_test, y_mnb)
        
print("\n--- NAIVE BAYES ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_mnb.ravel())
print("F1             =", f1_mnb)
print("Accuracy       =", acc_mnb)
print("Precision      =", precision_mnb)
print("Recall         =", recall_mnb)
print("ROC AUC        =", roc_mnb)
print("---------------------------------------------\n")


--- NAIVE BAYES ---

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1020
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1101
   macro avg       0.46      0.50      0.48      1101
weighted avg       0.86      0.93      0.89      1101


--- NAIVE BAYES ---


---------------------------------------------
tn, fp, fn, tp = [1020    0   81    0]
F1             = 0.0
Accuracy       = 0.9264305177111717
Precision      = 0.0
Recall         = 0.0
ROC AUC        = 0.5
---------------------------------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Predict on test set.
y_svm = svm.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm = metrics.confusion_matrix(y_test, y_svm)
acc_svm = metrics.accuracy_score(y_test, y_svm)
f1_svm = metrics.f1_score(y_test, y_svm, zero_division = 0)
precision_svm = metrics.precision_score(y_test, y_svm, zero_division = 0)
recall_svm = metrics.recall_score(y_test, y_svm, zero_division = 0)
roc_svm = metrics.roc_auc_score(y_test, y_svm)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm.ravel())
print("F1             =", f1_svm)
print("Accuracy       =", acc_svm)
print("Precision      =", precision_svm)
print("Recall         =", recall_svm)
print("ROC AUC        =", roc_svm)
print("---------------------------------------------\n")


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1020
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1101
   macro avg       0.46      0.50      0.48      1101
weighted avg       0.86      0.93      0.89      1101


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [1020    0   81    0]
F1             = 0.0
Accuracy       = 0.9264305177111717
Precision      = 0.0
Recall         = 0.0
ROC AUC        = 0.5
---------------------------------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter tuning

In [17]:
# Init search space for SVM.
parameters = {'alpha': [0.0, 0.01, 0.1, 0.5, 1.0, 10.0]}

# Tune SVM with 5-fold cross-validation.
mnb_tuned = GridSearchCV(MultinomialNB(), 
                         parameters,
                         scoring = "f1",
                         cv = 5, 
                         n_jobs = -1)

# Fit tuned model.
mnb_tuned = mnb_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_mnb_tuned = mnb_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- TUNED NAIVE BAYES ---\n")
print(metrics.classification_report(y_test, y_mnb_tuned))

# MNB tuned performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_mnb_tuned = metrics.confusion_matrix(y_test, y_mnb_tuned)
acc_mnb_tuned = metrics.accuracy_score(y_test, y_mnb_tuned)
f1_mnb_tuned = metrics.f1_score(y_test, y_mnb_tuned, zero_division = 0)
precision_mnb_tuned = metrics.precision_score(y_test, y_mnb_tuned, zero_division = 0)
recall_mnb_tuned = metrics.recall_score(y_test, y_mnb_tuned, zero_division = 0)
roc_mnb_tuned = metrics.roc_auc_score(y_test, y_mnb_tuned)
        
print("\n--- TUNED NAIVE BAYES ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_mnb_tuned.ravel())
print("F1             =", f1_mnb_tuned)
print("Accuracy       =", acc_mnb_tuned)
print("Precision      =", precision_mnb_tuned)
print("Recall         =", recall_mnb_tuned)
print("ROC AUC        =", roc_mnb_tuned)
print("---------------------------------------------\n")




--- TUNED NAIVE BAYES ---

              precision    recall  f1-score   support

           0       0.94      0.93      0.93      1020
           1       0.19      0.20      0.19        81

    accuracy                           0.88      1101
   macro avg       0.56      0.57      0.56      1101
weighted avg       0.88      0.88      0.88      1101


--- TUNED NAIVE BAYES ---


---------------------------------------------
tn, fp, fn, tp = [952  68  65  16]
F1             = 0.1939393939393939
Accuracy       = 0.8792007266121707
Precision      = 0.19047619047619047
Recall         = 0.19753086419753085
ROC AUC        = 0.5654320987654321
---------------------------------------------



In [None]:
%%capture
'''
# Init search space for SVM.
parameters = {'clf__alpha': (1e-1, 1e-2, 1e-3)}

# Tune SVM with 5-fold cross-validation.
svm_tuned = SGDClassifier(loss = "hinge", 
                          penalty = "l2",
                          random_state = 42,
                          max_iter = 5, 
                          tol = None)
svm_tuned = GridSearchCV(svm_tuned, 
                         parameters,
                         cv = 5, 
                         scoring = "f1",
                         n_jobs = -1)

# Fit tuned model.
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- TUNED LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))
'''

In [12]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-2, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1020
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1101
   macro avg       0.46      0.50      0.48      1101
weighted avg       0.86      0.93      0.89      1101


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [1020    0   81    0]
F1             = 0.0
Accuracy       = 0.9264305177111717
Precision      = 0.0
Recall         = 0.0
ROC AUC        = 0.5
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1020
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1101
   macro avg       0.46      0.50      0.48      1101
weighted avg       0.86      0.93   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-1, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1020
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1101
   macro avg       0.46      0.50      0.48      1101
weighted avg       0.86      0.93      0.89      1101


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [1020    0   81    0]
F1             = 0.0
Accuracy       = 0.9264305177111717
Precision      = 0.0
Recall         = 0.0
ROC AUC        = 0.5
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1020
           1       0.00      0.00      0.00        81

    accuracy                           0.93      1101
   macro avg       0.46      0.50      0.48      1101
weighted avg       0.86      0.93   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-4, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# 0 = HAM, 1 = SPAM.
# Care most about false negatives – keeping SPAM when we only want HAM.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1020
           1       0.31      0.14      0.19        81

    accuracy                           0.91      1101
   macro avg       0.62      0.56      0.57      1101
weighted avg       0.89      0.91      0.90      1101


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [995  25  70  11]
F1             = 0.18803418803418803
Accuracy       = 0.9137148047229791
Precision      = 0.3055555555555556
Recall         = 0.13580246913580246
ROC AUC        = 0.5556463326071168
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1020
           1       0.31      0.14      0.19        81

    accuracy                           0.91      1101
   macro avg       0.62      0.5

## Optimal model

In [19]:
# BEST PERFORMING MODEL = linear SVM with below hyperparameters.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_opt = SGDClassifier(loss = "hinge", 
                        penalty = "l2",
                        alpha = 1e-4, 
                        random_state = 42,
                        max_iter = 5, 
                        tol = None)

svm_opt = svm_opt.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_opt = svm_opt.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_opt))

ERROR! Session/line number was not unique in database. History logging moved to new session 1034

--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1020
           1       0.31      0.14      0.19        81

    accuracy                           0.91      1101
   macro avg       0.62      0.56      0.57      1101
weighted avg       0.89      0.91      0.90      1101

