# Supervised baseline: RNA

## Preamble

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier # Linear SVM.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

## Read data

In [2]:
# Read data files.
df_train = pd.read_csv("../data/rna_titles/rna_tuned_train.csv")
df_val = pd.read_csv("../data/rna_titles/rna_tuned_val.csv")
df_test = pd.read_csv("../data/rna_titles/rna_tuned_test.csv")

# Explore data.
print(df_train.info())
display(df_train.head())

print(df_val.info())
display(df_val.head())

print(df_test.info())
display(df_test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1656 entries, 0 to 1655
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PMID              1656 non-null   int64 
 1   Title             1656 non-null   object
 2   Authors           1655 non-null   object
 3   Citation          1656 non-null   object
 4   First Author      1655 non-null   object
 5   Journal/Book      1656 non-null   object
 6   Publication Year  1656 non-null   int64 
 7   Create Date       1656 non-null   object
 8   PMCID             943 non-null    object
 9   NIHMS ID          149 non-null    object
 10  DOI               1656 non-null   object
 11  Label             1656 non-null   int64 
dtypes: int64(3), object(9)
memory usage: 155.4+ KB
None


Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Label
0,21205307,time-series clustering of gene expression in i...,"Ghandhi SA, Sinha A, Markatou M, Amundson SA.",BMC Genomics. 2011 Jan 4;12:2. doi: 10.1186/14...,Ghandhi SA,BMC Genomics,2011,2011/01/06,PMC3022823,,10.1186/1471-2164-12-2,1
1,25409906,dynamic analyses of alternative polyadenylatio...,"Xia Z, Donehower LA, Cooper TA, Neilson JR, Wh...",Nat Commun. 2014 Nov 20;5:5274. doi: 10.1038/n...,Xia Z,Nat Commun,2014,2014/11/21,PMC4467577,NIHMS674338,10.1038/ncomms6274,0
2,14984408,temporal and spatial patterns of kv1.1 and kv1...,"Karimi-Abdolrezaee S, Eftekharpour E, Fehlings...",Eur J Neurosci. 2004 Feb;19(3):577-89. doi: 10...,Karimi-Abdolrezaee S,Eur J Neurosci,2004,2004/02/27,,,10.1111/j.0953-816x.2004.03164.x,0
3,19340919,simultaneously segmenting multiple gene expres...,"Tadepalli S, Ramakrishnan N, Watson LT, Mishra...",J Bioinform Comput Biol. 2009 Apr;7(2):339-56....,Tadepalli S,J Bioinform Comput Biol,2009,2009/04/03,,,10.1142/s0219720009004114,1
4,18547802,rna dynamics: it is about time,"Al-Hashimi HM, Walter NG.",Curr Opin Struct Biol. 2008 Jun;18(3):321-9. d...,Al-Hashimi HM,Curr Opin Struct Biol,2008,2008/06/13,PMC2580758,NIHMS74438,10.1016/j.sbi.2008.04.004,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PMID              184 non-null    int64 
 1   Title             184 non-null    object
 2   Authors           184 non-null    object
 3   Citation          184 non-null    object
 4   First Author      184 non-null    object
 5   Journal/Book      184 non-null    object
 6   Publication Year  184 non-null    int64 
 7   Create Date       184 non-null    object
 8   PMCID             108 non-null    object
 9   NIHMS ID          23 non-null     object
 10  DOI               184 non-null    object
 11  Label             184 non-null    int64 
dtypes: int64(3), object(9)
memory usage: 17.4+ KB
None


Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Label
0,27249342,recent advances in dynamic m6a rna modification,"Cao G, Li HB, Yin Z, Flavell RA.",Open Biol. 2016 Apr;6(4):160003. doi: 10.1098/...,Cao G,Open Biol,2016,2016/06/02,PMC4852458,,10.1098/rsob.160003,0
1,16420705,discovery of time-delayed gene regulatory netw...,"Li X, Rao S, Jiang W, Li C, Xiao Y, Guo Z, Zha...",BMC Bioinformatics. 2006 Jan 18;7:26. doi: 10....,Li X,BMC Bioinformatics,2006,2006/01/20,PMC1386718,,10.1186/1471-2105-7-26,0
2,27409645,evaluation of immortalized avpv- and arcuate-s...,"Jacobs DC, Veitch RE, Chappell PE.",Endocrinology. 2016 Sep;157(9):3410-9. doi: 10...,Jacobs DC,Endocrinology,2016,2016/07/14,,,10.1210/en.2016-1294,1
3,29615554,flying the rna nest: drosophila reveals novel ...,"Lefebvre FA, Lécuyer É.",J Dev Biol. 2018 Mar 7;6(1):5. doi: 10.3390/jd...,Lefebvre FA,J Dev Biol,2018,2018/04/05,PMC5875563,,10.3390/jdb6010005,1
4,23157550,exploring the dynamics of four rna-dependent r...,"Shen H, Moustafa IM, Cameron CE, Colina CM.",J Phys Chem B. 2012 Dec 20;116(50):14515-24. d...,Shen H,J Phys Chem B,2012,2012/11/20,PMC5718632,NIHMS922948,10.1021/jp302709v,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PMID              460 non-null    int64 
 1   Title             460 non-null    object
 2   Authors           458 non-null    object
 3   Citation          460 non-null    object
 4   First Author      458 non-null    object
 5   Journal/Book      460 non-null    object
 6   Publication Year  460 non-null    int64 
 7   Create Date       460 non-null    object
 8   PMCID             250 non-null    object
 9   NIHMS ID          51 non-null     object
 10  DOI               459 non-null    object
 11  Label             460 non-null    int64 
dtypes: int64(3), object(9)
memory usage: 43.2+ KB
None


Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Label
0,13677474,temporal progression of gene expression respon...,"Wang H, Miyazaki S, Kawai K, Deyholos M, Galbr...",Plant Mol Biol. 2003 Jul;52(4):873-91. doi: 10...,Wang H,Plant Mol Biol,2003,2003/09/19,,,10.1023/a:1025029026375,1
1,31997507,temporal profile of hev rna concentration in b...,"Goel A, V Padmaprakash K, Benjamin M, Katiyar ...",J Viral Hepat. 2020 Jun;27(6):631-637. doi: 10...,Goel A,J Viral Hepat,2020,2020/01/31,,,10.1111/jvh.13266,0
2,19147844,dynamic gene expression is required for anteri...,"Pechmann M, McGregor AP, Schwager EE, Feitosa ...",Proc Natl Acad Sci U S A. 2009 Feb 3;106(5):14...,Pechmann M,Proc Natl Acad Sci U S A,2009,2009/01/17,PMC2635816,,10.1073/pnas.0811150106,0
3,11606741,development of a two-part transcription probe ...,"Li Z, Piggot PJ.",Proc Natl Acad Sci U S A. 2001 Oct 23;98(22):1...,Li Z,Proc Natl Acad Sci U S A,2001,2001/10/19,PMC60089,,10.1073/pnas.221454798,1
4,21176199,the complexity of gene expression dynamics rev...,"Sun X, Zou Y, Nikiforova V, Kurths J, Walther D.",BMC Bioinformatics. 2010 Dec 22;11:607. doi: 1...,Sun X,BMC Bioinformatics,2010,2010/12/24,PMC3098107,,10.1186/1471-2105-11-607,1


## Process data

In [3]:
# Build dictionary of features and transform documents to feature vectors.
# Value of a word in the vocabulary is its frequency in the whole training corpus.
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train["Title"])
print(X_train_counts.shape)
print(X_train_counts)

(1656, 4468)
  (0, 4113)	1
  (0, 3680)	1
  (0, 713)	2
  (0, 2810)	2
  (0, 1646)	1
  (0, 1456)	1
  (0, 1974)	1
  (0, 2118)	1
  (0, 222)	1
  (0, 534)	1
  (0, 1511)	1
  (0, 209)	1
  (0, 267)	1
  (0, 1488)	1
  (1, 2810)	1
  (1, 1233)	1
  (1, 212)	1
  (1, 185)	1
  (1, 3136)	1
  (1, 1606)	1
  (1, 3553)	1
  (1, 3674)	1
  (1, 3519)	1
  (1, 4320)	1
  (1, 2213)	1
  :	:
  (1654, 4208)	1
  (1654, 99)	1
  (1654, 4356)	1
  (1654, 3442)	1
  (1654, 770)	1
  (1654, 3725)	1
  (1654, 1529)	1
  (1654, 858)	1
  (1654, 3763)	1
  (1654, 4460)	1
  (1654, 4395)	1
  (1655, 2810)	1
  (1655, 1646)	1
  (1655, 1456)	1
  (1655, 222)	2
  (1655, 1233)	1
  (1655, 4083)	1
  (1655, 2825)	1
  (1655, 3909)	1
  (1655, 3562)	1
  (1655, 1586)	1
  (1655, 2398)	1
  (1655, 665)	1
  (1655, 2305)	1
  (1655, 1897)	1


In [4]:
%%capture
'''
# Regularize via “Term Frequency.”
tf_transformer = TfidfTransformer(use_idf = False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf.shape)
print(X_train_tf)
'''

In [5]:
# Regularize via “Term Frequency times Inverse Document Frequency.”
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1656, 4468)

In [6]:
# Call transform only, as featurizers have already been fit to training data.
X_test_counts = count_vect.transform(df_test["Title"])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_tfidf.shape)
print(X_test_tfidf)

(460, 4468)
  (0, 4129)	0.18762593697179625
  (0, 4055)	0.13795009633576574
  (0, 3704)	0.4038868013919094
  (0, 3590)	0.45828339022661657
  (0, 3566)	0.4038868013919094
  (0, 3501)	0.31592974777919386
  (0, 3243)	0.35707917870897676
  (0, 2810)	0.08483258781007404
  (0, 2359)	0.38391176369580393
  (0, 1974)	0.10326528055314137
  (0, 1646)	0.0929566474021971
  (0, 1456)	0.09103243454822718
  (1, 4427)	0.33437719074511624
  (1, 4055)	0.10814230677478488
  (1, 3889)	0.35925906753300363
  (1, 3553)	0.08751230066393002
  (1, 3408)	0.3403893464400544
  (1, 3229)	0.23323058572619954
  (1, 2979)	0.24766471797497455
  (1, 2810)	0.06650224957528644
  (1, 1974)	0.161904137008808
  (1, 1806)	0.29474306498690495
  (1, 1665)	0.35925906753300363
  (1, 1606)	0.1615406481695132
  (1, 799)	0.3270010662599543
  :	:
  (458, 4083)	0.13031835678320042
  (458, 4055)	0.1339907359530225
  (458, 3719)	0.3371978032328088
  (458, 2810)	0.0823977740892973
  (458, 2702)	0.445130017032037
  (458, 2575)	0.2384964651

In [7]:
# Extract labels.
y_train = df_train["Label"]
y_test = df_test["Label"]

print(len(y_train))
print(y_train.value_counts(normalize = True))
print(y_train.head())

print(len(y_test))
print(y_test.value_counts(normalize = True))
print(y_test.head())

1656
0    0.620773
1    0.379227
Name: Label, dtype: float64
0    1
1    0
2    0
3    1
4    0
Name: Label, dtype: int64
460
0    0.621739
1    0.378261
Name: Label, dtype: float64
0    1
1    0
2    0
3    1
4    1
Name: Label, dtype: int64


## Train

In [8]:
# Train a multinomial Naive Bayes model.
mnb = MultinomialNB().fit(X_train_tfidf, y_train)

In [9]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-3, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm = svm.fit(X_train_tfidf, y_train)

## Test

In [10]:
# Predict on test set.
y_mnb = mnb.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- NAIVE BAYES ---\n")
print(metrics.classification_report(y_test, y_mnb))

# MNB performance metrics.
# Care about most about false negatives – discarding papers we want to keep.
# Recall, F1, and accuracy rely on false negatives.
confusion_mnb = metrics.confusion_matrix(y_test, y_mnb)
acc_mnb = metrics.accuracy_score(y_test, y_mnb)
f1_mnb = metrics.f1_score(y_test, y_mnb, zero_division = 0)
precision_mnb = metrics.precision_score(y_test, y_mnb, zero_division = 0)
recall_mnb = metrics.recall_score(y_test, y_mnb, zero_division = 0)
roc_mnb = metrics.roc_auc_score(y_test, y_mnb)
        
print("\n--- NAIVE BAYES ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_mnb.ravel())
print("F1             =", f1_mnb)
print("Accuracy       =", acc_mnb)
print("Precision      =", precision_mnb)
print("Recall         =", recall_mnb)
print("ROC AUC        =", roc_mnb)
print("---------------------------------------------\n")


--- NAIVE BAYES ---

              precision    recall  f1-score   support

           0       0.79      0.96      0.86       286
           1       0.89      0.57      0.69       174

    accuracy                           0.81       460
   macro avg       0.84      0.76      0.78       460
weighted avg       0.83      0.81      0.80       460


--- NAIVE BAYES ---


---------------------------------------------
tn, fp, fn, tp = [274  12  75  99]
F1             = 0.6947368421052632
Accuracy       = 0.8108695652173913
Precision      = 0.8918918918918919
Recall         = 0.5689655172413793
ROC AUC        = 0.7635037376416688
---------------------------------------------



In [11]:
# Predict on test set.
y_svm = svm.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm))

# SVM performance metrics.
# Care about most about false negatives – discarding papers we want to keep.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm = metrics.confusion_matrix(y_test, y_svm)
acc_svm = metrics.accuracy_score(y_test, y_svm)
f1_svm = metrics.f1_score(y_test, y_svm, zero_division = 0)
precision_svm = metrics.precision_score(y_test, y_svm, zero_division = 0)
recall_svm = metrics.recall_score(y_test, y_svm, zero_division = 0)
roc_svm = metrics.roc_auc_score(y_test, y_svm)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm.ravel())
print("F1             =", f1_svm)
print("Accuracy       =", acc_svm)
print("Precision      =", precision_svm)
print("Recall         =", recall_svm)
print("ROC AUC        =", roc_svm)
print("---------------------------------------------\n")


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       286
           1       0.78      0.79      0.78       174

    accuracy                           0.83       460
   macro avg       0.82      0.83      0.83       460
weighted avg       0.84      0.83      0.84       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [246  40  36 138]
F1             = 0.7840909090909092
Accuracy       = 0.8347826086956521
Precision      = 0.7752808988764045
Recall         = 0.7931034482758621
ROC AUC        = 0.8266216542078612
---------------------------------------------



## Hyperparameter tuning

In [12]:
# Init search space.
parameters = {'alpha': [0.0, 0.01, 0.1, 0.5, 1.0, 10.0]}

# Tune SVM with 5-fold cross-validation.
mnb_tuned = GridSearchCV(MultinomialNB(), 
                         parameters,
                         cv = 5, 
                         n_jobs = -1)

# Fit tuned model.
mnb_tuned = mnb_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_mnb_tuned = mnb_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- TUNED NAIVE BAYES ---\n")
print(metrics.classification_report(y_test, y_mnb_tuned))

# MNB tuned performance metrics.
# Care most about false negatives.
# Recall, F1, and accuracy rely on false negatives.
confusion_mnb_tuned = metrics.confusion_matrix(y_test, y_mnb_tuned)
acc_mnb_tuned = metrics.accuracy_score(y_test, y_mnb_tuned)
f1_mnb_tuned = metrics.f1_score(y_test, y_mnb_tuned, zero_division = 0)
precision_mnb_tuned = metrics.precision_score(y_test, y_mnb_tuned, zero_division = 0)
recall_mnb_tuned = metrics.recall_score(y_test, y_mnb_tuned, zero_division = 0)
roc_mnb_tuned = metrics.roc_auc_score(y_test, y_mnb_tuned)
        
print("\n--- TUNED NAIVE BAYES ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_mnb_tuned.ravel())
print("F1             =", f1_mnb_tuned)
print("Accuracy       =", acc_mnb_tuned)
print("Precision      =", precision_mnb_tuned)
print("Recall         =", recall_mnb_tuned)
print("ROC AUC        =", roc_mnb_tuned)
print("---------------------------------------------\n")


--- TUNED NAIVE BAYES ---

              precision    recall  f1-score   support

           0       0.81      0.94      0.87       286
           1       0.86      0.64      0.74       174

    accuracy                           0.83       460
   macro avg       0.84      0.79      0.80       460
weighted avg       0.83      0.83      0.82       460


--- TUNED NAIVE BAYES ---


---------------------------------------------
tn, fp, fn, tp = [268  18  62 112]
F1             = 0.7368421052631579
Accuracy       = 0.8260869565217391
Precision      = 0.8615384615384616
Recall         = 0.6436781609195402
ROC AUC        = 0.7903705489912387
---------------------------------------------



In [13]:
%%capture
'''
# Init search space for SVM.
parameters = {'clf__alpha': (1e-1, 1e-2, 1e-3)}

# Tune SVM with 5-fold cross-validation.
svm_tuned = SGDClassifier(loss = "hinge", 
                          penalty = "l2",
                          random_state = 42,
                          max_iter = 5, 
                          tol = None)
svm_tuned = GridSearchCV(svm_tuned, 
                         parameters,
                         cv = 5, 
                         n_jobs = -1)

# Fit tuned model.
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- TUNED LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))
'''

In [14]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                          penalty = "l2",
                          alpha = 1e-2, 
                          random_state = 42,
                          max_iter = 5, 
                          tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# Care most about false negatives.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.68      0.99      0.81       286
           1       0.93      0.23      0.37       174

    accuracy                           0.70       460
   macro avg       0.80      0.61      0.59       460
weighted avg       0.77      0.70      0.64       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [283   3 134  40]
F1             = 0.36866359447004604
Accuracy       = 0.7021739130434783
Precision      = 0.9302325581395349
Recall         = 0.22988505747126436
ROC AUC        = 0.609697773490877
---------------------------------------------



In [15]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-1, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# Care most about false negatives.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.62      1.00      0.77       286
           1       0.00      0.00      0.00       174

    accuracy                           0.62       460
   macro avg       0.31      0.50      0.38       460
weighted avg       0.39      0.62      0.48       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [286   0 174   0]
F1             = 0.0
Accuracy       = 0.6217391304347826
Precision      = 0.0
Recall         = 0.0
ROC AUC        = 0.5
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.62      1.00      0.77       286
           1       0.00      0.00      0.00       174

    accuracy                           0.62       460
   macro avg       0.31      0.50      0.38       460
weighted avg       0.39      0.62      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-4, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# Care most about false negative.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.90      0.87      0.88       286
           1       0.79      0.83      0.81       174

    accuracy                           0.85       460
   macro avg       0.84      0.85      0.85       460
weighted avg       0.86      0.85      0.86       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [248  38  29 145]
F1             = 0.8123249299719888
Accuracy       = 0.8543478260869565
Precision      = 0.7923497267759563
Recall         = 0.8333333333333334
ROC AUC        = 0.8502331002331003
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.90      0.87      0.88       286
           1       0.79      0.83      0.81       174

    accuracy                           0.85       460
   macro avg       0.84      0.85 

In [17]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-5, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# Care most about false negatives.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       286
           1       0.83      0.76      0.79       174

    accuracy                           0.85       460
   macro avg       0.84      0.83      0.84       460
weighted avg       0.85      0.85      0.85       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [258  28  41 133]
F1             = 0.7940298507462686
Accuracy       = 0.85
Precision      = 0.8260869565217391
Recall         = 0.764367816091954
ROC AUC        = 0.8332328590949281
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       286
           1       0.83      0.76      0.79       174

    accuracy                           0.85       460
   macro avg       0.84      0.83      0.84      

In [18]:
# Train a linear SVM.
# Regularized linear models with stochastic gradient descent (SGD) learning.
svm_tuned = SGDClassifier(loss = "hinge", 
                    penalty = "l2",
                    alpha = 1e-6, 
                    random_state = 42,
                    max_iter = 5, 
                    tol = None)
svm_tuned = svm_tuned.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_tuned = svm_tuned.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))

# SVM performance metrics.
# Care most about false negatives.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_tuned = metrics.confusion_matrix(y_test, y_svm_tuned)
acc_svm_tuned = metrics.accuracy_score(y_test, y_svm_tuned)
f1_svm_tuned = metrics.f1_score(y_test, y_svm_tuned, zero_division = 0)
precision_svm_tuned = metrics.precision_score(y_test, y_svm_tuned, zero_division = 0)
recall_svm_tuned = metrics.recall_score(y_test, y_svm_tuned, zero_division = 0)
roc_svm_tuned = metrics.roc_auc_score(y_test, y_svm_tuned)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_tuned.ravel())
print("F1             =", f1_svm_tuned)
print("Accuracy       =", acc_svm_tuned)
print("Precision      =", precision_svm_tuned)
print("Recall         =", recall_svm_tuned)
print("ROC AUC        =", roc_svm_tuned)
print("---------------------------------------------\n")

# Evaluate performance.
print("\n--- CLASSIFICATION REPORT ---\n")
print(metrics.classification_report(y_test, y_svm_tuned))


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.87      0.88      0.87       286
           1       0.80      0.78      0.79       174

    accuracy                           0.84       460
   macro avg       0.83      0.83      0.83       460
weighted avg       0.84      0.84      0.84       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [252  34  38 136]
F1             = 0.7906976744186047
Accuracy       = 0.8434782608695652
Precision      = 0.8
Recall         = 0.7816091954022989
ROC AUC        = 0.83136403826059
---------------------------------------------


--- CLASSIFICATION REPORT ---

              precision    recall  f1-score   support

           0       0.87      0.88      0.87       286
           1       0.80      0.78      0.79       174

    accuracy                           0.84       460
   macro avg       0.83      0.83      0.83       4

## Optimal model

In [19]:
# Fit model.
svm_opt = SGDClassifier(loss = "hinge", 
                        penalty = "l2",
                        alpha = 1e-4, 
                        random_state = 42,
                        max_iter = 5, 
                        tol = None)
svm_opt = svm_opt.fit(X_train_tfidf, y_train)

# Predict on test set.
y_svm_opt = svm_opt.predict(X_test_tfidf)

# Evaluate performance.
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print(metrics.classification_report(y_test, y_svm_opt))

# SVM performance metrics.
# Care about most about false negatives – discarding papers we want to keep.
# Recall, F1, and accuracy rely on false negatives.
confusion_svm_opt = metrics.confusion_matrix(y_test, y_svm_opt)
acc_svm_opt = metrics.accuracy_score(y_test, y_svm_opt)
f1_svm_opt = metrics.f1_score(y_test, y_svm_opt, zero_division = 0)
precision_svm_opt = metrics.precision_score(y_test, y_svm_opt, zero_division = 0)
recall_svm_opt = metrics.recall_score(y_test, y_svm_opt, zero_division = 0)
roc_svm_opt = metrics.roc_auc_score(y_test, y_svm_opt)
        
print("\n--- LINEAR SUPPORT VECTOR MACHINE ---\n")
print("\n---------------------------------------------")
print("tn, fp, fn, tp =", confusion_svm_opt.ravel())
print("F1             =", f1_svm_opt)
print("Accuracy       =", acc_svm_opt)
print("Precision      =", precision_svm_opt)
print("Recall         =", recall_svm_opt)
print("ROC AUC        =", roc_svm_opt)
print("---------------------------------------------\n")


--- LINEAR SUPPORT VECTOR MACHINE ---

              precision    recall  f1-score   support

           0       0.90      0.87      0.88       286
           1       0.79      0.83      0.81       174

    accuracy                           0.85       460
   macro avg       0.84      0.85      0.85       460
weighted avg       0.86      0.85      0.86       460


--- LINEAR SUPPORT VECTOR MACHINE ---


---------------------------------------------
tn, fp, fn, tp = [248  38  29 145]
F1             = 0.8123249299719888
Accuracy       = 0.8543478260869565
Precision      = 0.7923497267759563
Recall         = 0.8333333333333334
ROC AUC        = 0.8502331002331003
---------------------------------------------



