# Training Support Vector Machine

## Code for training a suppor vector machine model using cross-validation

In [1]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV,RepeatedStratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
"""Functions def"""
def read_data(raw_clinical_note):
    data = pd.read_csv(raw_clinical_note, header=0,na_filter=True)
    return data

def saving_model(model,model_name,logs_file):
    model_name = model_name + '.pkl'
    url_save = os.path.join(logs_file,str(model_name))
    joblib.dump(model, url_save)

In [3]:
"""Loading training sets"""
path = "C:/Users/Salvador/Modelo_COVID19/Libretas manuscrito/BCM Infectius diseases/BCM/"
x_train = pd.read_csv(path + "X_train.csv")
y_train = pd.read_csv(path + "y_train.csv")
print ("x_train:", x_train.shape, "y_train:",y_train.shape)

x_train: (9251, 35) y_train: (9251, 2)


In [4]:
x_train = x_train.drop(['Unnamed: 0'],axis=1)
y_train = y_train.drop(['Unnamed: 0'],axis=1)
print ("x_train:", x_train.shape, "y_train:", y_train.shape)

x_train: (9251, 34) y_train: (9251, 1)


In [5]:
y_train.value_counts()

lethality
0            7953
1            1298
dtype: int64

In [6]:
x_train

Unnamed: 0,Sex,Age,Fever,Cough,Odynophagia,Dyspnea,Irritability,Diarrhea,Chest pain,Chills,...,COPD,Asthma,Immunosuppression,Hypertension,Cardiovascular disease,Obesity,CKD,Smoking,Use of antipyretics,Days elapsed
0,0.923870,-1.200365,0.426006,0.390861,-1.264644,-0.886320,-0.425121,-0.519490,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,0.226939
1,-1.082403,0.340091,0.426006,0.390861,0.790736,-0.886320,-0.425121,-0.519490,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,-1.026691
2,-1.082403,0.054821,-2.347382,0.390861,-1.264644,1.128261,-0.425121,-0.519490,-0.696870,0.957761,...,7.039312,-0.138453,-0.078738,1.783339,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,-0.190938
3,0.923870,-1.314472,-2.347382,0.390861,0.790736,-0.886320,-0.425121,-0.519490,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,1.898446
4,-1.082403,-0.173394,-2.347382,0.390861,0.790736,-0.886320,-0.425121,-0.519490,1.434989,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,-1.026691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9246,-1.082403,-0.515718,0.426006,0.390861,-1.264644,-0.886320,-0.425121,1.924966,1.434989,-1.044102,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,0.949495,-0.608815
9247,-1.082403,0.682414,0.426006,0.390861,0.790736,1.128261,-0.425121,1.924966,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,1.783339,-0.147124,2.211867,-0.172061,-0.258154,-1.053191,0.226939
9248,0.923870,1.538223,0.426006,0.390861,-1.264644,1.128261,2.352274,-0.519490,-0.696870,-1.044102,...,-0.142059,-0.138453,-0.078738,1.783339,-0.147124,-0.452107,-0.172061,3.873653,-1.053191,-0.608815
9249,0.923870,0.796522,0.426006,0.390861,-1.264644,-0.886320,2.352274,-0.519490,1.434989,0.957761,...,-0.142059,-0.138453,-0.078738,1.783339,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,0.644816


In [7]:
estimator = svm.SVC()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=442)
sample_weights = class_weight.compute_sample_weight('balanced', y_train)
parameters={ 'C':np.logspace(-4, 4, 5),
            'class_weight': ['balanced'],
            'gamma':['scale','auto'],
            'random_state': [422]}
grid = GridSearchCV(estimator=estimator, param_grid=parameters, cv = cv, scoring='roc_auc',refit = True)
grid.fit(x_train,y_train,sample_weight = sample_weights)

In [8]:
print('Best params achieved via GridSearch: \n',grid.best_params_)
print('Best score in training:', grid.best_score_)

Best params achieved via GridSearch: 
 {'C': 0.01, 'class_weight': 'balanced', 'gamma': 'scale', 'random_state': 422}
Best score in training: 0.8308387174308328


In [9]:
"""Visualing metrics from gridSearchCV"""
cv_results = pd.DataFrame.from_dict(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_random_state,params,split0_test_score,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
0,4.463953,0.295518,2.269032,2.246353,0.0001,balanced,scale,422,"{'C': 0.0001, 'class_weight': 'balanced', 'gam...",0.826218,...,0.829149,0.839032,0.815791,0.784819,0.832356,0.855559,0.828486,0.830023,0.01557,3
1,4.710653,0.924657,0.917949,0.226124,0.0001,balanced,auto,422,"{'C': 0.0001, 'class_weight': 'balanced', 'gam...",0.825106,...,0.828849,0.838771,0.815907,0.784935,0.831292,0.854446,0.826783,0.829846,0.015668,4
2,4.787917,1.056997,0.927559,0.347183,0.01,balanced,scale,422,"{'C': 0.01, 'class_weight': 'balanced', 'gamma...",0.825097,...,0.832748,0.836768,0.816894,0.786318,0.831601,0.855539,0.827228,0.830839,0.015611,1
3,4.51262,0.970374,0.88953,0.327421,0.01,balanced,auto,422,"{'C': 0.01, 'class_weight': 'balanced', 'gamma...",0.825106,...,0.832719,0.836768,0.816797,0.786502,0.831669,0.855462,0.827092,0.830836,0.015605,2
4,3.234324,0.945879,0.593566,0.198053,1.0,balanced,scale,422,"{'C': 1.0, 'class_weight': 'balanced', 'gamma'...",0.802223,...,0.806759,0.817213,0.784635,0.777136,0.82567,0.839361,0.811824,0.814031,0.016911,6
5,3.299633,0.982149,0.583689,0.19197,1.0,balanced,auto,422,"{'C': 1.0, 'class_weight': 'balanced', 'gamma'...",0.802203,...,0.806797,0.817175,0.784644,0.777271,0.825544,0.839381,0.811805,0.814032,0.016906,5
6,3.549114,0.938133,0.343625,0.108974,100.0,balanced,scale,422,"{'C': 100.0, 'class_weight': 'balanced', 'gamm...",0.805982,...,0.782966,0.7988,0.764296,0.744673,0.778152,0.803387,0.770924,0.781533,0.016922,8
7,3.60482,0.847554,0.360987,0.128653,100.0,balanced,auto,422,"{'C': 100.0, 'class_weight': 'balanced', 'gamm...",0.80602,...,0.783227,0.7988,0.76419,0.744557,0.778229,0.803367,0.770847,0.781545,0.016932,7
8,10.081484,2.440878,0.363682,0.135171,10000.0,balanced,scale,422,"{'C': 10000.0, 'class_weight': 'balanced', 'ga...",0.801923,...,0.794935,0.760261,0.744286,0.774214,0.747954,0.76389,0.757987,0.761413,0.017796,10
9,9.465243,2.133213,0.345241,0.126067,10000.0,balanced,auto,422,"{'C': 10000.0, 'class_weight': 'balanced', 'ga...",0.801846,...,0.794954,0.76031,0.744277,0.774001,0.74805,0.763764,0.758084,0.761443,0.017788,9


In [10]:
"""Saving AUC (cross-validation)"""
cv_results = pd.DataFrame.from_dict(grid.cv_results_)
auc_training = cv_results.mean_test_score
auc_training.to_csv(path+"AUC_crossval_SVM_0.csv")

In [11]:
""" Re-training the model with the best params"""
best_params = grid.best_params_
model = svm.SVC(**best_params)
model = model.fit(x_train,y_train)

In [12]:
"""Saving model"""
saving_model(model,"SVM_model_0",path)