# Support Vector Classifier - Detection of Swallowing Disorders

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas import ExcelWriter
import os
import numpy as np
import random
plt.style.use("seaborn")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Loading the dataset

In [2]:
healthy = pd.read_excel("D:/DATA SCIENCE/INTERNSHIP PROJECT/Modelling/HealthySamples.xlsx")
healthy = healthy[healthy["Sample status"] == "Perfect"]
healthy["Swallow_type"] = "healthy"
unhealthy = pd.read_excel("D:/DATA SCIENCE/INTERNSHIP PROJECT/Modelling/UnHealthySamples.xlsx")
unhealthy = unhealthy[unhealthy["Sample status"] == "Perfect"]
unhealthy["Swallow_type"] = "unhealthy"
data = healthy.append(unhealthy)[["Patient", "Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_start", "UES_end", "UES_Duration", "Swallow_type"]]
data["Label"] = np.where(data["Swallow_type"] == "healthy", 0, 1)
data

Unnamed: 0,Patient,Sensor1_max,Sensor2_max,Sensor3_min,Latency,UES_start,UES_end,UES_Duration,Swallow_type,Label
0,Control 1.txt,192.279698,113.474479,-20.601968,-0.028,1552.752,1553.392,0.640,healthy,0
1,Control 1.txt,167.821012,98.336004,-11.776150,-0.028,1782.184,1782.896,0.712,healthy,0
2,Control 1.txt,171.739528,249.844358,-24.181735,-0.036,611.556,612.284,0.728,healthy,0
3,Control 1.txt,200.377661,107.578393,-16.358434,-0.092,2333.764,2334.412,0.648,healthy,0
4,Control 1.txt,193.675898,105.770199,-25.335317,-0.172,1697.164,1697.784,0.620,healthy,0
...,...,...,...,...,...,...,...,...,...,...
194,HD11A.txt,114.937700,101.913100,-27.384550,0.211,117.157,117.828,0.671,unhealthy,1
195,HD11A.txt,195.568900,97.228870,-14.957330,0.200,163.116,164.111,0.995,unhealthy,1
196,HD11A.txt,200.185000,116.615400,-9.014693,-0.042,124.009,124.538,0.529,unhealthy,1
197,HD11B.txt,209.113400,79.385630,-12.359970,0.192,61.255,61.856,0.601,unhealthy,1


### Correlation matrix of Independent variables

In [3]:
corr = data[["Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_Duration"]].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Sensor1_max,Sensor2_max,Sensor3_min,Latency,UES_Duration
Sensor1_max,1.0,-0.0346022,-0.038871,0.149575,0.0296155
Sensor2_max,-0.0346022,1.0,-0.124292,0.0283056,0.056447
Sensor3_min,-0.038871,-0.124292,1.0,-0.0584012,0.063469
Latency,0.149575,0.0283056,-0.0584012,1.0,0.316552
UES_Duration,0.0296155,0.056447,0.063469,0.316552,1.0


### Training and Test data split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data[["Sensor1_max", "Sensor2_max", "Sensor3_min", "Latency", "UES_Duration"]], data[["Label"]], test_size=0.3, random_state=0)

### Composition of Training and test data

In [5]:
y_train.apply(pd.Series.value_counts)

Unnamed: 0,Label
0,86
1,84


In [6]:
y_test.apply(pd.Series.value_counts)

Unnamed: 0,Label
0,38
1,36


## Fitting a Support Vector Classifier Model

In [7]:
model = SVC()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

### Comparing Predicted and Actual values

In [8]:
y_pred = model.predict(X_test)

In [9]:
pred = pd.DataFrame(y_pred)
pred["Predicted"] = np.where(pred[0] == 0, "Healthy", "Unhealthy")
pred["Actual"] = np.where(y_test["Label"] == 0, "Healthy", "Unhealthy")
pred[["Predicted", "Actual"]]

Unnamed: 0,Predicted,Actual
0,Healthy,Healthy
1,Healthy,Healthy
2,Healthy,Healthy
3,Healthy,Healthy
4,Healthy,Unhealthy
...,...,...
69,Healthy,Healthy
70,Healthy,Healthy
71,Healthy,Healthy
72,Healthy,Healthy


### Confusion Matrix

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[36  2]
 [32  4]]


<Figure size 504x504 with 0 Axes>

### Model Evaluation metrics

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.95      0.68        38
           1       0.67      0.11      0.19        36

    accuracy                           0.54        74
   macro avg       0.60      0.53      0.43        74
weighted avg       0.60      0.54      0.44        74



# Hyperparameter tuning in Support Vector Classifier

## Evaluating best performing parameters

In [12]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001], 
              'kernel': ['rbf'],
              "max_iter": [100,200,300,400,500,600,700,800,900],
             "probability": [True]}

In [13]:
random_search = RandomizedSearchCV(model, param_distributions=param_grid, cv=6, scoring="roc_auc")

In [14]:
random_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

RandomizedSearchCV(cv=6, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'C': [0.1, 1, 10, 100, 1000],
                                        'gamma': [1, 0.1, 0.01, 0.001, 0.0001,
                                                  1e-05],
                                        'kernel': ['rbf'],
                                        'max_iter': [100, 200, 300, 400, 500,
                                                     600, 700, 800, 900],
                                        'pro

In [15]:
random_search.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=800, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Fitting a Support Vector Classifier with best performing parameters

In [16]:
model = random_search.best_estimator_
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=800, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
model.predict_proba(X_test)[:,1]

array([0.5       , 0.52806599, 0.41119699, 0.54309766, 0.51787601,
       0.42845156, 0.51569969, 0.42119728, 0.52576994, 0.45463314,
       0.51330946, 0.41250566, 0.58407654, 0.57021979, 0.55702347,
       0.41960231, 0.55445826, 0.4524491 , 0.51527214, 0.55523504,
       0.43610208, 0.47004652, 0.43454088, 0.43662036, 0.52626478,
       0.5317688 , 0.48560431, 0.60314272, 0.4243391 , 0.46833316,
       0.42510392, 0.54440968, 0.53120594, 0.42759103, 0.48015601,
       0.53127026, 0.57737562, 0.60513898, 0.41744961, 0.6362319 ,
       0.60913195, 0.40871829, 0.55562249, 0.5350234 , 0.41682982,
       0.4484958 , 0.41589662, 0.46208818, 0.46666131, 0.60270843,
       0.43128935, 0.43340216, 0.43543852, 0.56028843, 0.44216186,
       0.41519047, 0.44223043, 0.60503228, 0.43343557, 0.53169763,
       0.4133034 , 0.41273976, 0.4353679 , 0.44719327, 0.55129327,
       0.47934126, 0.53553193, 0.59049272, 0.5       , 0.43493629,
       0.4512948 , 0.48359589, 0.63430685, 0.51368788])

### Comparing Predicted and Actual values

In [18]:
y_pred = model.predict(X_test)

In [19]:
pred = pd.DataFrame(y_pred)
pred["Predicted"] = np.where(pred[0] == 0, "Healthy", "Unhealthy")
pred["Actual"] = np.where(y_test["Label"] == 0, "Healthy", "Unhealthy")
pred[["Predicted", "Actual"]]

Unnamed: 0,Predicted,Actual
0,Healthy,Healthy
1,Unhealthy,Healthy
2,Healthy,Healthy
3,Unhealthy,Healthy
4,Unhealthy,Unhealthy
...,...,...
69,Healthy,Healthy
70,Healthy,Healthy
71,Healthy,Healthy
72,Unhealthy,Healthy


## Recalculating Model Evaluation metrics for Model fitted with best performing parameters

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.74      0.71        38
           1       0.70      0.64      0.67        36

    accuracy                           0.69        74
   macro avg       0.69      0.69      0.69        74
weighted avg       0.69      0.69      0.69        74



In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[28 10]
 [13 23]]


<Figure size 504x504 with 0 Axes>

## Changing Probability threshold to achieve 100% detection of Unhealthy patients

In [31]:
THRESHOLD = 0.41
y_pred = np.where(model.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

## Reevaluating Model Evaluation metrics with new threshold

In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.03      0.05        38
           1       0.49      1.00      0.66        36

    accuracy                           0.50        74
   macro avg       0.75      0.51      0.36        74
weighted avg       0.75      0.50      0.35        74



In [33]:
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
plt.figure(figsize = (7,7))
sns.set(font_scale=1.5)

[[ 1 37]
 [ 0 36]]


<Figure size 504x504 with 0 Axes>