In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading data:

In [2]:
data = pd.read_csv('data/all_data.csv').T
labels = pd.read_csv('data/labels.csv')

## Preprocessing indexes:

In [3]:
labels.drop('type_sample', axis=1, inplace=True)
labels = labels.reset_index(drop=True)
labels.rename(columns={ labels.columns[0]: "Participant" }, inplace=True)
labels.IBD = labels.IBD.astype('bool')

data = data.reset_index()
data.columns = data.iloc[0]
data = data.iloc[1:]
data = data.reset_index(drop=True)
data.rename(columns={ data.columns[0]: "Participant" }, inplace=True)

## Train Val Test split:

In [4]:
def split_data(data, seed = 43, train_val_test_split = (80,10,10)):
    indices = np.arange(len(data))
    np.random.seed(seed)
    np.random.shuffle(indices)
    train_indices = indices[:train_val_test_split[0]*len(indices)//100]
    val_indices = indices[train_val_test_split[0]*len(indices)//100:train_val_test_split[0]*len(indices)//100\
                          + train_val_test_split[1]*len(indices)//100]
    test_indices = indices[train_val_test_split[0]*len(indices)//100+ train_val_test_split[1]*len(indices)//100:]
    
    return data.iloc[train_indices], data.iloc[val_indices], data.iloc[test_indices]

In [5]:
train_data, val_data, test_data = split_data(data)
train_labels, val_labels, test_labels = split_data(labels)

## Linear SVM 

In [6]:
from sklearn.svm import SVC

svclassifier = SVC(kernel='linear')
svclassifier.fit(train_data.iloc[:, 1:], train_labels['IBD'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
test_pred = svclassifier.predict(test_data.iloc[:, 1:])

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], test_pred)))
print("\nClassification Report:") 
print(classification_report(test_labels['IBD'], test_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels['IBD'], test_pred))

ROC score = 0.698587570621469

Classification Report:
             precision    recall  f1-score   support

      False       0.63      0.57      0.60        30
       True       0.79      0.83      0.81        59

avg / total       0.74      0.74      0.74        89


Confusion Matrix:
[[17 13]
 [10 49]]


## Polynomial SVM, Degree 2

In [9]:
sv2classifier = SVC(kernel='poly', degree=2)
sv2classifier.fit(train_data.iloc[:, 1:], train_labels['IBD'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
test2_pred = sv2classifier.predict(test_data.iloc[:, 1:])

In [11]:
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], test2_pred)))
print("\nClassification Report:") 
print(classification_report(test_labels['IBD'], test2_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels['IBD'], test2_pred))

ROC score = 0.7991525423728814

Classification Report:
             precision    recall  f1-score   support

      False       0.78      0.70      0.74        30
       True       0.85      0.90      0.88        59

avg / total       0.83      0.83      0.83        89


Confusion Matrix:
[[21  9]
 [ 6 53]]
