In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading data:

In [2]:
data = pd.read_csv('data/all_data.csv').T
labels = pd.read_csv('data/labels.csv')

## Preprocessing indexes:

In [3]:
labels.drop('type_sample', axis=1, inplace=True)
labels = labels.reset_index(drop=True)
labels.rename(columns={ labels.columns[0]: "Participant" }, inplace=True)
labels.IBD = labels.IBD.astype('bool')

data = data.reset_index()
data.columns = data.iloc[0]
data = data.iloc[1:]
data = data.reset_index(drop=True)
data.rename(columns={ data.columns[0]: "Participant" }, inplace=True)

## Train Val Test split:

In [4]:
def split_data(data, seed = 43, train_val_test_split = (80,10,10)):
    indices = np.arange(len(data))
    np.random.seed(seed)
    np.random.shuffle(indices)
    train_indices = indices[:train_val_test_split[0]*len(indices)//100]
    val_indices = indices[train_val_test_split[0]*len(indices)//100:train_val_test_split[0]*len(indices)//100\
                          + train_val_test_split[1]*len(indices)//100]
    test_indices = indices[train_val_test_split[0]*len(indices)//100+ train_val_test_split[1]*len(indices)//100:]
    
    return data.iloc[train_indices], data.iloc[val_indices], data.iloc[test_indices]

In [5]:
train_data, val_data, test_data = split_data(data)
train_labels, val_labels, test_labels = split_data(labels)

## Linear SVM 

In [6]:
from sklearn.svm import SVC

svclassifier = SVC(kernel='linear')
svclassifier.fit(train_data.iloc[:, 1:], train_labels['IBD'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
val_pred = svclassifier.predict(val_data.iloc[:, 1:])

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("ROC score = {}".format(roc_auc_score(val_labels['IBD'], val_pred)))
print("\nClassification Report:") 
print(classification_report(val_labels['IBD'], val_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(val_labels['IBD'], val_pred))

ROC score = 0.6593406593406593

Classification Report:
             precision    recall  f1-score   support

      False       0.57      0.46      0.51        26
       True       0.79      0.86      0.82        63

avg / total       0.73      0.74      0.73        89


Confusion Matrix:
[[12 14]
 [ 9 54]]


## Polynomial SVM, Degree 2

In [9]:
sv2classifier = SVC(kernel='poly', degree=2)
sv2classifier.fit(train_data.iloc[:, 1:], train_labels['IBD'])
train2_pred = sv2classifier.predict(train_data.iloc[:, 1:])
print("ROC score = {}".format(roc_auc_score(train_labels['IBD'], train2_pred)))
print("\nClassification Report:") 
print(classification_report(train_labels['IBD'], train2_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(train_labels['IBD'], train2_pred))

ROC score = 1.0

Classification Report:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00       187
       True       1.00      1.00      1.00       525

avg / total       1.00      1.00      1.00       712


Confusion Matrix:
[[187   0]
 [  0 525]]


In [10]:
val2_pred = sv2classifier.predict(val_data.iloc[:, 1:])
print("ROC score = {}".format(roc_auc_score(val_labels['IBD'], val2_pred)))
print("\nClassification Report:") 
print(classification_report(val_labels['IBD'], val2_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(val_labels['IBD'], val2_pred))

ROC score = 0.6944444444444444

Classification Report:
             precision    recall  f1-score   support

      False       0.65      0.50      0.57        26
       True       0.81      0.89      0.85        63

avg / total       0.76      0.78      0.77        89


Confusion Matrix:
[[13 13]
 [ 7 56]]


In [11]:
test2_pred = sv2classifier.predict(test_data.iloc[:, 1:])
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], test2_pred)))
print("\nClassification Report:") 
print(classification_report(test_labels['IBD'], test2_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels['IBD'], test2_pred))

ROC score = 0.7991525423728814

Classification Report:
             precision    recall  f1-score   support

      False       0.78      0.70      0.74        30
       True       0.85      0.90      0.88        59

avg / total       0.83      0.83      0.83        89


Confusion Matrix:
[[21  9]
 [ 6 53]]


## PCA and SVD

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(train_data.iloc[:, 1:])

scaled_train_data = scaler.transform(train_data.iloc[:, 1:])
scaled_test_data = scaler.transform(test_data.iloc[:, 1:])

from sklearn.decomposition import PCA, TruncatedSVD
pca = PCA(10)
svd = TruncatedSVD(10)

pca_train_data = pca.fit_transform(train_data.iloc[:, 1:])
pca_val_data = pca.transform(val_data.iloc[:, 1:])
pca_test_data = pca.transform(test_data.iloc[:, 1:])

svd_train_data = svd.fit_transform(train_data.iloc[:, 1:])
svd_val_data = svd.transform(val_data.iloc[:, 1:])
svd_test_data = svd.transform(test_data.iloc[:, 1:])

sv2classifier.fit(pca_train_data, train_labels['IBD'])
train2_pred_pca = sv2classifier.predict(pca_train_data)
val2_pred_pca = sv2classifier.predict(pca_val_data)
test2_pred_pca = sv2classifier.predict(pca_test_data)

sv2classifier.fit(svd_train_data, train_labels['IBD'])
train2_pred_svd = sv2classifier.predict(svd_train_data)
val2_pred_svd = sv2classifier.predict(svd_val_data)
test2_pred_svd = sv2classifier.predict(svd_test_data)

## PCA Train:

In [None]:
print("ROC score = {}".format(roc_auc_score(train_labels['IBD'], train2_pred_pca)))
print("\nClassification Report:") 
print(classification_report(train_labels['IBD'], train2_pred_pca))
print("\nConfusion Matrix:")
print(confusion_matrix(train_labels['IBD'], train2_pred_pca))

## PCA Val:

In [None]:
print("ROC score = {}".format(roc_auc_score(val_labels['IBD'], val2_pred_pca)))
print("\nClassification Report:") 
print(classification_report(val_labels['IBD'], val2_pred_pca))
print("\nConfusion Matrix:")
print(confusion_matrix(val_labels['IBD'], val2_pred_pca))

## PCA Test:

In [None]:
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], test2_pred_pca)))
print("\nClassification Report:") 
print(classification_report(test_labels['IBD'], test2_pred_pca))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels['IBD'], test2_pred_pca))

## SVD Train:

In [None]:
print("ROC score = {}".format(roc_auc_score(train_labels['IBD'], train2_pred_svd)))
print("\nClassification Report:") 
print(classification_report(train_labels['IBD'], train2_pred_svd))
print("\nConfusion Matrix:")
print(confusion_matrix(train_labels['IBD'], train2_pred_svd))

## SVD Val:

In [None]:
print("ROC score = {}".format(roc_auc_score(val_labels['IBD'], val2_pred_svd)))
print("\nClassification Report:") 
print(classification_report(val_labels['IBD'], val2_pred_svd))
print("\nConfusion Matrix:")
print(confusion_matrix(val_labels['IBD'], val2_pred_svd))

## SVD Test:

In [None]:
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], test2_pred_svd)))
print("\nClassification Report:") 
print(classification_report(test_labels['IBD'], test2_pred_svd))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels['IBD'], test2_pred_svd))