In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading data:

In [6]:
# Feature table: read the CSV and transpose it, so that what were columns in
# the file become rows of `data` (the file's header names end up in the index).
data = pd.read_csv('data/all_data.csv').transpose()
# Label table: read as-is.
labels = pd.read_csv('data/labels.csv')

## Preprocessing indexes:

In [7]:
# --- labels -----------------------------------------------------------------
# Drop the `type_sample` column, renumber the rows from 0, name the first
# remaining column "Participant" and store the IBD column as booleans.
labels = labels.drop('type_sample', axis=1).reset_index(drop=True)
labels = labels.rename(columns={labels.columns[0]: "Participant"})
labels['IBD'] = labels['IBD'].astype('bool')

# --- data -------------------------------------------------------------------
# After the transpose done at load time the CSV's header names sit in the
# index; reset_index() moves them into the first column.
data = data.reset_index()
# The first row now carries the values that should serve as column names:
# remember it, remove it from the body, then install it as the header.
header_row = data.iloc[0]
data = data.iloc[1:].reset_index(drop=True)
data.columns = header_row
data = data.rename(columns={data.columns[0]: "Participant"})

## Train Val Test split:

In [8]:
def split_data(data, seed = 43, train_val_test_split = (80,10,10)):
    """Shuffle row positions and cut them into train / validation / test parts.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame)
        Rows are selected by position, so two inputs of equal length split
        with the same ``seed`` receive the same row positions in each part.
    seed : int
        Passed to ``np.random.seed``; note this reseeds NumPy's *global*
        random generator as a side effect.
    train_val_test_split : sequence of numbers
        Percentages for the three parts. Only the first two entries are read:
        the test part is whatever remains after train and validation.

    Returns
    -------
    tuple
        ``(train, val, test)``, each obtained with ``data.iloc[...]``.
    """
    n_rows = len(data)
    order = np.arange(n_rows)
    np.random.seed(seed)
    np.random.shuffle(order)

    # Integer cut points (floor division), so any leftover rows land in test.
    train_end = train_val_test_split[0] * n_rows // 100
    val_end = train_end + train_val_test_split[1] * n_rows // 100

    return (
        data.iloc[order[:train_end]],
        data.iloc[order[train_end:val_end]],
        data.iloc[order[val_end:]],
    )

In [9]:
# Features and labels are split by two independent calls. Both use the default
# seed, so they receive the same shuffled row positions only when the two
# frames have the same number of rows; rows are paired purely by position.
# Fail early if that precondition does not hold.
assert len(data) == len(labels), (
    "data and labels must have the same number of rows, got {} and {}".format(
        len(data), len(labels)
    )
)

train_data, val_data, test_data = split_data(data)
train_labels, val_labels, test_labels = split_data(labels)

## Scaling data:

In [19]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only (every column
# except the leading "Participant" one); fit() returns the scaler itself.
scaler = StandardScaler().fit(train_data.iloc[:, 1:])

# Apply the training-set scaling to the train and test features.
scaled_train_data, scaled_test_data = (
    scaler.transform(frame.iloc[:, 1:]) for frame in (train_data, test_data)
)

In [96]:
from sklearn.decomposition import PCA, TruncatedSVD

# Both reducers keep 10 components.
# random_state is pinned so the decompositions do not depend on the global
# NumPy RNG state (which split_data() reseeds as a side effect) or on the
# order in which cells happen to be executed. TruncatedSVD's default
# algorithm is randomized; PCA only consumes randomness when a randomized
# solver is selected.
pca = PCA(n_components=10, random_state=43)
svd = TruncatedSVD(n_components=10, random_state=43)

In [93]:
# Fit each reducer on the training features (all columns but the first) and
# project the test features with the fitted model. Both fits happen before
# either test projection, PCA first and then SVD.
# NOTE(review): the standardized arrays produced in the scaling cell are not
# used here; the reducers see the unscaled features. Confirm this is intended.
pca_train_data, svd_train_data = (
    pca.fit_transform(train_data.iloc[:, 1:]),
    svd.fit_transform(train_data.iloc[:, 1:]),
)

pca_test_data, svd_test_data = (
    pca.transform(test_data.iloc[:, 1:]),
    svd.transform(test_data.iloc[:, 1:]),
)

In [91]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a 10-tree forest with the entropy criterion, trained on the
# raw training features (all columns but the first) against the IBD label.
# fit() returns the estimator, so construction and training are chained.
rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=43,
).fit(train_data.iloc[:, 1:], train_labels['IBD'])

# Hard class predictions for the held-out test rows.
test_pred = rfc.predict(test_data.iloc[:, 1:])

In [92]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Evaluate the forest trained on the raw features against the test labels.
# NOTE(review): test_pred holds hard class labels from rfc.predict(), so the
# ROC value printed here is computed from class decisions rather than from
# predicted probabilities; consider scoring rfc.predict_proba(...)[:, 1].
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], test_pred)))
print("\nClassification Report:",
      classification_report(test_labels['IBD'], test_pred),
      sep="\n")
print("\nConfusion Matrix:",
      confusion_matrix(test_labels['IBD'], test_pred),
      sep="\n")

ROC score = 0.6491525423728814

Classification Report:
             precision    recall  f1-score   support

      False       0.67      0.40      0.50        30
       True       0.75      0.90      0.82        59

avg / total       0.72      0.73      0.71        89


Confusion Matrix:
[[12 18]
 [ 6 53]]


In [94]:
# Refit the same forest object on the PCA-reduced training data (this replaces
# the previously fitted model held in `rfc`) and predict the reduced test set.
pca_test_pred = rfc.fit(pca_train_data, train_labels['IBD']).predict(pca_test_data)

# NOTE(review): the ROC value is computed from hard class predictions, not
# from predicted probabilities.
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], pca_test_pred)))
print("\nClassification Report:",
      classification_report(test_labels['IBD'], pca_test_pred),
      sep="\n")
print("\nConfusion Matrix:",
      confusion_matrix(test_labels['IBD'], pca_test_pred),
      sep="\n")

ROC score = 0.5994350282485876

Classification Report:
             precision    recall  f1-score   support

      False       0.67      0.27      0.38        30
       True       0.71      0.93      0.81        59

avg / total       0.70      0.71      0.66        89


Confusion Matrix:
[[ 8 22]
 [ 4 55]]


In [95]:
# Refit the same forest object on the SVD-reduced training data (this replaces
# the previously fitted model held in `rfc`) and predict the reduced test set.
svd_test_pred = rfc.fit(svd_train_data, train_labels['IBD']).predict(svd_test_data)

# NOTE(review): the ROC value is computed from hard class predictions, not
# from predicted probabilities.
print("ROC score = {}".format(roc_auc_score(test_labels['IBD'], svd_test_pred)))
print("\nClassification Report:",
      classification_report(test_labels['IBD'], svd_test_pred),
      sep="\n")
print("\nConfusion Matrix:",
      confusion_matrix(test_labels['IBD'], svd_test_pred),
      sep="\n")

ROC score = 0.7242937853107344

Classification Report:
             precision    recall  f1-score   support

      False       0.76      0.53      0.63        30
       True       0.79      0.92      0.85        59

avg / total       0.78      0.79      0.78        89


Confusion Matrix:
[[16 14]
 [ 5 54]]
