In [None]:
import numpy as np # linear algebra
np.random.seed(0)
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from matplotlib import pyplot as plt
from sklearn import tree
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input CSV paths. Order matters: the rest of the notebook accesses these
# entries by position.
list_fn = [
    '/kaggle/input/end-als/end-als/transcriptomics-data/DESeq2/bulbar_vs_limb.csv',
    '/kaggle/input/end-als/end-als/transcriptomics-data/DESeq2/median_low_vs_high.csv',
    '/kaggle/input/end-als/end-als/transcriptomics-data/DESeq2/ctrl_vs_case.csv',
    '/kaggle/input/end-als/end-als/genomics-data/geno_bin.csv',
]


def rename_lambda(x):
    """Return ``x`` with its first five characters removed."""
    return x[5:]

# Load the control-vs-case table. Column 1 holds the label and columns 2+ hold
# the features; column 0 is not used in this section.
ctrl_vs_case = pd.read_csv(list_fn[2])
x_cols = ctrl_vs_case.columns[2:]
y_cols = ctrl_vs_case.columns[1:2]
data   = ctrl_vs_case[x_cols].to_numpy()
labels = ctrl_vs_case[y_cols].to_numpy().ravel()

# Drop feature columns that are zero for every sample, then scale each sample
# (row) to unit norm. Both steps are label-free, so doing them before the
# split does not leak label information.
data = data[:, data.any(axis=0)]
data = normalize(data, axis=1)

# Sanity-check the matrix *before* any estimator is fitted on it.
print(labels.shape, data.shape)
if np.any(np.isnan(data)) or np.any(np.isinf(data)):
    print('error')

train_size = 0.8

# Random train/validation split driven by the global NumPy seed set at the top.
indices = np.random.permutation(labels.size)
n_train = int(labels.size * train_size)
train = indices[:n_train]
valid = indices[n_train:]

X = data[train]
Y = labels[train]
Tx = data[valid]
Ty = labels[valid]

# Fit the chi2 feature selector on the TRAINING rows only. Fitting it on the
# full matrix (as was done previously) lets validation labels influence which
# features are kept, which inflates every validation score computed later.
sp = SelectPercentile(chi2, percentile=5).fit(X, Y)

# Read every table except the last path, then left-join the first two columns
# of tables 0 and 2 onto table 1, keyed on each joined table's first column.
loaded = [pd.read_csv(path) for path in list_fn[:-1]]
df = loaded[1]
datasets = [loaded[0], loaded[2]]
for dataset in datasets:
    key_col, value_col = dataset.columns[0], dataset.columns[1]
    df = df.merge(dataset[[key_col, value_col]], on=key_col, how='left')



import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_recall_curve

# TODO: report sensitivity and specificity, and check that the model scores similarly on both

def class_one_acc(labels):
    """Return the mean of ``labels``.

    For 0/1 labels this is the share of class-one samples, i.e. the accuracy
    of always predicting class one.
    """
    positives = sum(labels)
    total = len(labels)
    return positives / total

def acc(true, preds):
    """Return the accuracy of ``preds`` against ``true`` via ``accuracy_score``."""
    score = accuracy_score(true, preds)
    return score

def roc_auc(true, conf_scores, verbose=False):
    """Compute the area under the ROC curve.

    Returns the area alone, or ``(fpr, tpr, area)`` when ``verbose`` is truthy.
    """
    fpr, tpr, _ = roc_curve(true, conf_scores)
    area = auc(fpr, tpr)
    if verbose:
        return (fpr, tpr, area)
    return area

def prc_auc(true, conf_scores, verbose=False):
    """Compute the area under the precision-recall curve.

    Returns the area alone, or ``(recall, precision, area)`` when ``verbose``
    is truthy.
    """
    precision, recall, _ = precision_recall_curve(true, conf_scores)
    area = auc(recall, precision)
    if verbose:
        return (recall, precision, area)
    return area

def plot_auc(true, conf_scores, mode='roc', lw=2):
    """Plot a ROC or precision-recall curve with its area in the legend.

    Parameters
    ----------
    true : true labels, passed through to the metric helper.
    conf_scores : scores for the samples, passed through to the metric helper.
    mode : ``'roc'`` for a ROC curve or ``'prc'`` for a precision-recall curve.
        Any other value makes the function return without plotting.
    lw : line width used for both the curve and the dashed reference line.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    if mode == 'roc':
        metric = roc_auc
        curve_name = 'ROC'
        xlabel = 'False Positive Rate'
        ylabel = 'True Positive Rate'
        title = 'ROC AUC'
        # Diagonal reference line from (0, 0) to (1, 1).
        p1 = [0, 1]
        p2 = [0, 1]
    elif mode == 'prc':
        metric = prc_auc
        curve_name = 'PRC'
        xlabel = 'Recall'
        ylabel = 'Precision'
        title = 'PRC AUC'
        # Horizontal reference line at the mean of the true labels.
        baseline = class_one_acc(true)
        p1 = [0, 1]
        p2 = [baseline, baseline]
    else:
        # Unknown mode: keep the original behaviour of silently doing nothing.
        return None

    xs, ys, area = metric(true, conf_scores, verbose=True)

    plt.figure()
    # The legend names the curve that was actually drawn; it used to say
    # "ROC curve" even in 'prc' mode.
    plt.plot(xs, ys, color='red', lw=lw,
             label='%s curve (area = %0.4f)' % (curve_name, area))
    plt.plot(p1, p2, color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

# Embedding Model (unsupervised)

In [None]:
!pip install autokeras

In [None]:
import autokeras as ak
import tensorflow.keras as K

# Reduce the training rows to the selected features and work on copies so the
# oversampling below does not touch X / Y.
_X = sp.transform(X)
_Y = Y.copy()

# Oversample class 0 by appending `ratio` extra copies of its rows.
# NOTE(review): this brings class 0 up to roughly the TOTAL row count rather
# than the class-1 count -- confirm that this is the intended balance.
negs = _Y == 0
n_neg = int(negs.sum())
# Guard against a training split with no class-0 rows (previously a division
# by zero).
ratio = negs.size // n_neg - 1 if n_neg else 0
if ratio > 0:
    x_neg = np.concatenate([_X[negs]] * ratio)
    y_neg = np.concatenate([_Y[negs]] * ratio)
    _X = np.concatenate([_X, x_neg])
    _Y = np.concatenate([_Y, y_neg])

print(np.max(_X), np.min(_X))

# Architecture search on the oversampled training data. The search object gets
# its own name so it is not confused with the decision tree `clf` below.
ak_clf = ak.StructuredDataClassifier(overwrite=True, max_trials=50)
ak_clf.fit(_X, _Y, epochs=50)
model = ak_clf.export_model()
print('model created')
model.summary()  # summary() prints by itself; wrapping it in print() adds "None"
model.trainable = False

# Truncate the exported network at its third-from-last layer and use that
# layer's output as the embedding.
# NOTE(review): the index -3 depends on the architecture autokeras exports --
# check the summary above to confirm it is the intended layer.
output_model = K.Model(inputs=model.inputs, outputs=model.layers[-3].output)
output_model.summary()

# Embed the ORIGINAL (not oversampled) train and validation rows.
_Tx = output_model.predict(sp.transform(Tx))
_Ty = Ty.copy()
_X = output_model.predict(sp.transform(X))

clf = tree.DecisionTreeClassifier()
clf = clf.fit(_X, Y)
tree.plot_tree(clf, rounded=True)

print(clf.score(_Tx, Ty))
# Feed the ROC curve a class-1 score rather than hard predictions, which would
# collapse the curve to a single operating point.
# Assumes binary labels with class 1 as the positive class.
conf_score = clf.predict_proba(_Tx)[:, 1]
plot_auc(Ty, conf_score, mode='roc')

# Working Model

## Without embedding or dim reduction

In [None]:
from matplotlib import pyplot as plt
from sklearn import tree

# Baseline: a decision tree on the full feature matrix, with no feature
# selection and no embedding.
clf = tree.DecisionTreeClassifier().fit(X, Y)
tree.plot_tree(clf, rounded=True)
print(clf.score(Tx, Ty))

## With embedding and dim reduction

In [None]:
# Feature-select, then embed, the validation and training rows.
_Tx = output_model.predict(sp.transform(Tx))
_X = output_model.predict(sp.transform(X))

# Decision tree on the embedded features.
clf2 = tree.DecisionTreeClassifier().fit(_X, Y)
tree.plot_tree(clf2, rounded=True)

print(clf2.score(_Tx, Ty))


## Without embedding and With dim reduction 1

In [None]:
# Feature selection only, no embedding.
_X = sp.transform(X)
_Tx = sp.transform(Tx)

clf3 = tree.DecisionTreeClassifier().fit(_X, Y)
tree.plot_tree(clf3, rounded=True)

print(clf3.score(_Tx, Ty))

# With Kmeans

## with embedding and with dim reduction

In [None]:
from sklearn import cluster

# Feature-select, then embed, the validation and training rows.
_Tx = output_model.predict(sp.transform(Tx))
_X = output_model.predict(sp.transform(X))
print(_X.shape)

# Fit a 5-cluster KMeans on the training embedding and replace both splits
# with the output of its transform.
clus1 = cluster.KMeans(n_clusters=5)
_X = clus1.fit_transform(_X)
_Tx = clus1.transform(_Tx)
print(_X.shape)

clf6 = tree.DecisionTreeClassifier().fit(_X, Y)
tree.plot_tree(clf6, rounded=True)

print(clf6.score(_Tx, Ty))

In [None]:
# Score of clf6 on the rows it was trained on, for comparison with the
# validation score.
train_score = clf6.score(_X, Y)
print(train_score)

In [None]:
# Decision tree on the selected features, evaluated with accuracy and ROC.
_Tx = sp.transform(Tx)
_X = sp.transform(X)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(_X, Y)
tree.plot_tree(clf, rounded=True)
print(clf.score(_Tx, Ty))
# Use the predicted probability of class 1 as the score: a ROC curve built
# from hard 0/1 predictions has only one operating point.
# Assumes binary labels with class 1 as the positive class.
conf_score = clf.predict_proba(_Tx)[:, 1]
plot_auc(Ty, conf_score, mode='roc')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Two-tree random forest on the selected features.
_Tx = sp.transform(Tx)
_X = sp.transform(X)
rf = RandomForestClassifier(n_estimators=2)
rf.fit(_X, Y)
# Only the first tree of the forest is drawn.
tree.plot_tree(rf.estimators_[0], rounded=True)
print(rf.score(_Tx, Ty))
# Use the predicted probability of class 1 as the ROC score; hard predictions
# would throw away the forest's intermediate vote fractions.
# Assumes binary labels with class 1 as the positive class.
conf_score = rf.predict_proba(_Tx)[:, 1]
plot_auc(Ty, conf_score, mode='roc')