In [2]:
import ast
import numpy as np
import time
import sys
import pandas as pd
import subprocess
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import learning_curve
import random as rand

In [3]:
# Loading kmer table
with open('data/km_table_0.py', 'rb') as f:
    kmer_table = pickle.load(f)

In [4]:
# Loading patient table
patient_table = pd.read_csv('data/leucegene.csv', index_col=0, skipfooter=1, engine='python')
patients = patient_table.index.values

In [5]:
# Extracting flt3 label
flt3 = [(-1 if i == '-' else int(i)) for i in patient_table['FLT3-ITD mutation']]

In [6]:
# Filtering out patients with no data
kmer_table_filtered = kmer_table.copy()
for row in flt3:
    if row == -1:
        kmer_table_filtered = kmer_table_filtered.drop(kmer_table_filtered.index[row])
patient_list_filtered = []
flt3_filtered = []
for i in range(len(flt3)):
    if flt3[i] != -1:
        patient_list_filtered.append(patients[i])
        flt3_filtered.append(flt3[i])

In [7]:
print(sum(np.array(flt3_filtered) == 1), sum(np.array(flt3_filtered) == 0))

132 320


In [8]:
# Splitting data
x_train, x_test, y_train, y_test = train_test_split(kmer_table_filtered, flt3_filtered, random_state=4)

In [9]:
# Data transform
def transform_log10(x):
    return np.log10(x+1)
x_train, x_test = transform_log10(x_train), transform_log10(x_test)

In [41]:
# Define classifier
classifier = MLPClassifier(hidden_layer_sizes=(1000,500,250,125), max_iter=1000, activation = 'relu', solver='adam', random_state=1, verbose=1)

In [42]:
# Selects batch from training set for half positive half negative
def get_batch(n):
    batch_x = []
    batch_y = []
    count = 0
    while len(batch_x) < n:
        i = rand.randint(0, len(y_train)-1)
        if y_train[i] == (count % 2):
            batch_x.append(x_train.iloc[i])
            batch_y.append(y_train[i])
            count += 1
    return pd.DataFrame(batch_x), batch_y

In [None]:
# Train model
for n in range(200):
    x_batch, y_batch = get_batch(64)
    classifier.partial_fit(x_batch, y_batch, classes=np.unique(y_train))

In [45]:
# Predict & accuracy
y_pred = classifier.predict(x_test)
y_pred_prob = classifier.predict_proba(x_test)
print(roc_auc_score(y_test, y_pred))

0.5


In [46]:
roc_auc_score([0, 0, 1, 1], [0, 1, 1, 0])

0.5

In [47]:
print(y_pred)
print(y_pred_prob)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]
[[0.45070529 0.54929471]
 [0.43834082 0.56165918]
 [0.44713394 0.55286606]
 [0.47998546 0.52001454]
 [0.42531012 0.57468988]
 [0.4338166  0.5661834 ]
 [0.46749426 0.53250574]
 [0.46856381 0.53143619]
 [0.45279323 0.54720677]
 [0.42846912 0.57153088]
 [0.43605798 0.56394202]
 [0.47617154 0.52382846]
 [0.44966188 0.55033812]
 [0.41554873 0.58445127]
 [0.46734989 0.53265011]
 [0.44460379 0.55539621]
 [0.4389744  0.5610256 ]
 [0.44641811 0.55358189]
 [0.43293963 0.56706037]
 [0.48440408 0.51559592]
 [0.42553988 0.57446012]
 [0.47814532 0.52185468]
 [0.427383   0.572617  ]
 [0.45010442 0.54989558]
 [0.45713546 0.54286454]
 [0.46297413 0.53702587]
 [0.45242103 0.54757897]
 [0.46999933 0.53000067]
 [0.43836378 0.56163622]
 [0.43714114 0.56285886]
 [0.47356155 0.5264

In [18]:
print(np.array(y_test))

[1 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0
 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0
 0 1]
