In [1]:
import numpy as np
import itertools
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import rpy2.robjects as robjects
import os

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

import matplotlib.pyplot as plt

from functions import *

np.set_printoptions(precision=3)

# Seed every random number generator the notebook draws from, so that
# Restart Kernel -> Run All gives the same numbers each time.
# NumPy's global generator is seeded here as before; PyTorch has its own
# generator, which is what initialises the weights of the nn.Linear layers
# built further down, so it needs a seed of its own.
np.random.seed(42)
torch.manual_seed(42)

%load_ext autoreload
%autoreload 2

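We will try the [fDNN](https://www.nature.com/articles/s41598-018-34833-6) model: a random forest is fitted first, and a small neural network is then trained on the per-tree predictions of that forest.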

In [2]:
# Load the full label vector and expression matrix. `load_file` is a project
# helper (from `functions`); its parsing rules are not visible here.
labels = load_file('data/labels_for_microarray_data.csv')
labels = np.array(labels, dtype=np.int32)
# Drop the first column and transpose; per the recorded output this yields
# 969 rows x 12179 columns, matching the 969 labels.
# NOTE(review): presumably the dropped column is an identifier - confirm.
dataset = load_file('data/microarray_data.csv')[:, 1:].T
print(labels.shape, dataset.shape)
# The train/test split comes from a separate helper that takes no arguments;
# it does not use `labels` / `dataset` from above.
X_train, X_test, y_train, y_test = load_train_and_test_parts()
print("Train and test sizes: {} {}".format(X_train.shape, X_test.shape))
# Class balance: (count of 1s, count of 0s) for train, then for test.
print("(1, 0) labels count in train test: {} {}".format((np.count_nonzero(y_train==1), np.count_nonzero(y_train==0)), 
                                                        (np.count_nonzero(y_test==1), np.count_nonzero(y_test==0))))
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
standarizer = StandardScaler().fit(X_train)
X_std_train = standarizer.transform(X_train)
X_std_test = standarizer.transform(X_test)

(969,) (969, 12179)
Train and test sizes: (726, 12179) (243, 12179)
(1, 0) labels count in train test: (289, 437) (104, 139)


In [3]:
# Random forest of 50 shallow trees (depth <= 4, at least 10 samples per leaf)
# on the unscaled features. `fit_clf` is a project helper; judging by the
# recorded output it prints train and test scores, and its return value is
# used below as the fitted forest - TODO confirm against `functions`.
clf_forest = fit_clf(RandomForestClassifier(max_depth=4, n_estimators=50, min_samples_leaf=10), X_train, y_train, X_test, y_test)

Train and test scores: 0.8732782369146006 0.691358024691358


In [4]:
def tree_representation(clf, data):
    """Encode each sample by the individual predictions of an ensemble's members.

    Parameters
    ----------
    clf : object exposing ``estimators_``, an iterable of fitted models that
        each provide ``predict(data)``.
    data : 2-D array; ``data.shape[0]`` is the number of samples.

    Returns
    -------
    numpy.ndarray of dtype int32 with shape ``(n_samples, n_estimators)``;
    column ``j`` holds the predictions of the ``j``-th member, cast to int32.
    """
    members = clf.estimators_
    votes = np.zeros((data.shape[0], len(members)), dtype=np.int32)
    # Rows of the transposed view are the columns of `votes`, so writing
    # through them fills the output matrix in place.
    for column, member in zip(votes.T, members):
        column[:] = member.predict(data)
    return votes

In [5]:
# Re-encode both splits as per-tree predictions of the fitted forest:
# one column per tree in `clf_forest.estimators_`.
X_tree_train = tree_representation(clf_forest, X_train)
X_tree_test = tree_representation(clf_forest, X_test)

In [None]:
# Baseline on the tree features via the project helper `fit_models`.
# NOTE(review): the helper's behaviour is not visible here; the result name
# suggests a logistic-regression model - confirm. This cell has no recorded
# execution (In [None]), so nothing below should rely on `clf_logit`.
clf_logit = fit_models(X_tree_train, y_train, X_tree_test, y_test)

In [16]:
class ExpressionDataset(Dataset):
    """Map-style dataset pairing each row of a 2-D array with its label.

    Parameters
    ----------
    data : array indexed as ``data[idx, :]`` and exposing ``shape[0]``.
    labels : sequence indexed as ``labels[idx]``.
    transform : callable, optional
        Applied to each row before it is returned. When omitted (``None``)
        the row is returned unchanged.
    """

    def __init__(self, data, labels, transform=None):
        self.data = data
        self.transform = transform
        self.labels = labels

    def __len__(self):
        """Number of samples, i.e. the number of rows in ``data``."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for row ``idx``."""
        sample = self.data[idx, :]
        # Explicit None check: the transform is optional, and a callable
        # should never be skipped because it happens to be falsy.
        if self.transform is not None:
            sample = self.transform(sample)
        return sample, self.labels[idx]
    
# Wrap the per-tree prediction matrices as datasets; every row is converted
# from a NumPy array to a tensor when it is accessed.
tree_train_dataset = ExpressionDataset(X_tree_train, y_train, torch.from_numpy)
tree_test_dataset = ExpressionDataset(X_tree_test, y_test, torch.from_numpy)

# Mini-batches of 16 samples, served in dataset order (no shuffling requested).
tree_train_loader = DataLoader(dataset=tree_train_dataset, batch_size=16)
tree_test_loader = DataLoader(dataset=tree_test_dataset, batch_size=16)

In [49]:
class SimpleNet(nn.Module):
    """Fully connected binary classifier with two hidden layers.

    ``size`` is indexed as ``size[0]`` (input width), ``size[1]`` and
    ``size[2]`` (widths of the two hidden layers). The final layer has a
    single output unit.
    """

    def __init__(self, size):
        super().__init__()
        n_inputs, n_hidden_first, n_hidden_second = size[0], size[1], size[2]
        self.fc1 = nn.Linear(n_inputs, n_hidden_first)
        self.fc2 = nn.Linear(n_hidden_first, n_hidden_second)
        self.fc3 = nn.Linear(n_hidden_second, 1)

    def forward(self, x):
        """Return one sigmoid output per input row, flattened to 1-D."""
        hidden = x
        # Both hidden layers are followed by a ReLU.
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        # Squash to (0, 1) and drop the trailing unit dimension.
        return torch.sigmoid(logits).view(-1)


# Input width is read from the first training sample (one feature per tree);
# the two hidden layers have 50 and 10 units.
net = SimpleNet([tree_train_dataset[0][0].shape[0], 50, 10])

In [50]:
# Sanity check: feature width of a single training sample
# (one entry per tree in the forest).
tree_train_dataset[0][0].shape

torch.Size([50])

In [51]:
# Learning rate for Adam.
lr = 0.01
# Binary cross-entropy on probabilities; SimpleNet.forward already applies
# the sigmoid, so the plain BCELoss (not the with-logits variant) is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=lr)

In [52]:
# One pass over the training batches, with one Adam step per batch.
#
# The loop variables are deliberately not named `labels`: that name holds the
# full label array loaded at the top of the notebook, and rebinding it to the
# last mini-batch would silently change what later (or re-run) cells see.
for batch_number, (batch_inputs, batch_labels) in enumerate(tree_train_loader, start=1):

    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()

    # The tree features are int32; cast inputs and targets to float for the
    # linear layers and for BCELoss.
    batch_probs = net(batch_inputs.float())
    loss = criterion(batch_probs, batch_labels.float())
    loss.backward()
    optimizer.step()

    # Report the loss of this batch alone (nothing is accumulated).
    print('Batch %d loss: %.3f' % (batch_number, loss.item()))

Batch 1 loss: 0.696
Batch 2 loss: 0.674
Batch 3 loss: 0.675
Batch 4 loss: 0.653
Batch 5 loss: 0.664
Batch 6 loss: 0.557
Batch 7 loss: 0.569
Batch 8 loss: 0.668
Batch 9 loss: 0.644
Batch 10 loss: 0.401
Batch 11 loss: 0.680
Batch 12 loss: 0.680
Batch 13 loss: 0.566
Batch 14 loss: 0.618
Batch 15 loss: 0.508
Batch 16 loss: 0.565
Batch 17 loss: 0.508
Batch 18 loss: 0.469
Batch 19 loss: 0.458
Batch 20 loss: 0.442
Batch 21 loss: 0.775
Batch 22 loss: 0.516
Batch 23 loss: 0.377
Batch 24 loss: 0.306
Batch 25 loss: 0.355
Batch 26 loss: 0.521
Batch 27 loss: 0.361
Batch 28 loss: 0.377
Batch 29 loss: 0.383
Batch 30 loss: 0.478
Batch 31 loss: 0.211
Batch 32 loss: 0.300
Batch 33 loss: 0.379
Batch 34 loss: 0.239
Batch 35 loss: 0.330
Batch 36 loss: 0.188
Batch 37 loss: 0.449
Batch 38 loss: 0.239
Batch 39 loss: 0.344
Batch 40 loss: 0.130
Batch 41 loss: 0.499
Batch 42 loss: 0.222
Batch 43 loss: 0.262
Batch 44 loss: 0.463
Batch 45 loss: 0.136
Batch 46 loss: 0.653


In [53]:
# Accuracy on the held-out split.
# Batch variables avoid the names `labels` / `images`: `labels` is the full
# label array loaded at the top of the notebook, and the inputs here are
# per-tree prediction vectors, not images.
correct = 0
total = 0
with torch.no_grad():  # inference only, no gradient bookkeeping
    for batch_inputs, batch_labels in tree_test_loader:
        batch_probs = net(batch_inputs.float())
        # Round each probability to the nearest class (0 or 1).
        predicted = torch.round(batch_probs)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels.float()).sum().item()

# %d truncates the percentage towards zero.
print('Accuracy of the network on the test set: %d %%' % (
    100 * correct / total))

Accuracy of the network on the test images: 70 %


In [54]:
# Accuracy on the training split, for comparison with the held-out figure.
# Batch variables avoid the names `labels` / `images`: `labels` is the full
# label array loaded at the top of the notebook, and the inputs here are
# per-tree prediction vectors, not images.
correct = 0
total = 0
with torch.no_grad():  # inference only, no gradient bookkeeping
    for batch_inputs, batch_labels in tree_train_loader:
        batch_probs = net(batch_inputs.float())
        # Round each probability to the nearest class (0 or 1).
        predicted = torch.round(batch_probs)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels.float()).sum().item()

# This cell iterates the training loader, so the message says so.
# %d truncates the percentage towards zero.
print('Accuracy of the network on the training set: %d %%' % (
    100 * correct / total))

Accuracy of the network on the test images: 90 %
