In [None]:
import sys
sys.path.append('/usr/local/lib/python3/dist-packages/')

In [None]:
import sys
import os
import lzma
import random
from collections import defaultdict
import math

In [None]:
# Make the current working directory and its parent importable.
# `os.getcwd()` replaces the `!pwd` shell escape: it yields the same directory
# without spawning a shell and also works where no `pwd` command exists.
# `path` stays list-shaped because the shell capture was consumed as `path[0]`.
path = [os.getcwd()]
notebook_dir = path[0]
sys.path.append(notebook_dir)
sys.path.append(os.path.abspath(notebook_dir + '/../'))

In [None]:
from typing import *

In [None]:
import numpy
import pandas

In [None]:
import xgboost as xgb

In [None]:
import sklearn

In [None]:
from catboost import Pool, CatBoostClassifier
import catboost

In [None]:
from data_util.util import *

In [None]:
# Directory handed to load_merged_dataset. The historical absolute path is
# kept as the default, but it can be overridden with the CANCER_DATA_DIR
# environment variable so the notebook runs on other machines without edits.
cancer_data_dir = os.environ.get('CANCER_DATA_DIR',
                                 '/home/noskill/projects/cancer.old/data')
# Later cells read the 'bmc', 'treatment', 'genes_features' and 'merged'
# entries of this mapping.
dataset_dict = load_merged_dataset(cancer_data_dir)

In [None]:
# The 'bmc' table ordered by patient_ID (the 'treatment' and 'genes_features'
# tables are sorted by the same key in their own cells).
bmc = dataset_dict['bmc'].sort_values(by='patient_ID')

# load detailed treatment

In [None]:
treatment = dataset_dict['treatment'].sort_values(by='patient_ID')

In [None]:
bmc.head()

# load gene expression data

In [None]:
genes_features = dataset_dict['genes_features']

In [None]:
genes_features.head(5)

In [None]:
genes_features = genes_features.sort_values(by='patient_ID')

# columns to use for training

In [None]:
# Columns of genes_features whose names appear in `pam50`, kept in the
# frame's own column order.
is_pam50_column = genes_features.columns.isin(pam50)
pam50col = genes_features.columns[is_pam50_column].to_list()

In [None]:
aggregated_treatment_columns = ['radio', 'surgery', 'chemo', 'hormone']

# Outcome columns that were considered; only 'posOutcome' is used as the
# target. (`label_columns` used to be assigned this full list and then
# overwritten on the next line, so the first assignment was a dead store.)
available_label_columns = ['pCR', 'RFS', 'DFS', 'posOutcome']
label_columns = ['posOutcome']

# Every genes_features column except the first.
# NOTE(review): presumably the first column is 'patient_ID' — confirm.
genes_columns = genes_features.columns.to_list()[1:]

# Feature set used by every model below. Alternatives tried previously:
# genes_columns + treatment_columns, label_columns, pam50col,
# aggregated_treatment_columns.
feature_columns = xgboost_top_100

## merged gene expression + averaged treatment + detailed treatment

In [None]:
merged = dataset_dict['merged']
merged.head()

# catboost

In [None]:
res = defaultdict(list)

# Hyper-parameters from an earlier tuning attempt. They used to be passed to a
# CatBoostClassifier that was replaced by a default one on the very next line,
# so they never influenced training. The default configuration is still what
# runs; set USE_TUNED_CATBOOST_PARAMS = True to actually apply the tuned values.
USE_TUNED_CATBOOST_PARAMS = False
tuned_catboost_params = dict(iterations=3600,
                             depth=7,
                             use_best_model=True,
                             learning_rate=0.005,
                             loss_function='Logloss',
                             model_size_reg=20,
                             verbose=True,
                             scale_pos_weight=0.605,
                             l2_leaf_reg=20,
                             od_type='Iter', od_wait=200)
default_catboost_params = dict(verbose=True)
model = CatBoostClassifier(**(tuned_catboost_params
                              if USE_TUNED_CATBOOST_PARAMS
                              else default_catboost_params))

train_data, train_labels, val_data, val_labels = random_split(merged,
                                                              feature_columns,
                                                              label_columns)
# Training and validation pools. The training pool was previously built and
# then ignored (fit received the raw arrays); it is now what fit consumes.
catboost_pool = Pool(train_data,
                     train_labels)
test_data = Pool(val_data,
                 val_labels)

# train the model, monitoring the validation pool
clf = model.fit(catboost_pool,
                eval_set=test_data,
                save_snapshot=False, snapshot_file='vasya')
y_pred = clf.predict(val_data)
x_pred = clf.predict(train_data)
compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)
res

# xgboost

In [None]:
import xgboost as xgb

# Accumulator filled by compute_metrics, one call per random split.
res = defaultdict(list)

# Three independent random splits, each with a fresh default XGBClassifier.
for run_idx in range(3):
    print(run_idx)
    model = xgb.XGBClassifier()
    train_data, train_labels, val_data, val_labels = random_split(
        merged, feature_columns, label_columns)

    # Both splits are monitored during fitting; early stopping after 50 rounds.
    monitored_sets = [(train_data, train_labels), (val_data, val_labels)]
    clf = model.fit(train_data, train_labels,
                    eval_set=monitored_sets,
                    early_stopping_rounds=50,
                    verbose=False)
    y_pred = clf.predict(val_data)
    x_pred = clf.predict(train_data)
    compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)

# Report the mean of every collected entry over the runs.
for key, collected in res.items():
    ave = numpy.asarray(collected).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

In [None]:
weights = model.get_booster().get_score(importance_type="gain")

In [None]:
# (gain, feature name) pairs, largest gain first; equal gains are ordered by
# name, also descending.
weights_sorted = sorted(((gain, name) for name, gain in weights.items()),
                        reverse=True)

In [None]:
[feature_columns[int(num[1:])] for (_, num) in weights_sorted[:100]]

In [None]:
# Print the 100 highest-gain features. The booster's feature name minus its
# first character is parsed as an integer index into feature_columns.
for gain, booster_name in weights_sorted[:100]:
    column_index = int(booster_name[1:])
    print(feature_columns[column_index], gain)

# SVM

In [None]:
from sklearn import datasets, svm, metrics

svm_total = defaultdict(list)

# Five random splits; an RBF-kernel SVC with class 1 weighted by 0.5 on each.
for run_idx in range(5):
    print('iteration {0}'.format(run_idx))
    model = svm.SVC(C=1, kernel='rbf', class_weight={1: 0.5})
    train_data, train_labels, val_data, val_labels = random_split(
        merged, feature_columns, label_columns)

    # NaN entries become 0 before the data reaches the SVM; the cleaned
    # training matrix is reused for fitting and for train-set predictions.
    train_features = numpy.nan_to_num(train_data)
    clf = model.fit(train_features, numpy.nan_to_num(train_labels))
    y_pred = clf.predict(numpy.nan_to_num(val_data))
    x_pred = clf.predict(train_features)
    compute_metrics(svm_total, val_labels.flatten(), y_pred, train_labels, x_pred)

# Mean of every collected entry over the runs.
for key, collected in svm_total.items():
    ave = numpy.asarray(collected).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# SVM - single study

In [None]:
from sklearn import datasets, svm, metrics

# Regularisation strength shared by every fit below (loop-invariant).
C = 1
svm_total = defaultdict(list)
for study in set(bmc.study):
    print('\n' * 2 + str(study))
    # 20 random splits per study; results of all studies accumulate in svm_total.
    for i in range(20):
        train_data, train_labels, val_data, val_labels = random_split(merged,
                                                                  feature_columns,
                                                                  label_columns, ratio=0.1, study_name=study)
        # Clean features and labels once and use the same cleaned labels both
        # for the class weight and for fitting. Previously the weight was
        # derived from the raw labels, so a single NaN label turned the weight
        # into NaN even though the model itself was trained on NaN->0 labels.
        train_features = numpy.nan_to_num(train_data)
        train_targets = numpy.nan_to_num(train_labels)
        positive_rate = numpy.mean(train_targets)
        # Weight class 1 by the negative/positive ratio of this split.
        model = svm.SVC(C=C, kernel='rbf',
                        class_weight={1: (1 - positive_rate) / positive_rate})
        # train the model
        clf = model.fit(train_features, train_targets)
        y_pred = clf.predict(numpy.nan_to_num(val_data))
        x_pred = clf.predict(train_features)
        compute_metrics(svm_total, val_labels.flatten(), y_pred, train_labels, x_pred)
# Mean of every collected entry over all studies and splits.
for key in svm_total:
    ave = numpy.asarray(svm_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# nearest neighbour classifier

In [None]:
def predict(train_data, validation_data, train_labels):
    """Label each validation row with the label of its nearest training row.

    "Nearest" is the smallest Euclidean distance; when several training rows
    are equally close, the one with the lowest index wins.

    Parameters
    ----------
    train_data : array-like, shape (n_train, n_features)
        Training samples. Converted with ``numpy.asarray``, so nested lists
        work as well as arrays.
    validation_data : array-like, shape (n_val, n_features)
        Samples to label.
    train_labels : numpy.ndarray (or any object indexable by an integer array)
        Labels aligned with the rows of ``train_data``.

    Returns
    -------
    ``train_labels`` indexed by the nearest-neighbour positions, i.e. one
    label per validation row.

    Raises
    ------
    ValueError
        If there are validation rows but ``train_data`` has no rows.
    """
    train_data = numpy.asarray(train_data)
    validation_data = numpy.asarray(validation_data)
    if len(validation_data) and not len(train_data):
        raise ValueError('train_data must contain at least one row')

    nearest = numpy.empty(len(validation_data), dtype=numpy.intp)
    for row, sample in enumerate(validation_data):
        diff = train_data - sample
        # Squared distances are enough: sqrt is monotonic, so it cannot change
        # which training row is the minimum.
        nearest[row] = numpy.argmin(numpy.sum(diff ** 2, axis=1))
    return train_labels[nearest]

In [None]:
nearest_total = defaultdict(list)

# Ten random splits evaluated with the 1-nearest-neighbour `predict` helper.
for run_idx in range(10):
    train_data, train_labels, val_data, val_labels = random_split(
        merged, feature_columns, label_columns)

    # NaN -> 0 before measuring distances; the cleaned training matrix serves
    # both as reference set and as query set for the train-side predictions.
    train_features = numpy.nan_to_num(train_data)
    val_features = numpy.nan_to_num(val_data)
    y_pred = predict(train_features, val_features, train_labels)
    x_pred = predict(train_features, train_features, train_labels)
    compute_metrics(nearest_total, val_labels.flatten(), y_pred, train_labels, x_pred)

# Mean of every collected entry over the runs.
for key, collected in nearest_total.items():
    ave = numpy.asarray(collected).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# NOTE(review): the accumulator keeps the name `nearest_total` from the
# nearest-neighbour section; it holds the naive Bayes results here.
nearest_total = defaultdict(list)
for i in range(10):
    gnb = GaussianNB()
    train_data, train_labels, val_data, val_labels = random_split(merged,
                                                                  feature_columns,
                                                                  label_columns)
    # Mean of the validation labels for this split.
    print(val_labels.sum() / len(val_labels))
    # Impute NaN features with -1 once and reuse the result. Previously the
    # model was fit on NaN->-1 data but the train-set predictions were made on
    # NaN->0 data, so the train-side metrics were computed on inputs the model
    # had never been fit on.
    train_features = numpy.nan_to_num(train_data, nan=-1)
    val_features = numpy.nan_to_num(val_data, nan=-1)
    model = gnb.fit(train_features, numpy.nan_to_num(train_labels))
    y_pred = model.predict(val_features)
    x_pred = model.predict(train_features)
    compute_metrics(nearest_total, val_labels.flatten(), y_pred, train_labels, x_pred)

# Mean of every collected entry over the runs.
for key in nearest_total:
    ave = numpy.asarray(nearest_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))



In [None]:
train_data.shape

# binarization

In [None]:
merged.head()

In [None]:
bin_data = binarize_dataset(merged, genes_columns, feature_columns, to_letters=False)

In [None]:
merged.study

In [None]:
bin_data.taxaneGeneral.max()

In [None]:
#bin_data.drop('patient_ID', axis=1, inplace=True)

## test binarization by xgboost

In [None]:
import xgboost as xgb

res = defaultdict(list)

# Same xgboost experiment as on the raw data, now on `bin_data`: five random
# splits, a fresh default XGBClassifier for each.
for run_idx in range(5):
    print(run_idx)
    model = xgb.XGBClassifier()
    train_data, train_labels, val_data, val_labels = random_split(
        bin_data, feature_columns, label_columns)

    # Both splits are monitored during fitting; early stopping after 50 rounds.
    monitored_sets = [(train_data, train_labels), (val_data, val_labels)]
    clf = model.fit(train_data, train_labels,
                    eval_set=monitored_sets,
                    early_stopping_rounds=50,
                    verbose=False)
    # Mean of the validation labels for this split.
    print(val_labels.sum() / len(val_labels))
    y_pred = clf.predict(val_data)
    x_pred = clf.predict(train_data)
    compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)

# Mean of every collected entry over the runs.
for key, collected in res.items():
    ave = numpy.asarray(collected).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

### test leave one study out

In [None]:
bin_data.study.unique()

In [None]:
from data_util.util import study_mapping

res = defaultdict(list)

# One fit per entry of study_mapping, using the first split that
# split_by_study yields for that study id.
for study_name, study_id in study_mapping.items():
    study_splits = split_by_study(bin_data,
                                  feature_columns,
                                  label_columns,
                                  study=study_id,
                                  to_numpy=True)
    train_data, train_labels, val_data, val_labels = next(study_splits)
    print(study_name)
    model = xgb.XGBClassifier()

    # Both splits are monitored during fitting; early stopping after 50 rounds.
    monitored_sets = [(train_data, train_labels), (val_data, val_labels)]
    clf = model.fit(train_data, train_labels,
                    eval_set=monitored_sets,
                    early_stopping_rounds=50,
                    verbose=False)

    print(val_data.shape)
    # Mean of the validation labels for this study.
    print(val_labels.sum() / len(val_labels))
    y_pred = clf.predict(val_data)
    x_pred = clf.predict(train_data)
    compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)

# Mean of every collected entry over the studies.
for key, collected in res.items():
    ave = numpy.asarray(collected).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
val_data

In [None]:
 # NOTE(review): re-runs compute_metrics on the variables left over from the
 # last iteration of the loop above, against the same `res` accumulator —
 # presumably adding a duplicate entry. Looks like a debugging leftover; confirm
 # whether it should stay.
 compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)

In [None]:
res['confusion'][-1]

### generate test and train sets

In [None]:
subset_moses_features = ['posOutcome'] + feature_columns

In [None]:
# Split the binarized data with balance_by_study enabled, using the feature
# list that also contains 'posOutcome'; to_numpy=False because the results are
# written out with .to_csv in the cells below.
train_data, train_labels, val_data, val_labels = random_split(
    bin_data,
    subset_moses_features,
    label_columns,
    balance_by_study=True,
    to_numpy=False,
)

In [None]:
print(len(subset_moses_features))

In [None]:
train_data.shape

### save split

In [None]:
train_data.to_csv('/tmp/cancer_bin_100_train_balance_by_study.csv', header=True, index=False)

In [None]:
val_data.to_csv('/tmp/cancer_bin_100_val_balance.csv', header=True, index=False)

### leave one study out

In [None]:
from data_util.util import study_mapping

# Write one train/validation CSV pair per entry of study_mapping, using the
# first split that split_by_study yields for that study id.
for study_name, study_id in study_mapping.items():
    study_splits = split_by_study(bin_data,
                                  subset_moses_features,
                                  label_columns,
                                  study=study_id,
                                  to_numpy=False)
    train_data, train_labels, val_data, val_labels = next(study_splits)
    print(study_name)
    print(train_data.shape)
    print(val_data.shape)

    train_csv = '/tmp/cancer_bin_100_train_leave_{0}.csv'.format(study_name)
    val_csv = '/tmp/cancer_bin_100_val_leave_{0}.csv'.format(study_name)
    train_data.to_csv(train_csv, header=True, index=False)
    val_data.to_csv(val_csv, header=True, index=False)

In [None]:
    # NOTE(review): repeats the split from the export loop for whatever
    # `study_id` that loop left behind after its final iteration; nothing is
    # written to disk here. The cell is indented as if it had been copied out of
    # the loop body — presumably a leftover; confirm whether it is still needed.
    train_data, train_labels, val_data, val_labels = next(split_by_study(bin_data,
                                                              subset_moses_features, 
                                                              label_columns,
                                                              study=study_id,
                                                              to_numpy=False))

In [None]:
merged[merged.study == 16]

In [None]:
bin_data.study.unique()

# moses

In [None]:
from opencog.pyasmoses import moses

In [None]:
# NOTE(review): every other random_split call in this notebook passes a frame,
# the feature columns and the label columns positionally and unpacks four
# values; this call passes none of them and unpacks five (`expected` is extra).
# Presumably written against an older random_split signature — confirm before
# running.
train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1, to_numpy=False)

In [None]:
train_data.shape

save data to file

In [None]:
train_data = train_data.fillna(-1)
train_data.to_csv('/tmp/input_data.csv', header=True, index=False)

In [None]:
set(train_data.surgery_type.to_list())

In [None]:
input_data = numpy.concatenate([train_labels[..., numpy.newaxis], train_data], axis=1)

In [None]:
print(input_data.shape)

In [None]:
mos = moses()

In [None]:
args = "--log-file log1.txt.log --hc-fraction-of-nn 0.01 -j5 --balance 1 -m 100000 --result-count 100 --reduct-knob-building-effort=2 --hc-widen-search=1 --enable-fs=1 --fs-algo=smd --fs-target-size=1000 --hc-crossover-min-neighbors=5000 --fs-focus=all --fs-seed=init  --hc-max-nn-evals=10000 --hc-crossover-pop-size=1000 -l debug --noise 0.2 -q 0.05"

In [None]:
output = mos.run(input=numpy.nan_to_num(input_data), python=True, args=args)

In [None]:
output[0].score

In [None]:
# Small self-contained moses run on a hand-written five-row table.
mos = moses()
input_data = [
    [0, 0, 0],
    [0.2, 0.2, 0.4],
    [1, 1, 2],
    [1, 0, 1],
    [2., 1, 3],
]
toy_args = '-m 1000000 --max-time=60 --balance=1'
output = mos.run(input=input_data, python=True, args=toy_args)

best_candidate = output[0]
print(best_candidate.score)  # Prints: 0
model = best_candidate.eval
# Per the original notes: the first call returns True, the second False.
for sample in ([0, 1], [1, 1]):
    print(model(sample))

In [None]:
moses_args = []
# target column
moses_args.append('--problem_type=it')
