In [None]:
import sys
sys.path.append('/usr/local/lib/python3/dist-packages/')

In [None]:
import sys
import os
import lzma
import random
from collections import defaultdict
import math

In [None]:
from typing import *

In [None]:
import numpy
import pandas

In [None]:
import xgboost as xgb

In [None]:
import sklearn

In [None]:
from catboost import Pool, CatBoostClassifier
import catboost

In [None]:
from util import *

In [None]:
cancer_data_dir = '/home/noskill/projects/cancer/data'
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
clinical_table_path = os.path.join(cancer_data_dir, 'bcClinicalTable.csv')
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [None]:
dtype = {'DFS': pandas.Int64Dtype(),
         'pCR': pandas.Int64Dtype(),
         'RFS': pandas.Int64Dtype(), 
         'DFS': pandas.Int64Dtype(), 
         'posOutcome': pandas.Int64Dtype()}

# load averaged treatment table

In [None]:
bmc = pandas.read_csv(bmc_all_path, dtype=dtype, converters=converters)
bmc = bmc.sort_values(by='patient_ID')

# load detailed treatment

In [None]:
treatment = pandas.read_csv(clinical_table_path, converters=converters).sort_values(by='patient_ID')
treatment = treatment[treatment.patient_ID.isin(bmc.patient_ID)]

In [None]:
bmc.head()

# load genes expression data

In [None]:
gene_expression = pandas.read_csv(lzma.open(merged_path))

In [None]:
gene_expression.head(5)

In [None]:
genes_features = gene_expression[gene_expression.patient_ID.isin(bmc.patient_ID)]

In [None]:
genes_features = genes_features.sort_values(by='patient_ID')

# columns to use for training

In [22]:
pam50col = genes_features.columns[genes_features.columns.isin(pam50).nonzero()[0]].to_list()

In [23]:
aggregated_treatment_columns = ['radio', 'surgery', 'chemo', 'hormone']
label_columns = ['pCR', 'RFS', 'DFS', 'posOutcome']
label_columns = ['posOutcome']
genes_columns = genes_features.columns.to_list()[1:]
feature_columns = xgboost_top_100 #genes_columns + treatment_columns # label_columns +  # pam50col #  +   + aggregated_treatment_columns

## merge genes expression + averaged treatment + detailed treatment

In [24]:
merged = pandas.merge(genes_features, bmc, left_on='patient_ID', right_on='patient_ID')
merged = pandas.merge(merged, treatment, left_on='patient_ID', right_on='patient_ID')
merged.insert(0, 'row_num', range(0,len(merged)))

# catboost

In [21]:
res = defaultdict(list)
model = CatBoostClassifier(iterations=3600,
                           depth=7,
                           use_best_model=True,
                           learning_rate=0.005,
                           loss_function='Logloss',
                           model_size_reg=20,
                           verbose=True,
                           scale_pos_weight=0.605,
                           l2_leaf_reg=20,
                           od_type='Iter', od_wait=200)
model = CatBoostClassifier(verbose=True)
train_data, train_labels, val_data, val_labels, expected = random_split(balance_by_study=True)
catboost_pool = Pool(train_data, 
                    train_labels)

test_data = Pool(val_data,
                 val_labels) 
# train the model
clf = model.fit(train_data, train_labels, 
          eval_set=test_data,
          save_snapshot=False, snapshot_file='vasya')
y_pred = clf.predict(val_data)
x_pred = clf.predict(train_data)
compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)
res

TypeError: random_split() missing 4 required positional arguments: 'merged', 'bmc', 'feature_columns', and 'label_columns'

# xgboost

In [None]:
import xgboost as xgb
res = defaultdict(list)

for i in range(30):
    print(i)
    model = xgb.XGBClassifier()

    train_data, train_labels, val_data, val_labels, expected = random_split(merged, bmc,
                                                                           feature_columns, label_columns)

    # train the model
    clf = model.fit(train_data, train_labels,
                    eval_set=[(train_data, train_labels), (val_data, val_labels)], 
                    early_stopping_rounds=50, verbose=False)
    y_pred = clf.predict(val_data)
    x_pred = clf.predict(train_data)
    compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)
for key in res:
    ave = numpy.asarray(res[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

In [None]:
weights = model.get_booster().get_score(importance_type="gain")

In [None]:
weights_sorted = sorted([(y, x) for (x,y) in weights.items()], reverse=True)

In [None]:
[feature_columns[int(num[1:])] for (_, num) in weights_sorted[:100]]

In [None]:
for (w, num) in weights_sorted[:100]:
    print(feature_columns[int(num[1:])], w)

# SVM

In [None]:
from sklearn import datasets, svm, metrics
svm_total = defaultdict(list)
for i in range(5):
    print('iteration {0}'.format(i))
    model = svm.SVC(C=1, kernel='rbf', class_weight={1: 0.5})
    train_data, train_labels, val_data, val_labels, expected = random_split()
    # train the model
    clf = model.fit(numpy.nan_to_num(train_data), numpy.nan_to_num(train_labels))
    y_pred = clf.predict(numpy.nan_to_num(val_data))
    x_pred = clf.predict(numpy.nan_to_num(train_data))
    compute_metrics(svm_total, val_labels.flatten(), y_pred, train_labels, x_pred)
for key in svm_total:
    ave = numpy.asarray(svm_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# SVM - single study

In [None]:
from sklearn import datasets, svm, metrics

svm_total = defaultdict(list)
for study in set(bmc.study):
#     if study != 'study_25065_GPL96_MDACC-bmc15':
#         continue
    print('\n' * 2 + study)
    for i in range(20):
        train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1, study_name=study)
        C = 1
        if study == 'study_16446_GPL570_all-bmc15':
            C = 1.5
        if study == 'study_22358_GPL5325_all-bmc15':
            C = 0.1
        if study == 'study_22226_GPL1708_all-bmc15':
            C = 2
        if study == 'study_20181_GPL96_all-bmc15':
            C = 0.72
        if study == 'study_25065_GPL96_MDACC-bmc15':
            C = 1.72
        model = svm.SVC(C=C, kernel='rbf', class_weight={1: (1 - numpy.mean(train_labels))  / numpy.mean(train_labels)})
        # train the model
        clf = model.fit(numpy.nan_to_num(train_data), numpy.nan_to_num(train_labels))
        y_pred = clf.predict(numpy.nan_to_num(val_data))
        # print(y_pred)
        x_pred = clf.predict(numpy.nan_to_num(train_data))
        compute_metrics(svm_total, val_labels.flatten(), y_pred, train_labels, x_pred)
for key in svm_total:
    ave = numpy.asarray(svm_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# nearest neigbour classifier

In [None]:
def predict(train_data, validation_data, train_labels):
    tmp = []
    for i in range(len(validation_data)):
        diff = train_data - validation_data[i]
        idx = numpy.argmin(numpy.sqrt(numpy.sum(diff ** 2, axis=1)))
        tmp.append(idx)
    return train_labels[tmp]

In [None]:
nearest_total = defaultdict(list)
for i in range(10):
    train_data, train_labels, val_data, val_labels, expected = random_split()
    y_pred = predict(numpy.nan_to_num(train_data), numpy.nan_to_num(val_data), train_labels)
    # x_pred = predict(numpy.nan_to_num(train_data), numpy.nan_to_num(train_data), train_labels)
    compute_metrics(nearest_total, val_labels.flatten(), y_pred, train_labels, x_pred)

for key in nearest_total:
    ave = numpy.asarray(nearest_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1)

In [None]:
nearest_total = defaultdict(list)
for i in range(10):
    gnb = GaussianNB()
    train_data, train_labels, val_data, val_labels, expected = random_split()
    model = gnb.fit(numpy.nan_to_num(train_data, nan=-1), numpy.nan_to_num(train_labels))
    y_pred = model.predict(numpy.nan_to_num(val_data, nan=-1))
    x_pred = model.predict(numpy.nan_to_num(train_data))
    compute_metrics(nearest_total, val_labels.flatten(), y_pred, train_labels, x_pred)

for key in nearest_total:
    ave = numpy.asarray(nearest_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))



In [None]:
train_data.shape

# binarization

In [25]:
merged.head()

Unnamed: 0,row_num,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,...,chemotherapy,hormone_therapy,no_treatment,methotrexate,cetuximab,carboplatin,other,taxaneGeneral,neoadjuvant_or_adjuvant,study_specific_protocol_number
0,0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
1,1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
2,2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
3,3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
4,4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0


In [31]:
bin_data = binarize_dataset(merged, genes_columns, feature_columns, to_letters=False)

In [27]:
bin_data.drop('patient_ID', axis=1, inplace=True)

## test binarization by xgboost

In [32]:
for col in bin_data.dtypes[bin_data.dtypes == object].keys():
    col_dict = {x: i for (i, x) in enumerate(sorted(set(bin_data[col])))}
    bin_data[col] = bin_data[col].apply(lambda x: col_dict[x])

In [34]:
import xgboost as xgb
res = defaultdict(list)

for i in range(10):
    print(i)
    model = xgb.XGBClassifier()

    train_data, train_labels, val_data, val_labels, expected = random_split(bin_data, bmc,
                                                                           feature_columns, label_columns)

    # train the model
    clf = model.fit(train_data, train_labels,
                    eval_set=[(train_data, train_labels), (val_data, val_labels)], 
                    early_stopping_rounds=50, verbose=False)
    y_pred = clf.predict(val_data)
    x_pred = clf.predict(train_data)
    compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)
for key in res:
    ave = numpy.asarray(res[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

0
1
2
3
4
5
6
7
8
9
recall: 0.8075000000000001
precision: 0.5531623318157181
f1: 0.6562547735359168
confusion: [[41.8 78.2]
 [23.1 96.9]]
train_f1: 0.9537830728816186
train_confusion: [[ 620.7   98.3]
 [  26.1 1239.9]]
accuracy: 0.5779166666666666


In [None]:
subset_moses_features = ['posOutcome'] + feature_columns

In [None]:
bin_data[subset_moses_features].to_csv('/tmp/cancer_bin_100.csv', header=True, index=False)

In [None]:
subset = bin_data[subset_moses_features]

In [None]:
subset.columns[17]

In [None]:
bin_data[subset_moses_features[:30]].head(20).to_csv('/tmp/cancer_bin_100_small.csv', header=True, index=False)

In [None]:
res_ar[0][-4:]

In [None]:
[12 < 2 ** x for x in range(1,5)]

In [None]:
for oth in other_columns:
    print(oth)
    print(set(digitize_non_genes_data(merged[oth])))

In [None]:
other_columns[6]

In [None]:
set(numpy.nan_to_num(merged.hormone_therapy, nan=-1))

In [None]:
other_columns[0]

In [None]:
numpy.histogram_bin_edges(r, bins=14)

In [None]:
other_columns[18]

In [None]:
edges = numpy.histogram_bin_edges(r, bins=14)

In [None]:
edges

# moses

In [None]:
from opencog.pyasmoses import moses

In [None]:
train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1, to_numpy=False)

In [None]:
train_data.shape

save data to file

In [None]:
train_data = train_data.fillna(-1)
train_data.to_csv('/tmp/input_data.csv', header=True, index=False)

In [None]:
set(train_data.surgery_type.to_list())

In [None]:
input_data = numpy.concatenate([train_labels[..., numpy.newaxis], train_data], axis=1)

In [None]:
print(input_data.shape)

In [None]:
mos = moses()

In [None]:
args = "--log-file log1.txt.log --hc-fraction-of-nn 0.01 -j5 --balance 1 -m 100000 --result-count 100 --reduct-knob-building-effort=2 --hc-widen-search=1 --enable-fs=1 --fs-algo=smd --fs-target-size=1000 --hc-crossover-min-neighbors=5000 --fs-focus=all --fs-seed=init  --hc-max-nn-evals=10000 --hc-crossover-pop-size=1000 -l debug --noise 0.2 -q 0.05"

In [None]:
output = mos.run(input=numpy.nan_to_num(input_data), python=True, args=args)

In [None]:
output[0].score

In [None]:
mos = moses()
input_data = [[0, 0, 0],
              [0.2, 0.2, 0.4],
              [1, 1, 2],
              [1, 0, 1],
              [2., 1, 3]]
output = mos.run(input=input_data, python=True, args='-m 1000000 --max-time=60 --balance=1')
print (output[0].score) # Prints: 0
model = output[0].eval
print(model([0, 1]))  # Returns: True
print(model([1, 1]))  # Returns: False

In [None]:
moses_args = []
# target column
moses_args.append('--problem_type=it')
