In [3]:
import sys
import os
import lzma
import random
from collections import defaultdict
import math

In [4]:
import numpy
import pandas

In [5]:
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [6]:
import sklearn

In [7]:
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

In [8]:
from catboost import Pool, CatBoostClassifier
import catboost

In [9]:
# Clinical-table columns that are scanned for leftover string values after
# the clinical table has been loaded. Names are kept exactly as written,
# including 'intarvenous' (presumably matches the CSV header — TODO confirm).
treatment_columns = [
    'tumor_size_cm_preTrt_preSurgery',
    'tumor_size_cm_secondAxis_preTrt_preSurgery',
    'preTrt_lymph_node_status',
    'preTrt_totalLymphNodes',
    'preTrt_numPosLymphNodes',
    'hist_grade',
    'nuclear_grade_preTrt',
    'age',
    'race',
    'menopausal_status',
    'surgery_type',
    'intarvenous',
    'intramuscular',
    'oral',
    'radiotherapyClass',
    'chemotherapyClass',
    'hormone_therapyClass',
    'postmenopausal_only',
    'immediate_biol_target',
    'anthracycline',
    'taxane',
    'anti_estrogen',
    'aromatase_inhibitor',
    'estrogen_receptor_blocker',
    'estrogen_receptor_blocker_and_stops_production',
    'estrogen_receptor_blocker_and_eliminator',
    'anti_HER2',
    'tamoxifen',
    'doxorubicin',
    'epirubicin',
    'docetaxel',
    'capecitabine',
    'fluorouracil',
    'paclitaxel',
    'cyclophosphamide',
    'anastrozole',
    'fulvestrant',
    'gefitinib',
    'trastuzumab',
    'letrozole',
    'chemotherapy',
    'hormone_therapy',
    'no_treatment',
    'methotrexate',
    'cetuximab',
    'carboplatin',
    'other',
    'taxaneGeneral',
]

In [10]:
# Root directory of the datasets. Set the CANCER_DATA_DIR environment variable
# to run the notebook on another machine; the original location is the default.
cancer_data_dir = os.environ.get('CANCER_DATA_DIR', '/home/noskill/projects/cancer/data')
# Directory holding the example15bmc dump files.
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
# Clinical table, read into `treatment`.
clinical_table_path = os.path.join(cancer_data_dir, 'bcClinicalTable.csv')
# xz-compressed gene-expression table, read into `gene_expression`.
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
# Per-patient study/treatment/outcome table, read into `bmc`.
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [11]:
# Outcome columns are read as pandas' nullable integer type so that missing
# outcomes stay missing instead of forcing the column to float.
# (The original literal listed 'DFS' twice; the duplicate key is removed.)
dtype = {'DFS': pandas.Int64Dtype(),
         'pCR': pandas.Int64Dtype(),
         'RFS': pandas.Int64Dtype(),
         'posOutcome': pandas.Int64Dtype()}

In [12]:
def convert_surgery(x, surgery_mapping=dict()):
    """Translate a raw surgery value into an integer code.

    Codes are handed out in order of first appearance, starting at 1.

    The mutable default for ``surgery_mapping`` is deliberate: when no mapping
    is passed, every call shares the same dictionary, so a value keeps its code
    across calls. Pass an explicit dict to use an independent code table.

    :param x: hashable raw value to encode
    :param surgery_mapping: value -> code table, updated in place
    :return: the integer code assigned to ``x``
    """
    next_code = len(surgery_mapping) + 1
    # setdefault only stores next_code when x has not been seen before.
    return surgery_mapping.setdefault(x, next_code)


def convert_node_status(x, mapping=dict()):
    """Translate a categorical value into an integer code, keeping missing values.

    * The strings ``'NA'`` and ``'NaN'`` become ``numpy.nan``.
    * A non-string value for which ``numpy.isnan`` is true is returned unchanged.
    * Anything else receives a code in order of first appearance, starting at 1.

    The mutable default for ``mapping`` is deliberate: calls that omit it share
    one code table. Missing values never enter the table.

    :param x: raw value (string or number)
    :param mapping: value -> code table, updated in place
    :return: ``numpy.nan`` / the original NaN for missing input, else an int code
    """
    if isinstance(x, str):
        if x in ('NA', 'NaN'):
            return numpy.nan
    elif numpy.isnan(x):
        return x
    return mapping.setdefault(x, len(mapping) + 1)


def convert_race(x, mapping=dict()):
    """Encode a race value with the same rules as ``convert_node_status``.

    This function owns its own default ``mapping`` dict, so its codes are
    independent from those handed out by ``convert_node_status``.
    """
    return convert_node_status(x, mapping=mapping)

def convert_menapause(x, mapping=dict()):
    """Encode a menopausal-status value with the rules of ``convert_node_status``.

    This function owns its own default ``mapping`` dict, so its codes are
    independent from those of the other converters.
    """
    return convert_node_status(x, mapping=mapping)

# Column name -> converter, passed to pandas.read_csv(converters=...).
converters = {
    'preTrt_lymph_node_status': convert_node_status,
    'race': convert_race,
    'menopausal_status': convert_menapause,
    # Both surgery columns go through convert_surgery and therefore share
    # its default code table.
    'surgery_type': convert_surgery,
    'surgery': convert_surgery,
}

In [13]:
# Load the per-patient study/treatment/outcome table, ordered by patient.
bmc = pandas.read_csv(
    bmc_all_path,
    dtype=dtype,
    converters=converters,
).sort_values(by='patient_ID')

In [14]:
# Load the clinical table, order it by patient, and keep only the patients
# that are also present in `bmc`.
treatment = pandas.read_csv(clinical_table_path, converters=converters)
treatment = treatment.sort_values(by='patient_ID')
treatment = treatment.loc[treatment.patient_ID.isin(bmc.patient_ID)]

In [15]:
# Print the name of every treatment column that still contains at least one
# string value, i.e. a column that is not yet fully numeric.
# The column subset is sliced once instead of once per iteration; a missing
# column still raises KeyError up front, as before.
treatment_subset = treatment[treatment_columns]
for col in treatment_columns:
    if any(isinstance(x, str) for x in treatment_subset[col]):
        print(col)

In [16]:
# Preview the first rows of the study/treatment/outcome table.
bmc.head()

Unnamed: 0,study,patient_ID,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
0,study_1379_GPL1223_all-bmc15,22449,0,1,0,1,,,0,0
1,study_1379_GPL1223_all-bmc15,22450,0,1,0,1,,,0,0
2,study_1379_GPL1223_all-bmc15,22451,0,1,0,1,,,0,0
3,study_1379_GPL1223_all-bmc15,22452,0,1,0,1,,,0,0
4,study_1379_GPL1223_all-bmc15,22453,0,1,0,1,,,1,1


In [17]:
# Read the xz-compressed gene-expression table. The context manager closes
# the decompressor's file handle once parsing is done (it was leaked before).
with lzma.open(merged_path) as merged_file:
    gene_expression = pandas.read_csv(merged_file)

In [18]:
# Preview the first five rows of the gene-expression table.
gene_expression.head(5)

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


In [19]:
# Keep only the expression rows of patients that also appear in `bmc`.
genes_features = gene_expression.loc[gene_expression.patient_ID.isin(bmc.patient_ID)]

In [20]:
# Order the expression rows by patient, matching the ordering applied to `bmc`.
genes_features = genes_features.sort_values('patient_ID')

In [127]:
# Aggregated per-patient treatment flags (columns of the `bmc` table).
aggregated_treatment_columns = ['radio', 'surgery', 'chemo', 'hormone']
# Model inputs: every column of `genes_features` except the first one
# (presumably patient_ID — TODO confirm column order), plus the treatment flags.
# For a genes-only experiment, drop `aggregated_treatment_columns` here.
feature_columns = genes_features.columns.to_list()[1:] + aggregated_treatment_columns
# Only 'posOutcome' is used as the target. The original cell first assigned
# ['pCR', 'RFS', 'DFS', 'posOutcome'] and immediately overwrote it; that dead
# assignment is removed.
label_columns = ['posOutcome']

In [113]:
# Join gene expression, study/outcome data and the clinical table on patient_ID
# (default inner joins, so only patients present in all three tables remain).
merged = (
    genes_features
    .merge(bmc, on='patient_ID')
    .merge(treatment, on='patient_ID')
)

In [114]:
def split_by_study(study_name=None):
    """
    Leave-one-study-out split generator.

    For each held-out study, the patients of that study form the validation
    set and all remaining patients form the training set. Reads the notebook
    globals ``bmc``, ``merged``, ``feature_columns`` and ``label_columns``.

    :param study_name: if given, only this study is held out and exactly one
        fold is produced; otherwise one fold per study in ``bmc`` is produced,
        in order of first appearance (reproducible, unlike iterating a set).
    :return: generator of ``(train_data, train_labels, val_data, val_labels)``
        numpy arrays; labels are cast to int.
    :raises AssertionError: if train/validation patients overlap, or if the
        validation rows of ``merged`` do not line up with those of ``bmc``.
    """
    eval_studies = [study_name] if study_name else bmc.study.unique()
    for eval_study in eval_studies:
        print(eval_study)
        bmc_train = bmc[bmc.study != eval_study]
        bmc_val = bmc[bmc.study == eval_study]
        # A patient must never be on both sides of the split.
        assert (not set(bmc_train.patient_ID).intersection(set(bmc_val.patient_ID)))

        train_split = merged[merged.patient_ID.isin(bmc_train.patient_ID)]
        val_split = merged[merged.patient_ID.isin(bmc_val.patient_ID)]
        # The merge must not have dropped or reordered validation patients.
        assert val_split.patient_ID.to_list() == bmc_val.patient_ID.to_list()
        train_data = train_split[feature_columns].to_numpy()
        train_labels = train_split[label_columns].to_numpy().astype(int)
        val_data = val_split[feature_columns].to_numpy()
        val_labels = val_split[label_columns].to_numpy().astype(int)
        yield train_data, train_labels, val_data, val_labels

In [115]:
def select_balanced_idx(study, num):
    """Randomly carve a class-balanced validation subset out of one study.

    Equal numbers of ``posOutcome == 1`` and ``posOutcome == 0`` patients are
    drawn at random (using the global ``random`` module state).

    :param study: DataFrame with ``patient_ID`` and ``posOutcome`` columns
    :param num: requested validation size; odd values are rounded up to the
        next even number so both classes get the same share
    :return: ``(train, validation)`` DataFrames; ``validation`` holds the
        selected patients and ``train`` all remaining rows of ``study``

    If one class has fewer than ``num / 2`` patients, the validation set is
    capped at the largest balanced size available instead of raising
    ``IndexError`` as the original loop did.
    """
    if num % 2 != 0:
        num = num + 1
    pos_outcome = study[study.posOutcome == 1].patient_ID
    neg_outcome = study[study.posOutcome == 0].patient_ID
    pos_idx = numpy.arange(len(pos_outcome))
    neg_idx = numpy.arange(len(neg_outcome))
    random.shuffle(pos_idx)
    random.shuffle(neg_idx)
    # Never ask for more pairs than the smaller class can supply.
    num_pairs = min(num // 2, len(pos_outcome), len(neg_outcome))
    validation = []
    for i in range(num_pairs):
        validation.append(pos_outcome.iloc[pos_idx[i]])
        validation.append(neg_outcome.iloc[neg_idx[i]])
    selected = study.patient_ID.isin(validation)
    train = study[~selected]
    validation = study[selected]
    return train, validation

In [116]:
def random_split(ratio=0.1):
    """Split patients into train/validation sets, balanced within each study.

    For every study in ``bmc`` roughly ``ratio`` of its patients (rounded up)
    are moved to the validation set via ``select_balanced_idx``. Studies are
    visited in order of first appearance, so a seeded ``random`` module gives
    a reproducible split (iterating a ``set`` of study names did not).

    Also accumulates ``expected``: the confusion counts a baseline would reach
    if, per study, it labelled each validation patient positive with the
    positive rate observed in that study's training part.

    Reads the notebook globals ``bmc``, ``merged``, ``feature_columns`` and
    ``label_columns``.

    :param ratio: fraction of each study requested for validation
    :return: ``(train_data, train_labels, val_data, val_labels, expected)``;
        the first four are numpy arrays (labels cast to int), ``expected`` is
        a dict with keys 'TN', 'FN', 'FP', 'TP'
    """
    val_patients = []
    train_patients = []
    expected = {'TN': 0, 'FN': 0, 'FP': 0, 'TP': 0}
    for eval_study in bmc.study.unique():
        study = bmc[bmc.study == eval_study]
        num_select = math.ceil(len(study) * ratio)
        bmc_train, bmc_val = select_balanced_idx(study, num_select)
        # Class prior estimated on the training part of this study.
        pos_prob_train = bmc_train.posOutcome.sum() / len(bmc_train)
        neg_prob_train = 1 - pos_prob_train
        P = bmc_val.posOutcome.sum()
        N = len(bmc_val) - P
        # Expected hits of the prior-based baseline on this study's validation set.
        TN = N * neg_prob_train
        TP = P * pos_prob_train
        FP = N - TN
        FN = P - TP
        expected['TN'] += TN
        expected['TP'] += TP
        expected['FP'] += FP
        expected['FN'] += FN
        val_patients += bmc_val.patient_ID.to_list()
        train_patients += bmc_train.patient_ID.to_list()

    train_split = merged[merged.patient_ID.isin(train_patients)]
    val_split = merged[merged.patient_ID.isin(val_patients)]
    train_data = train_split[feature_columns].to_numpy()
    train_labels = train_split[label_columns].to_numpy().astype(int)
    val_data = val_split[feature_columns].to_numpy()
    val_labels = val_split[label_columns].to_numpy().astype(int)
    return train_data, train_labels, val_data, val_labels, expected

In [117]:
# Draw one random, per-study balanced split with the default ratio (0.1).
train_data, train_labels, val_data, val_labels, expected = random_split()

In [108]:
# Sanity check: (number of validation patients, number of features).
val_data.shape

(240, 8832)

In [109]:
# Baseline confusion counts accumulated by random_split.
expected

{'FN': 43.39815972277333,
 'FP': 76.60184027722667,
 'TN': 43.39815972277332,
 'TP': 76.60184027722667}

In [110]:
def compute_metrics(result, y_true, y_pred, x_true, x_pred):
    """Append one round of classification metrics to ``result``.

    :param result: dict of lists (e.g. ``defaultdict(list)``); one value is
        appended under each of 'recall', 'precision', 'f1', 'confusion',
        'train_f1', 'train_confusion' and 'accuracy'
    :param y_true: true labels of the validation set
    :param y_pred: predicted labels of the validation set
    :param x_true: true labels of the training set
    :param x_pred: predicted labels of the training set
    :return: None; ``result`` is updated in place

    Accuracy is derived from the validation confusion matrix and reads only
    its first two rows, i.e. it assumes a binary (2x2) matrix.
    """
    scorers = (
        ('recall', recall_score, y_true, y_pred),
        ('precision', precision_score, y_true, y_pred),
        ('f1', f1_score, y_true, y_pred),
        ('confusion', confusion_matrix, y_true, y_pred),
        ('train_f1', f1_score, x_true, x_pred),
        ('train_confusion', confusion_matrix, x_true, x_pred),
    )
    for key, scorer, truth, predicted in scorers:
        result[key].append(scorer(truth, predicted))
    val_confusion = result['confusion'][-1]
    correct = val_confusion[0][0] + val_confusion[1][1]
    total = sum(val_confusion[0]) + sum(val_confusion[1])
    result['accuracy'].append(correct / total)

In [56]:
# Print the mean of every metric collected for XGBoost.
# NOTE(review): `total_xgboost` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel.
for key, values in total_xgboost.items():
    print('{0}: {1}'.format(key, numpy.mean(values)))

NameError: name 'total_xgboost' is not defined

In [57]:
# Summarise the CatBoost runs: confusion matrices are averaged element-wise
# across runs, every other metric is reduced to its mean.
# NOTE(review): `total_catboost` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel.
for key, values in total_catboost.items():
    if 'confusion' in key:
        print(numpy.stack(values).mean(axis=0))
    else:
        print('{0}: {1}'.format(key, numpy.mean(values)))

NameError: name 'total_catboost' is not defined

In [None]:
# Summarise the SVM runs: confusion matrices are averaged element-wise across
# runs, every other metric is reduced to its mean.
# NOTE(review): `svm_total` is only created in a later cell of this notebook,
# so this cell depends on out-of-order execution.
for key, values in svm_total.items():
    is_matrix = 'confusion' in key
    if is_matrix:
        stacked = numpy.stack(values)
        print(stacked.mean(axis=0))
    else:
        print('{0}: {1}'.format(key, numpy.mean(values)))

In [124]:
# Train a CatBoost classifier on one random split and collect its metrics.
res = defaultdict(list)
# use_best_model needs an eval_set (supplied to fit below); the
# od_type/od_wait pair configures CatBoost's overfitting detector.
# scale_pos_weight=0.605 presumably compensates for the class ratio — TODO confirm.
model = CatBoostClassifier(iterations=3600,
                           depth=4,
                           use_best_model=True,
                           learning_rate=0.015,
                           loss_function='Logloss',
                           model_size_reg=2,
                           verbose=True,
                           scale_pos_weight=0.605,
                           l2_leaf_reg=2,
                           od_type='Iter', od_wait=200)
train_data, train_labels, val_data, val_labels, expected = random_split()
catboost_pool = Pool(train_data,
                     train_labels)

test_data = Pool(val_data,
                 val_labels)
# train the model
# The training Pool built above is now actually used (it was constructed but
# ignored before); it already carries the labels, so no separate y is passed.
clf = model.fit(catboost_pool,
                eval_set=test_data,
                save_snapshot=False, snapshot_file='vasya')
y_pred = clf.predict(val_data)
x_pred = clf.predict(train_data)
# Both label arrays are flattened to 1-D so train and validation metrics are
# computed on identically shaped inputs.
compute_metrics(res, val_labels.flatten(), y_pred, train_labels.flatten(), x_pred)
res

0:	learn: 0.6896796	test: 0.6935031	best: 0.6935031 (0)	total: 122ms	remaining: 7m 19s
1:	learn: 0.6859911	test: 0.6934576	best: 0.6934576 (1)	total: 223ms	remaining: 6m 41s
2:	learn: 0.6823906	test: 0.6941451	best: 0.6934576 (1)	total: 329ms	remaining: 6m 35s
3:	learn: 0.6791855	test: 0.6944261	best: 0.6934576 (1)	total: 430ms	remaining: 6m 26s
4:	learn: 0.6757865	test: 0.6945242	best: 0.6934576 (1)	total: 532ms	remaining: 6m 22s
5:	learn: 0.6723471	test: 0.6952007	best: 0.6934576 (1)	total: 637ms	remaining: 6m 21s
6:	learn: 0.6695169	test: 0.6958203	best: 0.6934576 (1)	total: 738ms	remaining: 6m 18s
7:	learn: 0.6665774	test: 0.6966276	best: 0.6934576 (1)	total: 847ms	remaining: 6m 20s
8:	learn: 0.6640551	test: 0.6968606	best: 0.6934576 (1)	total: 955ms	remaining: 6m 21s
9:	learn: 0.6609500	test: 0.6977762	best: 0.6934576 (1)	total: 1.06s	remaining: 6m 22s
10:	learn: 0.6581300	test: 0.6981483	best: 0.6934576 (1)	total: 1.17s	remaining: 6m 23s
11:	learn: 0.6554795	test: 0.6982565	best:

KeyboardInterrupt: 

In [71]:
# Predict validation labels with whichever classifier `clf` currently refers to.
y_pred = clf.predict(val_data)

In [67]:
# Baseline confusion counts of the most recent random_split call.
expected

{'FN': 43.42609937293494,
 'FP': 37.76064461340954,
 'TN': 44.23935538659046,
 'TP': 105.57390062706506}

In [1]:
# Scratch arithmetic: presumably accuracy computed by hand from a confusion
# matrix (diagonal / total) — TODO confirm which run the numbers come from.
(51 + 123) / (51 + 30 + 27 + 123)

0.7532467532467533

In [128]:
# Train an SVM on one random split and print its metrics.
from sklearn import datasets, svm, metrics
svm_total = defaultdict(list)
model = svm.SVC(C=1, class_weight={1: 0.5})
train_data, train_labels, val_data, val_labels, expected = random_split()
# Replace NaN/inf in the features once and reuse the cleaned arrays
# (the training matrix was converted twice before).
train_features = numpy.nan_to_num(train_data)
val_features = numpy.nan_to_num(val_data)
# random_split returns labels as an (n, 1) int column; scikit-learn expects a
# 1-D target, so flatten here. The labels are already ints, so the former
# nan_to_num on them was a no-op and is dropped.
train_targets = train_labels.ravel()
val_targets = val_labels.ravel()
# train the model
clf = model.fit(train_features, train_targets)
y_pred = clf.predict(val_features)
x_pred = clf.predict(train_features)
compute_metrics(svm_total, val_targets, y_pred, train_targets, x_pred)
for key in svm_total:
    print('svm  {0}: {1}'.format(key, svm_total[key][-1]))

  return f(**kwargs)


svm  recall: 0.5833333333333334
svm  precision: 0.6140350877192983
svm  f1: 0.5982905982905983
svm  confusion: [[76 44]
 [50 70]]
svm  train_f1: 0.8686257562662059
svm  train_confusion: [[ 676   43]
 [ 261 1005]]
svm  accuracy: 0.6083333333333333


In [None]:
from opencog.atomspace import AtomSpace
from opencog.pymoses import moses
from opencog.scheme_wrapper import scheme_eval

In [None]:
# Hold out a single study. The generator defined in this notebook is
# `split_by_study`; the original call to `split` referenced an undefined name.
train_data, train_labels, val_data, val_labels = next(split_by_study('study_16446_GPL570_all-bmc15'))

In [None]:
# Build one table with the label column(s) first, followed by the features.
input_data = numpy.concatenate([train_labels, train_data], axis=1)

In [None]:
# Swap columns 0 and 2 in place. The right-hand side is a fancy-indexed copy,
# so the two columns are exchanged correctly.
# NOTE(review): not idempotent — running this cell a second time swaps them back.
input_data[:,[0, 2]] = input_data[:,[2,0]]

In [None]:
# Scratch arithmetic with literal values; evaluates to 0 / 107 = 0.0.
min(0, 179) / max(107, 0)

In [None]:
# Create the MOSES wrapper imported from opencog.pymoses.
mos = moses()

In [None]:
# Run MOSES on the assembled table.
# NOTE(review): the meaning of '--balance=1 -m 100000' is not visible here;
# presumably class balancing and an evaluation budget — confirm against the MOSES docs.
output = mos.run(input=input_data, python=True, args='--balance=1 -m 100000')

In [None]:
# Show the program of the first returned candidate.
output[0].program

In [None]:
# Minimal MOSES example on a four-row toy table.
# NOTE(review): this cell rebinds `mos`, `input_data`, `output` and `model`,
# replacing the objects created by earlier cells.
mos = moses()
input_data = [
    [0, 0, 0],
    [1, 1, 0],
    [1, 0, 1],
    [2, 1, 1],
]
output = mos.run(input=input_data, python=True)
print(output[0].score)  # Prints: 0
model = output[0].eval
print(model([0, 1]))  # Returns: True
print(model([1, 1]))  # Returns: False