In [1]:
import sys
import os
import lzma
import random

In [2]:
import catboost
import pandas


In [3]:
cancer_data_dir = '/home/leron/projects/cancer/data'
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [4]:
dtype = {'DFS': pandas.Int64Dtype(),
         'pCR': pandas.Int64Dtype(),
         'RFS': pandas.Int64Dtype(), 
         'DFS': pandas.Int64Dtype(), 
         'posOutcome': pandas.Int64Dtype()}

In [5]:
surgery_mapping = dict()

In [6]:
def convert_surgery(x):
    if x not in surgery_mapping:
        surgery_mapping[x] = len(surgery_mapping) + 1
    return surgery_mapping[x]

In [7]:
bmc = pandas.read_csv(bmc_all_path, dtype=dtype, converters=dict(surgery=convert_surgery))
bmc = bmc.sort_values(by='patient_ID')

In [8]:

eval_study = random.choice(list(set(bmc.study)))
eval_study

'study_19615_GPL570_all-bmc15'

In [9]:
bmc_train = bmc[bmc.study != eval_study]
bmc_val = bmc[bmc.study == eval_study]
assert (not set(bmc_train.patient_ID).intersection(set(bmc_val.patient_ID)))

In [10]:
bmc.dtypes

study         object
patient_ID     int64
radio          int64
surgery        int64
chemo          int64
hormone        int64
pCR            Int64
RFS            Int64
DFS            Int64
posOutcome     Int64
dtype: object

In [11]:
gene_expression = pandas.read_csv(lzma.open(merged_path))

In [12]:
gene_expression.head(5)

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


In [13]:
genes_features = gene_expression[gene_expression.patient_ID.isin(bmc.patient_ID)]

In [14]:
genes_features = genes_features.sort_values(by='patient_ID')


In [21]:
feature_columns = genes_features.columns.to_list()[1:] + ['radio', 'surgery', 'chemo', 'hormone']
label_columns = ['pCR', 'RFS', 'DFS', 'posOutcome']
label_columns = ['posOutcome']

In [23]:
merged = pandas.merge(genes_features, bmc, left_on='patient_ID', right_on='patient_ID')

In [26]:
train_split = merged[merged.patient_ID.isin(bmc_train.patient_ID)]
val_split = merged[merged.patient_ID.isin(bmc_val.patient_ID)]

In [27]:
assert val_split.patient_ID.to_list() == bmc_val.patient_ID.to_list()

In [28]:
train_data = train_split[feature_columns].to_numpy()
train_labels = train_split[label_columns].to_numpy().astype(int)
val_data = val_split[feature_columns].to_numpy()
val_labels = val_split[label_columns].to_numpy().astype(int)

In [29]:
train_data[0]

array([-0.11895262,  1.18034458,  0.25264325, ...,  1.        ,
        0.        ,  1.        ])

In [30]:
train_labels.flatten().max()

1

In [31]:
from catboost import Pool, CatBoostClassifier

In [32]:

catboost_pool = Pool(train_data, 
                    train_labels)

test_data = Pool(val_data,
                 val_labels)

In [None]:
model = CatBoostClassifier(iterations=140,
                           depth=4,
                           learning_rate=0.5,
                           loss_function='Logloss',
                           verbose=False,
                           l2_leaf_reg=1)
# train the model
res = model.fit(train_data, train_labels, 
          eval_set=test_data,
          save_snapshot=False, snapshot_file='vasya')

In [38]:
res.eval_metrics(catboost_pool, ['F1', 'Recall', 'Precision', 'AUC'])

{'F1': [0.7905027932960893,
  0.8015320334261838,
  0.8027923211169286,
  0.8092404620231012,
  0.811706629055007,
  0.8197530864197531,
  0.8277562522014793,
  0.8289054197662061,
  0.8333922886452069,
  0.8391211906449326,
  0.8417699115044248],
 'Recall': [0.8775193798449612,
  0.8922480620155039,
  0.8914728682170543,
  0.896124031007752,
  0.8922480620155039,
  0.9007751937984496,
  0.9108527131782945,
  0.9069767441860465,
  0.9131782945736434,
  0.9178294573643411,
  0.9217054263565891],
 'Precision': [0.7191867852604829,
  0.7275600505689002,
  0.7301587301587301,
  0.7377153797064454,
  0.7445019404915912,
  0.7521035598705501,
  0.7585539057456423,
  0.7632093933463796,
  0.7664281067013663,
  0.7728459530026109,
  0.7745928338762215],
 'AUC': [0.7018440216114634,
  0.7475240779891943,
  0.7656485788113695,
  0.7802757810664788,
  0.7916452901104064,
  0.8012901104063894,
  0.809169837914024,
  0.8169560723514212,
  0.827737843551797,
  0.8396645525017618,
  0.849103594080338

In [40]:
res.eval_metrics(test_data, ['F1', 'Recall', 'Precision', 'AUC'])

{'F1': [0.39999999999999997,
  0.39999999999999997,
  0.41269841269841273,
  0.39999999999999997,
  0.47328244274809156,
  0.5673758865248226,
  0.6394557823129251,
  0.6577181208053692,
  0.7375,
  0.7160493827160493,
  0.7160493827160493],
 'Recall': [0.2604166666666667,
  0.2604166666666667,
  0.2708333333333333,
  0.2604166666666667,
  0.3229166666666667,
  0.4166666666666667,
  0.4895833333333333,
  0.5104166666666666,
  0.6145833333333334,
  0.6041666666666666,
  0.6041666666666666],
 'Precision': [0.8620689655172413,
  0.8620689655172413,
  0.8666666666666667,
  0.8620689655172413,
  0.8857142857142857,
  0.8888888888888888,
  0.9215686274509803,
  0.9245283018867925,
  0.921875,
  0.8787878787878788,
  0.8787878787878788],
 'AUC': [0.5632440476190477,
  0.5587797619047619,
  0.6439732142857143,
  0.6104910714285714,
  0.6261160714285714,
  0.6149553571428571,
  0.6130952380952381,
  0.6231398809523809,
  0.6298363095238095,
  0.6101190476190477,
  0.5997023809523809]}

In [41]:
import xgboost as xgb

In [42]:
clf = xgb.XGBClassifier()
clf = clf.fit(train_data, train_labels)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [43]:
from sklearn.metrics import precision_recall_curve, auc

In [44]:
y_pred = clf.predict(val_data)

In [45]:
precision, recall, thresholds = precision_recall_curve(y_pred, val_labels.flatten())

In [46]:
print(auc(recall, precision))

0.9528506716651046
