In [1]:
import sys
import os
import lzma

In [2]:
import catboost
import pandas


In [3]:
# Root directory of the local data checkout. Set the CANCER_DATA_DIR environment
# variable to point elsewhere; the fallback is the original workstation path.
cancer_data_dir = os.environ.get('CANCER_DATA_DIR', '/home/leron/projects/cancer/data')
# Sub-directory holding the two input tables used by this notebook.
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
# xz-compressed CSV with one row per patient_ID and one column per gene.
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
# Plain CSV with the treatment and outcome columns per patient_ID.
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [4]:
# Outcome columns are read with pandas' nullable integer dtype.
# Each column is listed exactly once: a repeated key in a dict literal is
# silently overwritten, which hides typos.
dtype = {column: pandas.Int64Dtype()
         for column in ('DFS', 'pCR', 'RFS', 'posOutcome')}

In [5]:
# Lookup table from raw `surgery` cell value to integer code; filled lazily by
# convert_surgery as values are encountered.
surgery_mapping = {}

In [6]:
def convert_surgery(x):
    """Return the integer code for a raw `surgery` value.

    Codes are handed out in order of first appearance, starting at 1, and are
    remembered in the module-level `surgery_mapping` dict, so the same value
    always maps to the same code within a kernel session.
    """
    try:
        return surgery_mapping[x]
    except KeyError:
        # First time this value is seen: assign the next unused code.
        code = len(surgery_mapping) + 1
        surgery_mapping[x] = code
        return code

In [7]:
# Load the clinical table, integer-encoding `surgery` on the fly, and order the
# rows by patient_ID.
bmc = (
    pandas.read_csv(
        bmc_all_path,
        dtype=dtype,
        converters={'surgery': convert_surgery},
    )
    .sort_values(by='patient_ID')
)

In [8]:
# Show the per-column dtypes of the loaded clinical table.
bmc.dtypes

study         object
patient_ID     int64
radio          int64
surgery        int64
chemo          int64
hormone        int64
pCR            Int64
RFS            Int64
DFS            Int64
posOutcome     Int64
dtype: object

In [9]:
# Read the xz-compressed expression table. The context manager closes the
# decompression handle as soon as parsing finishes, so it does not stay open
# for the lifetime of the kernel.
with lzma.open(merged_path) as merged_file:
    gene_expression = pandas.read_csv(merged_file)

In [10]:
# Preview the first five rows of the expression table.
gene_expression.head()

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


In [11]:
# Keep only the expression rows whose patient_ID also appears in the clinical table.
train_feat = gene_expression.loc[gene_expression['patient_ID'].isin(bmc['patient_ID'])]

In [12]:
# Order by patient_ID, the same key the clinical table is sorted on.
train_feat = train_feat.sort_values('patient_ID')

In [13]:
# Model inputs: every column of train_feat after the first, followed by the four
# treatment indicators from the clinical table.
# NOTE(review): the [1:] slice assumes the first column of train_feat is the
# patient_ID key and not a feature — confirm against the loaded frame.
feature_columns = train_feat.columns.to_list()[1:] + ['radio', 'surgery', 'chemo', 'hormone']
# The clinical table offers four outcome columns ('pCR', 'RFS', 'DFS',
# 'posOutcome'); only 'posOutcome' is used as the training target.
label_columns = ['posOutcome']

In [14]:
# Guard: both tables must list exactly the same patient_IDs in the same order.
assert train_feat.patient_ID.to_list() == bmc.patient_ID.to_list()

In [15]:
# Join expression features with the clinical columns on the shared patient key
# (default inner join).
merged_train = train_feat.merge(bmc, left_on='patient_ID', right_on='patient_ID')

In [16]:
# Feature matrix as a plain numpy array, one row per merged patient record.
train_data = merged_train.loc[:, feature_columns].to_numpy()
# Target column(s) cast to built-in int.
train_labels = merged_train.loc[:, label_columns].to_numpy().astype(int)

In [17]:
# Largest label value present in the training targets.
train_labels.max()

2

In [18]:
from catboost import Pool, CatBoostClassifier

In [19]:

# Wrap the training arrays in a CatBoost Pool.
catboost_pool = Pool(data=train_data, label=train_labels)
# NOTE(review): `test_data` is an alias of the *training* pool, not a held-out
# set — confirm this is intended before using it for evaluation.
test_data = catboost_pool


In [24]:

# Small multi-class gradient-boosting model: 40 trees of depth 4, with
# per-iteration logging enabled.
model = CatBoostClassifier(
    iterations=40,
    depth=4,
    learning_rate=1,
    loss_function='MultiClass',
    verbose=True,
)
# Fit directly on the numpy arrays; the fitted estimator is the cell's output.
model.fit(X=train_data, y=train_labels)

0:	learn: 0.7967950	total: 59.7s	remaining: 38m 49s
1:	learn: 0.7552695	total: 1m 4s	remaining: 20m 16s
2:	learn: 0.7237539	total: 1m 6s	remaining: 13m 34s
3:	learn: 0.7052150	total: 1m 7s	remaining: 10m 11s
4:	learn: 0.6802453	total: 1m 9s	remaining: 8m 4s
5:	learn: 0.6643927	total: 1m 11s	remaining: 6m 43s
6:	learn: 0.6498434	total: 1m 12s	remaining: 5m 43s
7:	learn: 0.6390587	total: 1m 14s	remaining: 4m 58s
8:	learn: 0.6265640	total: 1m 15s	remaining: 4m 20s
9:	learn: 0.6117341	total: 1m 16s	remaining: 3m 49s
10:	learn: 0.5935958	total: 1m 17s	remaining: 3m 24s
11:	learn: 0.5835994	total: 1m 18s	remaining: 3m 3s
12:	learn: 0.5672392	total: 1m 19s	remaining: 2m 44s
13:	learn: 0.5517326	total: 1m 20s	remaining: 2m 29s
14:	learn: 0.5310425	total: 1m 21s	remaining: 2m 15s
15:	learn: 0.5173421	total: 1m 22s	remaining: 2m 3s
16:	learn: 0.5050993	total: 1m 23s	remaining: 1m 53s
17:	learn: 0.4920774	total: 1m 24s	remaining: 1m 43s
18:	learn: 0.4755416	total: 1m 25s	remaining: 1m 34s
19:	lea

<catboost.core.CatBoostClassifier at 0x7fa9e731e630>