In [1]:
import sys
sys.path.append('/usr/local/lib/python3/dist-packages/')

In [2]:
import sys
import os
import lzma
import random
from collections import defaultdict
import math

In [3]:
from typing import *

In [4]:
import numpy
import pandas

In [5]:
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [6]:
import sklearn

In [7]:
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

In [8]:
from catboost import Pool, CatBoostClassifier
import catboost

In [9]:
# Root directory of the cancer dataset. Set the CANCER_DATA_DIR environment
# variable to point the notebook at another location; the default is the
# original hard-coded path.
cancer_data_dir = os.environ.get('CANCER_DATA_DIR', '/home/noskill/projects/cancer/data')
dump_dir = os.path.join(cancer_data_dir, 'bcDump/example15bmc')
# Detailed per-patient treatment table.
clinical_table_path = os.path.join(cancer_data_dir, 'bcClinicalTable.csv')
# xz-compressed gene expression table.
merged_path = os.path.join(dump_dir, 'ex15bmcMerged.csv.xz')
# Averaged treatment table with the outcome columns.
bmc_all_path = os.path.join(dump_dir, 'bmc15mldata1.csv')

In [10]:
# Outcome columns can be empty for some patients, so they are read with the
# nullable integer dtype. Each column is listed once (the original literal
# repeated the 'DFS' key).
dtype = {'DFS': pandas.Int64Dtype(),
         'pCR': pandas.Int64Dtype(),
         'RFS': pandas.Int64Dtype(),
         'posOutcome': pandas.Int64Dtype()}

# Converters for mapping string data to numbers

In [11]:
def convert_surgery(x, surgery_mapping=dict()):
    """Map a surgery label to a stable integer code, numbering from 0.

    The default ``surgery_mapping`` dict is created once and is deliberately
    shared between calls, so a label keeps the code it received the first
    time it was seen. Pass an explicit dict to use an independent mapping.
    """
    # len() is evaluated before the insertion, so a new label gets the next free code.
    return surgery_mapping.setdefault(x, len(surgery_mapping))


def convert_node_status(x, mapping=dict()):
    """Map a categorical label to an integer code, numbering from 1.

    The strings 'NA' and 'NaN' become ``numpy.nan``; a non-string NaN is
    returned unchanged. Any other value gets the next free code on first
    sight and the same code afterwards. The default ``mapping`` dict is
    shared between calls on purpose so codes stay stable.
    """
    if x == 'NA' or x == 'NaN':
        return numpy.nan
    if isinstance(x, str):
        already_missing = False
    else:
        already_missing = numpy.isnan(x)
    if already_missing:
        return x
    # Codes start at 1: the first label is stored as len({}) + 1.
    return mapping.setdefault(x, len(mapping) + 1)


def convert_race(x, mapping=dict()):
    """Encode a race label with convert_node_status, using a mapping private to this converter."""
    code = convert_node_status(x, mapping)
    return code

def convert_menapause(x, mapping=dict()):
    """Encode a menopausal-status label with convert_node_status, using a mapping private to this converter."""
    code = convert_node_status(x, mapping)
    return code

# Column name -> converter handed to pandas.read_csv.
# Both surgery columns use convert_surgery and therefore share its default mapping.
converters = {
    'preTrt_lymph_node_status': convert_node_status,
    'race': convert_race,
    'menopausal_status': convert_menapause,
    'surgery_type': convert_surgery,
    'surgery': convert_surgery,
}

# load averaged treatment table

In [12]:
# Read the averaged treatment table and order its rows by patient.
bmc = pandas.read_csv(bmc_all_path, dtype=dtype, converters=converters).sort_values(by='patient_ID')

# load detailed treatment

In [13]:
treatment = pandas.read_csv(clinical_table_path, converters=converters)
treatment = treatment.sort_values(by='patient_ID')
# Keep only the patients that also appear in the averaged treatment table.
treatment = treatment[treatment.patient_ID.isin(bmc.patient_ID)]

In [14]:
bmc.head()

Unnamed: 0,study,patient_ID,radio,surgery,chemo,hormone,pCR,RFS,DFS,posOutcome
0,study_1379_GPL1223_all-bmc15,22449,0,0,0,1,,,0,0
1,study_1379_GPL1223_all-bmc15,22450,0,0,0,1,,,0,0
2,study_1379_GPL1223_all-bmc15,22451,0,0,0,1,,,0,0
3,study_1379_GPL1223_all-bmc15,22452,0,0,0,1,,,0,0
4,study_1379_GPL1223_all-bmc15,22453,0,0,0,1,,,1,1


# load gene expression data

In [15]:
# Open the xz archive in a context manager so the file handle is closed once
# the table has been parsed (the original left it open).
with lzma.open(merged_path) as merged_file:
    gene_expression = pandas.read_csv(merged_file)

In [16]:
gene_expression.head(5)

Unnamed: 0,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,GCGR,...,ZNF80,ZNF83,ZNF84,ZNF91,ZNHIT2,ZSCAN2,ZXDC,ZYX,ZZEF1,ZZZ3
0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,0.104353,...,-1.564143,0.466733,0.827552,-0.617981,0.303161,1.260602,-0.217995,0.219529,0.389849,1.313703
1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,-0.214238,...,0.711752,0.358388,0.037911,2.304784,0.328942,-1.028791,-0.850002,-0.292574,-0.068982,0.722123
2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,0.027471,...,-0.011786,-0.474762,-0.349981,-0.097197,0.100946,-0.5547,-0.367363,0.094464,-0.372665,-0.790771
3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,0.233178,...,0.757255,0.590212,0.06015,2.287583,-0.108866,-1.1325,-0.106976,-0.216267,0.393671,-0.027349
4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,-0.132271,...,0.407159,0.570637,0.851658,-0.41295,0.105692,-1.047445,0.08448,-0.224081,-0.021074,0.764555


In [17]:
# Keep only the expression rows of patients present in the averaged treatment table.
genes_features = gene_expression[gene_expression.patient_ID.isin(bmc.patient_ID)]

In [18]:
# Same row order (by patient_ID) as bmc and treatment.
genes_features = genes_features.sort_values(by='patient_ID')

# columns to use for training

In [19]:
# 50 gene symbols, one per line; split() turns the text block into a list of
# strings. The variable name suggests this is the PAM50 panel - TODO confirm.
pam50 = """ACTR3B
ANLN
BAG1
BCL2
BIRC5
BLVRA
CCNB1
CCNE1
CDC20
CDC6
NUF2
CDH3
CENPF
CEP55
CXXC5
EGFR
ERBB2
ESR1
EXO1
FGFR4
FOXA1
FOXC1
GPR160
GRB7
KIF2C
NDC80
KRT14
KRT17
KRT5
MAPT
MDM2
MELK
MIA
MKI67
MLPH
MMP11
MYBL2
MYC
NAT1
ORC6
PGR
PHGDH
PTTG1
RRM2
SFRP1
SLC39A6
TMEM45B
TYMS
UBE2C
UBE2T""".split()

In [23]:
# Columns of the merged table used as non-gene features (added to the gene
# columns when feature_columns is built).
# NOTE(review): 'intarvenous' looks like a misspelling of 'intravenous'; it
# presumably matches the header in the clinical table - confirm before renaming.
treatment_columns = ['tumor_size_cm_preTrt_preSurgery', 
                     'preTrt_lymph_node_status', 
                     'preTrt_totalLymphNodes', 
                     'preTrt_numPosLymphNodes', 
                     'hist_grade', 
                     'nuclear_grade_preTrt', 
                     'age', 'race', 'menopausal_status', 'surgery_type', 'intarvenous', 'intramuscular', 'oral', 
                     'radiotherapyClass', 'chemotherapyClass', 'hormone_therapyClass', 'postmenopausal_only', 
                     'anthracycline', 'taxane', 'anti_estrogen', 'aromatase_inhibitor',
                     'estrogen_receptor_blocker', 'estrogen_receptor_blocker_and_stops_production', 'anti_HER2', 
                     'tamoxifen', 'doxorubicin', 
                     'epirubicin', 'docetaxel', 'capecitabine', 'fluorouracil',
                     'paclitaxel', 'cyclophosphamide', 'anastrozole',
                     'trastuzumab', 'letrozole', 'chemotherapy',
                     'no_treatment', 'methotrexate', 'other', 'taxaneGeneral']

In [24]:
# Gene expression columns that appear in the pam50 list, in table order.
pam50col = genes_features.columns[genes_features.columns.isin(pam50)].to_list()

In [25]:
aggregated_treatment_columns = ['radio', 'surgery', 'chemo', 'hormone']
# Only posOutcome is used as the training target. The other outcome columns of
# the averaged treatment table are 'pCR', 'RFS' and 'DFS'.
label_columns = ['posOutcome']
# Every gene expression column except the leading patient_ID.
genes_columns = genes_features.columns.to_list()[1:]
# Alternative feature sets tried earlier: pam50col instead of all genes, or
# additionally aggregated_treatment_columns.
feature_columns = genes_columns + treatment_columns

## merge genes expression + averaged treatment + detailed treatment

In [26]:
# Inner-join gene expression, averaged treatment and detailed treatment on patient_ID.
merged = genes_features.merge(bmc, on='patient_ID')
merged = merged.merge(treatment, on='patient_ID')
# Positional row number; get_loc uses it to turn a patient_ID into an iloc index.
merged.insert(0, 'row_num', range(len(merged)))

In [27]:
def split_by_study(study_name=None):
    """
    Leave-one-study-out split generator.

    Yields ``(train_data, train_labels, val_data, val_labels)`` as numpy
    arrays, holding out one study per iteration. When ``study_name`` is
    given, only that study is held out and the generator stops after it.
    """
    held_out_studies = set(bmc.study)
    if study_name and held_out_studies:
        # A single, caller-chosen study replaces the full iteration.
        held_out_studies = [study_name]
    for eval_study in held_out_studies:
        print(eval_study)
        bmc_val = bmc[bmc.study == eval_study]
        bmc_train = bmc[bmc.study != eval_study]
        assert (not set(bmc_train.patient_ID).intersection(set(bmc_val.patient_ID)))

        val_split = merged[merged.patient_ID.isin(bmc_val.patient_ID)]
        train_split = merged[merged.patient_ID.isin(bmc_train.patient_ID)]
        # The merged rows must line up one-to-one with the study's patients.
        assert val_split.patient_ID.to_list() == bmc_val.patient_ID.to_list()

        yield (train_split[feature_columns].to_numpy(),
               train_split[label_columns].to_numpy().astype(int),
               val_split[feature_columns].to_numpy(),
               val_split[label_columns].to_numpy().astype(int))

In [28]:
def select_balanced_idx(study, num):
    """Split one study into train and class-balanced validation patients.

    Parameters
    ----------
    study : DataFrame with ``patient_ID`` and ``posOutcome`` columns.
    num : requested validation size; odd values are rounded up to the next
        even number so both outcomes get the same count.

    Returns
    -------
    (train, validation) : two DataFrames that partition ``study``. The
        validation part holds equally many randomly drawn positive and
        negative patients. If either outcome has fewer than ``num / 2``
        patients, the validation set is capped at the size of the smaller
        class (the original raised IndexError here).
    """
    if num % 2 != 0:
        num += 1
    pos_outcome = study[study.posOutcome == 1].patient_ID
    neg_outcome = study[study.posOutcome == 0].patient_ID
    pos_idx = numpy.arange(len(pos_outcome))
    neg_idx = numpy.arange(len(neg_outcome))
    random.shuffle(pos_idx)
    random.shuffle(neg_idx)
    # One positive and one negative patient per pair; never ask for more
    # pairs than the rarer class can supply.
    n_pairs = min(num // 2, len(pos_outcome), len(neg_outcome))
    validation = []
    for i in range(n_pairs):
        validation.append(pos_outcome.iloc[pos_idx[i]])
        validation.append(neg_outcome.iloc[neg_idx[i]])
    in_validation = study.patient_ID.isin(validation)
    return study[~in_validation], study[in_validation]

In [29]:
def resample_patients_by_study(study_patients: Dict[str, list]) -> Dict[str, list]:
    """Upsample every study's patient list to the length of the longest one.

    Each list is kept as is and padded with patients drawn from it at random
    (with replacement) until all studies have the same number of entries.
    """
    target_length = max([len(patients) for patients in study_patients.values()])
    result = defaultdict(list)
    for study, patients in study_patients.items():
        padding = [random.choice(patients) for _ in range(target_length - len(patients))]
        result[study] = list(patients) + padding
    assert all([(len(patients) == target_length) for patients in result.values()])
    return result

In [30]:
def get_loc(patient_ID, frame):
    """Return the ``row_num`` of the first row in ``frame`` with this ``patient_ID``.

    Raises IndexError when the patient is not present.
    """
    matching_rows = frame.loc[frame.patient_ID == patient_ID, 'row_num']
    return matching_rows.to_list()[0]

In [31]:
def random_split(ratio=0.1, study_name=None, rand=False, to_numpy=True, balance_by_study=False):
    """
    Split dataset into train and validation sets, study by study.

    For every study (or only ``study_name`` when given) a class-balanced
    validation subset is drawn with ``select_balanced_idx``; the remaining
    patients of that study go to the training set.

    Parameters
    ----------
    ratio : fraction of each study's patients requested for validation.
    study_name : restrict the split to this study; ``None`` uses all studies.
    rand : replace both feature matrices with standard-normal noise of the
        same shape (labels are kept). The features are then numpy arrays
        regardless of ``to_numpy``.
    to_numpy : return numpy arrays (labels as flat int vectors) instead of
        DataFrames.
    balance_by_study : upsample training patients with replacement so every
        study contributes the same number of training rows.
    --------------
    Returns: train_data, train_labels, val_data, val_labels, expected
        expected - confusion matrix expected from classification by ratio of positive/negative for each study
    """
    val_dict = defaultdict(list)
    train_dict = defaultdict(list)
    expected = {'TN': 0, 'FN': 0, 'FP': 0, 'TP': 0}

    for eval_study in set(bmc.study):
        if study_name is not None and study_name != eval_study:
            continue
        study_patients = bmc[bmc.study == eval_study]
        num_select = math.ceil(len(study_patients) * ratio)
        bmc_train, bmc_val = select_balanced_idx(study_patients, num_select)
        # Confusion matrix of a classifier that guesses according to the
        # positive/negative ratio among this study's training patients.
        pos_prob_train = bmc_train.posOutcome.sum() / len(bmc_train)
        neg_prob_train = 1 - pos_prob_train
        P = bmc_val.posOutcome.sum()
        N = len(bmc_val) - P
        TN = N * neg_prob_train
        TP = P * pos_prob_train
        FP = N - TN
        FN = P - TP
        expected['TN'] += TN
        expected['TP'] += TP
        expected['FP'] += FP
        expected['FN'] += FN
        val_dict[eval_study] = bmc_val.patient_ID.to_list()
        train_dict[eval_study] = bmc_train.patient_ID.to_list()

    if balance_by_study:
        train_dict = resample_patients_by_study(train_dict)
        # Upsampling repeats patients, so rows are selected by position
        # (isin() would collapse the duplicates).
        iloc = []
        for patient_lst in train_dict.values():
            iloc += [get_loc(p, merged) for p in patient_lst]
        train_split = merged.iloc[iloc]
    else:
        train_patients = []
        for patient_lst in train_dict.values():
            train_patients += patient_lst
        train_split = merged[merged.patient_ID.isin(train_patients)]
    val_patients = []
    for patient_lst in val_dict.values():
        val_patients += patient_lst
    val_split = merged[merged.patient_ID.isin(val_patients)]

    train_data = train_split[feature_columns]
    train_labels = train_split[label_columns]
    val_data = val_split[feature_columns]
    val_labels = val_split[label_columns]
    if rand:
        # randn already returns ndarrays; calling .to_numpy() on them (as the
        # original did when to_numpy was also set) raised AttributeError.
        train_data = numpy.random.randn(*train_data.shape)
        val_data = numpy.random.randn(*val_data.shape)
    elif to_numpy:
        train_data = train_data.to_numpy()
        val_data = val_data.to_numpy()
    if to_numpy:
        train_labels = train_labels.to_numpy().astype(int).ravel()
        val_labels = val_labels.to_numpy().astype(int).ravel()
    return train_data, train_labels, val_data, val_labels, expected

In [32]:
def compute_metrics(result, y_true, y_pred, x_true, x_pred):
    """Append one round of validation and training metrics to ``result``.

    ``result`` maps metric name -> list. ``y_*`` are validation labels and
    predictions, ``x_*`` the training ones. Accuracy is derived from the
    first two rows of the validation confusion matrix.
    """
    steps = (
        ('recall', recall_score, y_true, y_pred),
        ('precision', precision_score, y_true, y_pred),
        ('f1', f1_score, y_true, y_pred),
        ('confusion', confusion_matrix, y_true, y_pred),
        ('train_f1', f1_score, x_true, x_pred),
        ('train_confusion', confusion_matrix, x_true, x_pred),
    )
    for key, metric, truth, predicted in steps:
        result[key].append(metric(truth, predicted))
    confusion = result['confusion'][-1]
    n_correct = confusion[0][0] + confusion[1][1]
    n_total = sum(confusion[0]) + sum(confusion[1])
    result['accuracy'].append(n_correct / n_total)

# catboost

In [None]:
# Accumulator for the metrics appended by compute_metrics.
res = defaultdict(list)
model = CatBoostClassifier(iterations=3600,
                           depth=7,
                           use_best_model=True,
                           learning_rate=0.005,
                           loss_function='Logloss',
                           model_size_reg=20,
                           verbose=True,
                           scale_pos_weight=0.605,
                           l2_leaf_reg=20,
                           od_type='Iter', od_wait=200)
# One random split; training patients are upsampled so each study has equally many rows.
train_data, train_labels, val_data, val_labels, expected = random_split(balance_by_study=True)
# NOTE(review): catboost_pool is built but not used in this cell; fit() below
# receives the raw arrays instead.
catboost_pool = Pool(train_data, 
                    train_labels)

# Validation pool, passed to fit() as eval_set.
test_data = Pool(val_data,
                 val_labels) 
# train the model
clf = model.fit(train_data, train_labels, 
          eval_set=test_data,
          save_snapshot=False, snapshot_file='vasya')
y_pred = clf.predict(val_data)
x_pred = clf.predict(train_data)
compute_metrics(res, val_labels.flatten(), y_pred, train_labels, x_pred)
res

# SVM

In [None]:
from sklearn import datasets, svm, metrics
# Metrics of every repetition, keyed by metric name.
svm_total = defaultdict(list)
# 30 independent random splits; a fresh RBF SVM is trained on each.
for i in range(30):
    print('iteration {0}'.format(i))
    model = svm.SVC(C=1, kernel='rbf', class_weight={1: 0.5})
    train_data, train_labels, val_data, val_labels, expected = random_split(balance_by_study=True)
    # train the model
    # nan_to_num replaces missing feature values with 0 before fitting/predicting.
    clf = model.fit(numpy.nan_to_num(train_data), numpy.nan_to_num(train_labels))
    y_pred = clf.predict(numpy.nan_to_num(val_data))
    x_pred = clf.predict(numpy.nan_to_num(train_data))
    compute_metrics(svm_total, val_labels.flatten(), y_pred, train_labels, x_pred)
# Average every metric (element-wise for the confusion matrices) over the repetitions.
for key in svm_total:
    ave = numpy.asarray(svm_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# 

# SVM - single study

In [None]:
from sklearn import datasets, svm, metrics

# Regularisation strength per study; studies not listed here use C = 1.
svm_c_by_study = {
    'study_16446_GPL570_all-bmc15': 1.5,
    'study_22358_GPL5325_all-bmc15': 0.1,
    'study_22226_GPL1708_all-bmc15': 2,
    'study_20181_GPL96_all-bmc15': 0.72,
    'study_25065_GPL96_MDACC-bmc15': 1.72,
}

svm_total = defaultdict(list)
for study in set(bmc.study):
    print('\n' * 2 + study)
    # 20 random splits inside the study; train and validation come from the same study.
    for i in range(20):
        train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1, study_name=study)
        C = svm_c_by_study.get(study, 1)
        # Weight the positive class by the negative/positive ratio of the training labels.
        positive_rate = numpy.mean(train_labels)
        model = svm.SVC(C=C, kernel='rbf', class_weight={1: (1 - positive_rate) / positive_rate})
        # train the model
        clf = model.fit(numpy.nan_to_num(train_data), numpy.nan_to_num(train_labels))
        y_pred = clf.predict(numpy.nan_to_num(val_data))
        x_pred = clf.predict(numpy.nan_to_num(train_data))
        compute_metrics(svm_total, val_labels.flatten(), y_pred, train_labels, x_pred)
for key in svm_total:
    ave = numpy.asarray(svm_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# nearest neighbour classifier

In [None]:
def predict(train_data, validation_data, train_labels):
    """1-nearest-neighbour prediction by Euclidean distance.

    For each row of ``validation_data`` the closest row of ``train_data`` is
    found (first one on ties) and its entry in ``train_labels`` is returned.
    """
    nearest = [
        numpy.argmin(numpy.sqrt(numpy.sum((train_data - validation_data[row]) ** 2, axis=1)))
        for row in range(len(validation_data))
    ]
    return train_labels[nearest]

In [None]:
nearest_total = defaultdict(list)
for i in range(10):
    train_data, train_labels, val_data, val_labels, expected = random_split()
    y_pred = predict(numpy.nan_to_num(train_data), numpy.nan_to_num(val_data), train_labels)
    # Training-set predictions of a 1-nearest-neighbour model: every training
    # row is its own nearest neighbour (distance 0), so the prediction equals
    # the training labels unless two rows have identical features but
    # different labels. This avoids the quadratic train-vs-train search and,
    # unlike before, no longer reuses an x_pred left over from another cell.
    x_pred = train_labels
    compute_metrics(nearest_total, val_labels.flatten(), y_pred, train_labels, x_pred)

for key in nearest_total:
    ave = numpy.asarray(nearest_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))

In [None]:
print(train_data.shape)
print(val_data.shape)

# naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# NOTE(review): the split made here is overwritten by the random_split() call
# inside the loop of the following cell before it is used.
train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1)

In [None]:
# NOTE: the accumulator keeps the name used by the nearest-neighbour section,
# although it now collects Gaussian naive Bayes metrics.
nearest_total = defaultdict(list)
for i in range(10):
    gnb = GaussianNB()
    train_data, train_labels, val_data, val_labels, expected = random_split()
    # Missing features are encoded as -1 for fitting AND for both predictions.
    # The original predicted on the training set with NaN -> 0, i.e. on
    # different inputs than the model was fitted on.
    train_features = numpy.nan_to_num(train_data, nan=-1)
    model = gnb.fit(train_features, numpy.nan_to_num(train_labels))
    y_pred = model.predict(numpy.nan_to_num(val_data, nan=-1))
    x_pred = model.predict(train_features)
    compute_metrics(nearest_total, val_labels.flatten(), y_pred, train_labels, x_pred)

for key in nearest_total:
    ave = numpy.asarray(nearest_total[key]).mean(axis=0)
    print('{0}: {1}'.format(key, ave))



In [None]:
train_data.shape

# binarization

In [33]:
merged.head()

Unnamed: 0,row_num,patient_ID,MAGEA12,MAGEA11,KLF1,ADH7,MSH4,BIRC3,AKR1C4,GBX2,...,chemotherapy,hormone_therapy,no_treatment,methotrexate,cetuximab,carboplatin,other,taxaneGeneral,neoadjuvant_or_adjuvant,study_specific_protocol_number
0,0,22449,-0.118953,1.180345,0.252643,-0.262987,0.142903,0.167314,0.498846,0.774632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
1,1,22450,0.423693,-0.922374,-1.202192,-0.105451,-0.061571,-0.093231,-0.09555,-0.481403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
2,2,22451,-0.239183,-0.733389,0.523791,-0.081958,-0.004635,-0.008094,0.268636,-0.614192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
3,3,22452,0.500445,-0.177686,-0.216638,-0.13085,-0.261039,-0.048521,1.479664,-0.10012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0
4,4,22453,-0.609235,0.259494,-0.071802,0.027963,0.162509,0.112654,-0.239435,0.229737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,adj,1.0


In [34]:
def digitize_genes(col):
    """Quantise a numeric column into integer levels 0..15.

    The value range is cut into 15 equal-width bins. The column maximum lies
    on the last edge and therefore gets a level of its own, which gives 16
    possible levels in total.
    """
    bin_edges = numpy.histogram_bin_edges(col, bins=15)
    levels = numpy.digitize(col, bin_edges)
    # Shift so that the lowest occupied level is 0.
    return levels - levels.min()

In [35]:
# Feature columns of the merged table that are not gene expression columns.
other_columns = [
    column
    for column in merged.columns[~merged.columns.isin(genes_columns)]
    if column in feature_columns
]

In [36]:
# Lookup table: row i holds the 4-bit binary representation of i
# (most significant bit first) as booleans.
res_ar = numpy.zeros((16, 4), dtype=numpy.bool_)
for i in range(16):
    ar0 = format(i, '04b')
    res_ar[i] = [int(bit) for bit in ar0]


In [37]:
res_ar

array([[False, False, False, False],
       [False, False, False,  True],
       [False, False,  True, False],
       [False, False,  True,  True],
       [False,  True, False, False],
       [False,  True, False,  True],
       [False,  True,  True, False],
       [False,  True,  True,  True],
       [ True, False, False, False],
       [ True, False, False,  True],
       [ True, False,  True, False],
       [ True, False,  True,  True],
       [ True,  True, False, False],
       [ True,  True, False,  True],
       [ True,  True,  True, False],
       [ True,  True,  True,  True]])

In [70]:
def binary_genes():
    """Encode every gene expression column of ``merged`` as four boolean columns.

    Each gene is quantised with ``digitize_genes`` and the resulting level is
    written as its 4-bit binary code taken from ``res_ar``.

    Returns a dict mapping ``'<gene>_0'`` .. ``'<gene>_3'`` (most significant
    bit first) to boolean arrays with one entry per row of ``merged``.
    """
    result = dict()
    for gene in genes_columns:
        levels = numpy.asarray(digitize_genes(merged[gene]))
        # One table lookup for the whole column instead of a Python loop over patients.
        bits = res_ar[levels]
        for bit in range(4):
            result[gene + '_{0}'.format(bit)] = bits[:, bit]
    return result

In [71]:
def binary_non_genes():
    """Encode every non-gene feature column of ``merged`` as boolean columns.

    Each column in ``other_columns`` is quantised with
    ``digitize_non_genes_data`` and the level is written in binary using the
    rows of ``res_ar``. Only as many low-order bits are kept as the largest
    level needs.

    The bit width is derived from the largest level, not from the number of
    distinct levels. With the latter, a column whose levels have gaps (for
    example {0, 2}) was truncated to too few bits and different levels
    collapsed onto the same code.

    Returns a dict mapping ``'<column>_<i>'`` to boolean arrays with one
    entry per row of ``merged``.
    """
    non_genes_data = dict()
    max_bits = res_ar.shape[1]
    for oth in other_columns:
        dig = numpy.asarray(digitize_non_genes_data(merged[oth]))
        top_level = int(dig.max()) if dig.size else 0
        # At least one bit, even for a column with a single level.
        n_bits = max(1, top_level.bit_length())
        assert n_bits <= max_bits, 'column {0} needs {1} bits'.format(oth, n_bits)
        bits = res_ar[dig][:, -n_bits:]
        for i in range(n_bits):
            non_genes_data[oth + '_{0}'.format(i)] = bits[:, i]
    return non_genes_data

def binarize_dataset():
    """Build the fully binarised dataset as a DataFrame of 0/1 integers.

    Columns are ``posOutcome`` followed by the bit columns of all genes and
    of all non-gene features. Rows are sorted by ``patient_ID``, which is
    dropped from the result afterwards.
    """
    columns = {
        'patient_ID': merged.patient_ID.to_list(),
        'posOutcome': merged.posOutcome.to_list(),
    }
    columns.update(binary_genes())
    columns.update(binary_non_genes())
    frame = pandas.DataFrame(data=columns)
    frame = frame.sort_values(by='patient_ID')
    frame = frame.drop(columns=['patient_ID'])
    # Multiplying by 1 turns the boolean bit columns into integers.
    return frame * 1

In [72]:
bin_data = binarize_dataset()

In [73]:
bin_data.shape

(2225, 35389)

In [74]:
bin_data.to_csv('/tmp/cancer_bin.csv', header=True, index=False)

In [None]:
res_ar[0][-4:]

In [None]:
[12 < 2 ** x for x in range(1,5)]

In [41]:
def digitize_non_genes_data(col):
    """Quantise a numeric column (pandas Series) into small integer levels.

    Missing values are encoded as -1 before binning. The number of bins is
    one less than the number of distinct values, with the distinct-value
    count capped at 15. Bin edges are computed with missing values replaced
    by the column minimum, so the -1 sentinel does not stretch the range.
    The returned levels are shifted so the lowest one is 0.

    A column with fewer than two distinct values (constant, all missing or
    empty) has nothing to bin and yields all zeros; the original asked for
    zero bins in that case and raised ValueError.
    """
    not_nan = numpy.nan_to_num(col, nan=-1)
    n_unique = min(len(set(not_nan)), 15)
    if n_unique < 2:
        return numpy.zeros(len(not_nan), dtype=numpy.intp)
    edge = numpy.histogram_bin_edges(numpy.nan_to_num(col, nan=col.min()), bins=n_unique - 1)
    digits = numpy.digitize(not_nan, edge)
    return digits - digits.min()

In [None]:
for oth in other_columns:
    print(oth)
    print(set(digitize_non_genes_data(merged[oth])))

In [None]:
other_columns[6]

In [None]:
set(numpy.nan_to_num(merged.hormone_therapy, nan=-1))

In [None]:
other_columns[0]

In [None]:
# NOTE(review): `r` is not defined anywhere in the visible notebook; this
# cell only works with a variable left over in the kernel.
numpy.histogram_bin_edges(r, bins=14)

In [None]:
other_columns[18]

In [None]:
# NOTE(review): `r` is not defined anywhere in the visible notebook; this
# cell only works with a variable left over in the kernel.
edges = numpy.histogram_bin_edges(r, bins=14)

In [None]:
edges

# moses

In [None]:
from opencog.pyasmoses import moses

In [None]:
train_data, train_labels, val_data, val_labels, expected = random_split(ratio=0.1, to_numpy=False)

In [None]:
train_data.shape

save data to file

In [None]:
train_data = train_data.fillna(-1)
train_data.to_csv('/tmp/input_data.csv', header=True, index=False)

In [None]:
set(train_data.surgery_type.to_list())

In [None]:
# random_split(to_numpy=False) returns the labels as a DataFrame, which cannot
# be indexed with [..., numpy.newaxis]. Convert them to an (n, 1) integer
# array first; a flat label vector is handled the same way.
label_column = numpy.asarray(train_labels).astype(int).reshape(len(train_labels), -1)
# Label goes into the first column, followed by the features.
input_data = numpy.concatenate([label_column, train_data], axis=1)

In [None]:
print(input_data.shape)

In [None]:
mos = moses()

In [None]:
args = "--log-file log1.txt.log --hc-fraction-of-nn 0.01 -j5 --balance 1 -m 100000 --result-count 100 --reduct-knob-building-effort=2 --hc-widen-search=1 --enable-fs=1 --fs-algo=smd --fs-target-size=1000 --hc-crossover-min-neighbors=5000 --fs-focus=all --fs-seed=init  --hc-max-nn-evals=10000 --hc-crossover-pop-size=1000 -l debug --noise 0.2 -q 0.05"

In [None]:
output = mos.run(input=numpy.nan_to_num(input_data), python=True, args=args)

In [None]:
output[0].score

In [None]:
mos = moses()
input_data = [[0, 0, 0],
              [0.2, 0.2, 0.4],
              [1, 1, 2],
              [1, 0, 1],
              [2., 1, 3]]
output = mos.run(input=input_data, python=True, args='-m 1000000 --max-time=60 --balance=1')
print (output[0].score) # Prints: 0
model = output[0].eval
print(model([0, 1]))  # Returns: True
print(model([1, 1]))  # Returns: False

In [None]:
moses_args = []
# target column
moses_args.append('--problem_type=it')
