For each cancer type (or for a single multiclass model covering all types), trains a classifier and saves the model and its evaluation results.



In [3]:
# Settings
#
# Cancer-type codes to train on; the driver cell maps create_classifier over
# this list, so one model is produced per entry.
cancer_types = [
    'BLCA',
    'BRCA',
    'COAD',
    'ESCA',
    'HNSC',
    'KIRC',
    'KIRP',
    'LIHC',
    'LUAD',
    'LUSC',
    'PRAD',
    'THCA',
    'UCEC',
]
# cancer_types = [''] # uncomment for a multiclass model. Warning: this model takes ages to train! (I think roughly 1 or 2 days)

# Which kind of model create_classifier should build.
classifier_type = 'XGBoost'

In [4]:
# Hyper-parameters handed to the XGBoost classifier (unpacked as keyword arguments).
# for binary, chosen from results of Hyperparam_search:
params = {'objective':'binary:logistic', 'max_depth': 10, 'n_estimators':450, 'learning_rate':0.189} # dictionary of parameters for the xgboost model
if cancer_types == ['']: # for multiclass
    # The multiclass model previously declared 'binary:logistic' here. That only
    # worked because XGBClassifier is expected to switch to a multiclass
    # objective on its own once it sees more than two classes; state the
    # multiclass objective explicitly so the config says what is actually trained.
    params = {
        'subsample': 0.5,
        'objective': 'multi:softprob',
        'n_estimators' : 800,
        'max_depth' : 3,
        'learning_rate' : 0.189,
        'colsample_bytree' : 0.5
    }

print("params is ", params)

params is  {'objective': 'binary:logistic', 'max_depth': 10, 'n_estimators': 450, 'learning_rate': 0.189}


In [None]:

def _next_run_number(directory, cancer_type):
    """Return the lowest run index not yet used by `<cancer_type>_run_<n>[.ext]` files in `directory`."""
    import os
    import re

    # Anchored match: only files that belong to exactly this cancer type count,
    # and unrelated or oddly named files are ignored instead of crashing int().
    pattern = re.compile(re.escape(cancer_type) + r'_run_(\d+)(?:\..*)?')
    taken = []
    for file_name in os.listdir(directory):
        found = pattern.fullmatch(file_name)
        if found:
            taken.append(int(found.group(1)))
    return max(taken) + 1 if taken else 0


def create_classifier(cancer_type):
    """Train, save and evaluate one classifier for `cancer_type`.

    Reads the notebook-level settings `classifier_type` and `params`.

    Steps:
      1. Load the train/test split via `get_train_and_test` (imported from '../').
      2. Fit an XGBoost classifier with `params`.
      3. Save the fitted model and a text dump of its trees under 'saved_models/'.
      4. Write every non-zero feature importance to a new, uniquely numbered
         file 'feature_importances/<cancer_type>_run_<n>.csv'.
      5. Print the evaluation and plot ROC / precision-recall curves through the
         project's `Evaluate` module.

    Parameters:
        cancer_type: string identifying the model; it is embedded in every
            output file name. '' is the value the settings cell uses for the
            multiclass model.

    Returns:
        None. All results are written to disk / printed.

    Raises:
        ValueError: if `classifier_type` is not 'XGBoost'.
    """
    # Fail fast: previously an unknown type was only printed, and the function
    # crashed later on the undefined model after the expensive data load.
    if classifier_type != 'XGBoost':
        raise ValueError("classifier type %r not known!" % (classifier_type,))

    import os
    import sys

    # adding this path so we can import get_train_and_test
    path = '../'
    if path not in sys.path:  # avoid growing sys.path on every call
        sys.path.append(path)
    from get_train_and_test import get_train_and_test
    root_path = path

    seed = 42 # using a seed for splitting up the train and test data
#     seed = None # if you don't use a seed you can get different xgboost feature importances

    m_values_train, m_values_test, diagnoses_train, diagnoses_test = get_train_and_test(
        cancer_type,
        use_small=False,
        root_path = root_path,
        model_path = root_path + '/xgboost/',
        model_type = classifier_type + '_' + cancer_type,
        seed = seed,
    )

    # fit the model
    import xgboost as xgb
    bst = xgb.XGBClassifier(**params)
    bst.fit(m_values_train, diagnoses_train)

    # save
    import joblib # joblib is apparently more efficient than pickle functions for model saving (see https://scikit-learn.org/stable/modules/model_persistence.html)
    os.makedirs('saved_models', exist_ok=True)  # a fresh checkout may not have the folder yet
    joblib.dump(bst, 'saved_models/xgboost_model_'+cancer_type+'.pkl')
    booster = bst.get_booster()
    booster.dump_model('saved_models/xgboost_trees_'+cancer_type+'.txt')

    # Collect the important features.
    # feature_importances_ is how important each feature (probe) is. Most of them seem to be 0, but some are > 0
    lines = [
        'feature nr: %d, importance: %7.5f' % (feature_index, importance)
        for feature_index, importance in enumerate(bst.feature_importances_)
        if importance > 0
    ]

    # saving lines: pick a run number higher than all existing files of this
    # type so we don't overwrite anything
    import numpy as np
    os.makedirs('feature_importances', exist_ok=True)
    num = _next_run_number('feature_importances/', cancer_type)
    np.savetxt('feature_importances/'+cancer_type+'_run_'+str(num)+'.csv', lines, delimiter='\t', fmt='%s')

    # NOTE: XGBoost finds different features on consecutive runs (because of the randomness) but it finds common features between runs (with higher importance I think) - when looking at features, look at these!

    # Evaluate:
    from Evaluate import print_evaluation, plot_curve
    print_evaluation(bst, m_values_test, diagnoses_test, 'xgboost_'+cancer_type)

    # create eval curves
    num_classes = len(np.unique(diagnoses_test, axis=0))
    print("num classes is: ", num_classes)
    confidence = bst.predict_proba(m_values_test)
    plot_curve('roc', diagnoses_test, confidence, num_classes, 'xgboost_'+cancer_type)
    plot_curve('precision_recall', diagnoses_test, confidence, num_classes, 'xgboost_'+cancer_type)


    

import multiprocessing as mp

# Train the models in parallel, one task per entry of cancer_types.
# Never start more workers than there are tasks (each worker loads its own copy
# of the data), and always keep at least one.
n_workers = max(1, min(mp.cpu_count(), len(cancer_types)))

# The context manager shuts the worker processes down once the work is done;
# pool.map blocks until every task has finished, so nothing is cut short.
with mp.Pool(processes=n_workers) as pool:
    # create_classifier returns None; results keeps its original shape
    # (a one-element list holding the list of per-type return values).
    results = [pool.map(create_classifier, cancer_types)]



Getting values and diagnoses from: 
/Tank/methylation-patterns-code/methylation-patterns-izzy/data_preprocessing/dataset/pandas/m_values/TCGA-all.csv
/Tank/methylation-patterns-code/methylation-patterns-izzy/data_preprocessing/dataset/pandas/diagnoses/TCGA-all.csv
m_value and diagnoses shapes:
(276016, 6224)
(6224,)
m values train, m values test, diagnoses train, diagnoses test shapes:
(4668, 276016) (1556, 276016) (4668,) (1556,)
