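For each cancer type (one binary classifier per type), or for all types at once (multiclass; see the settings below), this notebook trains a classifier of the configured type (LR or SVM), saves the fitted model (in saved_models/) and saves the evaluation results (in figs/).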


In [1]:
# settings:

# Cancer type codes to process; one binary model is built per entry.
cancer_types = (
    'BLCA BRCA COAD ESCA HNSC KIRC KIRP '
    'LIHC LUAD LUSC PRAD THCA UCEC'
).split()
# cancer_types = [''] # uncomment to create a multiclass model

# Which model family to fit: 'LR' (logistic regression) or 'SVM' (support vector machine).
classifier_type = 'SVM'

In [1]:
# Hyperparameters for the two supported classifier families.
# cancer_types == [''] selects the multiclass setup; anything else is binary.
# Only one value per dictionary differs between the two setups, so each
# dictionary is written once with that value chosen inline.

# Logistic regression: the multiclass solver ('lbfgs') was found from
# Hyperparam_search; binary models use 'newton-cg'.
params = {
    'penalty': 'l2',
    'solver': 'lbfgs' if cancer_types == [''] else 'newton-cg',
    'max_iter': 100,
}

# SVM: gamma='auto' was found from Hyperparam_search for multiclass; the
# binary setup uses the default gamma ('scale'). tol=1e-3 is the default too.
params_SVM = {
    'kernel': 'rbf',
    'tol': 1e-3,
    'gamma': 'auto' if cancer_types == [''] else 'scale',
    'probability': True,
}

In [2]:

def create_classifier(cancer_type):
    """Train, save and evaluate one classifier for ``cancer_type``.

    Loads the train/test split through ``get_train_and_test``, fits the model
    selected by the module-level ``classifier_type`` ('LR' or 'SVM') with the
    module-level ``params`` / ``params_SVM`` hyperparameters, dumps the fitted
    model under ``saved_models/`` and runs the ``Evaluate`` helpers
    (``print_evaluation`` plus ROC and precision-recall curves).

    Args:
        cancer_type: cancer type code forwarded to ``get_train_and_test`` and
            used in the model/result names. The settings cell uses '' to
            request the multiclass model.

    Returns:
        None. All results are produced as side effects (saved model, printed
        evaluation, curves drawn by ``Evaluate.plot_curve``).

    Raises:
        ValueError: if ``classifier_type`` is neither 'LR' nor 'SVM'.
    """
    import os
    import sys

    # Fail fast, before the expensive data load. Previously an unknown type
    # only printed a message and then crashed with a NameError on `model`.
    if classifier_type not in ('LR', 'SVM'):
        raise ValueError(
            "classifier type {!r} not known! (expected 'LR' or 'SVM')".format(classifier_type)
        )

    # get the data

    # adding this path so we can import get_train_and_test
    path = '../' # needs to be in top folder (where get_train_and_test.py is)
    if path not in sys.path:
        # guard so repeated calls in the same process do not grow sys.path
        sys.path.append(path)
    from get_train_and_test import get_train_and_test
    root_path = path

    seed = 42 # using a seed for splitting up the train and test data
    run_name = classifier_type + '_' + cancer_type
    m_values_train, m_values_test, diagnoses_train, diagnoses_test = get_train_and_test(
        cancer_type,
        use_small=False,
        root_path=root_path,
        model_path=root_path + '/simple_methods/',
        model_type=run_name,
        seed=seed,
    )

    # fit the model
    if classifier_type == 'LR':
        from sklearn.linear_model import LogisticRegression
        # penalty / solver / max_iter come from the `params` settings dictionary
        model = LogisticRegression(
            penalty=params['penalty'],
            solver=params['solver'],
            max_iter=params['max_iter'],
        )
    else: # 'SVM' (the only other value accepted by the check above)
        from sklearn.svm import SVC
        print("params are ", params_SVM)
        model = SVC(
            kernel=params_SVM['kernel'],
            probability=params_SVM['probability'],
            tol=params_SVM['tol'],
            gamma=params_SVM['gamma'],
        )

    fitted = model.fit(X=m_values_train, y=diagnoses_train)

    # save
    import joblib # joblib is apparently more efficient than pickle functions for model saving (see https://scikit-learn.org/stable/modules/model_persistence.html)
    # make sure the target folder exists so a long fit is not lost at dump time
    os.makedirs('saved_models', exist_ok=True)
    joblib.dump(fitted, 'saved_models/'+classifier_type+'_model_'+cancer_type+'.pkl')

    # Evaluate:
    from Evaluate import print_evaluation
    print_evaluation(fitted, m_values_test, diagnoses_test, run_name)

    # create eval curves
    # NOTE(review): plt is not referenced directly in this function; the import
    # is kept in case Evaluate.plot_curve relies on matplotlib being loaded
    # already -- confirm and drop if not needed.
    import matplotlib.pyplot as plt
    from Evaluate import plot_curve
    import numpy as np
    num_classes = len(np.unique(diagnoses_test, axis=0))
    print("num classes is: ", num_classes)
    # class probabilities; for SVC this requires probability=True (set in params_SVM)
    confidence = fitted.predict_proba(m_values_test)
    plot_curve('roc', diagnoses_test, confidence, num_classes, run_name)
    plot_curve('precision_recall', diagnoses_test, confidence, num_classes, run_name)


    

import multiprocessing as mp

# Train the classifiers in parallel, one task per entry of cancer_types.
# Never start more workers than there are tasks (the multiclass run has a
# single task), and always keep at least one worker.
n_workers = max(1, min(mp.cpu_count(), len(cancer_types)))

# The context manager shuts the pool down once map() has returned, so no
# worker processes are left behind in the notebook kernel.
with mp.Pool(processes=n_workers) as pool:
    # create_classifier returns None for every task; the single-element outer
    # list is kept so `results` has the same shape as before.
    results = [pool.map(create_classifier, cancer_types)]



Getting values and diagnoses from: 
/Tank/methylation-patterns-code/methylation-patterns-izzy/data_preprocessing/dataset/pandas/m_values/TCGA-all.csv
/Tank/methylation-patterns-code/methylation-patterns-izzy/data_preprocessing/dataset/pandas/diagnoses/TCGA-all.csv
m_value and diagnoses shapes:
(276016, 6224)
(6224,)
m values train, m values test, diagnoses train, diagnoses test shapes:
(4668, 276016) (1556, 276016) (4668,) (1556,)
params are  {'kernel': 'rbf', 'tol': 0.001, 'gamma': 'auto', 'probability': True}
Acc, conf mat:
0.9595115681233933
[[162   0   1   0   1   1   0   0   0   0   0   7   3   0]
 [  1 102   0   0   0   0   0   0   0   0   1   0   0   0]
 [  2   0 196   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  79   0   0   0   0   0   0   0   0   0   0]
 [  0   1   1   0  32  11   0   0   0   0   1   0   0   0]
 [  0   1   1   0   0 125   0   0   0   0   5   0   0   0]
 [  1   0   0   0   0   0  78   2   0   0   0   0   0   0]
 [  0   1   0   0   0   0   2  66   