# Evaluate Pseudo Labels

Questions:
1. Do labels obtained from a semi-supervised classifier improve a supervised classifier's performance?
2. Do high-confidence labels from an SSL model improve a supervised classifier's performance?

## Design

The original data set has 2801 samples x 5072 features. The data were split into 70% train and 30% test. This process was repeated 7 times to create 7 different train-test splits.

For each split:
1. Create a random forest model
2. Evaluate the model on 35% data with itest
3. Evaluate the model on 35% data + SSL GSE109379 with itest
4. Evaluate the model on 35% data + 35% pseudo-label data + GSE109379 pseudo-label data with itest
5. Evaluate the model on 35% data + 35% pseudo-label data + HC GSE109379 pseudo-label data with itest
6. Evaluate the model on 35% data + high-confidence pseudo-label data + GSE109379 pseudo-label data with itest
7. Evaluate the model on 35% data + high-confidence pseudo-label data + HC GSE109379 pseudo-label data with itest


Box plots for each model (one box plot shows the accuracies of one model over the 7 train-test splits)

In [1]:
import scipy.io
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import collections

def CountFrequency(arr):
    """Tally how often each distinct element of ``arr`` occurs.

    Returns a ``collections.Counter`` keyed by element, in order of first
    appearance.
    """
    tally = collections.Counter()
    tally.update(arr)
    return tally

def combine(y, str):
    """Build a frequency table of the values in ``y`` tagged with a dataset name.

    Parameters
    ----------
    y : iterable
        Values to count.
    str : object
        Tag written to the ``'Dataset'`` column of every row. The parameter
        name shadows the builtin; it is kept so keyword callers still work.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value. Column ``'Sample'`` holds the value and
        ``'pseudo_class_label'`` holds its count (column names kept as in the
        original), plus the ``'Dataset'`` tag.
    """
    freq = CountFrequency(y)
    # The file imports pandas only under the alias ``pd``; the bare name
    # ``pandas`` is undefined in a fresh session.
    d = pd.DataFrame(list(freq.items()), columns=['Sample', 'pseudo_class_label'])
    d['Dataset'] = [str] * len(d)
    return d

def get_freq_series(y):
    """Return the value counts of ``y`` as a Series indexed by the distinct values.

    The index follows the order in which values first appear in ``y``.
    """
    freq = CountFrequency(y)
    # Use the ``pd`` alias the file actually imports, and materialise the
    # dict views so the Series constructor receives plain lists.
    s = pd.Series(list(freq.values()), index=list(freq.keys()))
    return s

def process_csv(filename):
    """Load a data set from CSV and split it into features and target.

    The first CSV column is used as the row index. The column named ``'y'``
    is the target; every other column is a feature.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without the ``'y'`` column and
        ``y`` is the ``'y'`` column as a Series.
    """
    # The file imports pandas only as ``pd``; the bare name ``pandas`` is
    # undefined in a fresh session.
    df = pd.read_csv(filename, index_col=0)
    y = df['y']
    X = df.drop(columns=['y'])
    return (X, y)

def append_result(p, df):
    """Return a copy of ``df`` with the values in ``p`` added as a last row.

    Parameters
    ----------
    p : sequence
        Row values, one per column of ``df`` and in the same order.
    df : pandas.DataFrame
        Table to extend; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.
    """
    # ``DataFrame.append`` was removed in pandas 2.0 and the bare name
    # ``pandas`` is never imported here, so build a one-row frame and concat.
    row = pd.DataFrame([list(p)], columns=df.columns)
    if len(df) == 0:
        # Skip concatenating an empty frame: it adds nothing and newer
        # pandas versions warn about it.
        return row
    return pd.concat([df, row], ignore_index=True)

def apply_CV(mod, cv, X, y):
    """Cross-validate ``mod`` on ``(X, y)`` using the splitter ``cv``.

    Scores balanced accuracy, accuracy and weighted recall, and returns the
    dictionary produced by ``cross_validate``.
    """
    metrics = ['balanced_accuracy', 'accuracy', 'recall_weighted']
    scores = cross_validate(mod, X, y, cv=cv, scoring=metrics)
    return scores



def append_cv_results(df, r, seed_name, dataset_name):
    """Append the mean cross-validation scores in ``r`` to the results table.

    One ``'cross_val'`` row is added per metric (balanced accuracy, accuracy,
    weighted recall). Each value is the mean over the folds, rounded to two
    decimals. Returns the extended table.
    """
    metric_keys = (
        ('balanced_acc', 'test_balanced_accuracy'),
        ('accuracy', 'test_accuracy'),
        ('recall_weighted', 'test_recall_weighted'),
    )
    for metric, key in metric_keys:
        mean_score = r[key].mean().round(2)
        row = [seed_name, dataset_name, 'cross_val', metric, mean_score]
        df = append_result(row, df)
    return df

def accuracy_per_class(y_true, y_pred):
    """Return, for each class in ``y_true``, the fraction of its samples predicted correctly.

    The value for a class is its confusion-matrix diagonal entry divided by
    its row sum, i.e. the per-class recall.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Maps each distinct label of ``y_true`` to its per-class accuracy.
    """
    classes = np.unique(y_true)
    # Pin the matrix rows to the classes in y_true. Without ``labels`` the
    # matrix covers the union of true and predicted labels, so a predicted
    # class missing from y_true shifts the rows against ``classes`` and adds
    # an all-zero row (division by zero).
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    per_class_accuracies = cm.diagonal() / cm.sum(axis=1)
    return dict(zip(classes, per_class_accuracies))

def evaluate_against_test_set(mod, x_train, y_train, x_test, y_test):
    """Fit ``mod`` on the training data and score it on the test set.

    Parameters
    ----------
    mod : estimator
        Object with ``fit`` and ``predict``; it is fitted in place.
    x_train, y_train :
        Training features and labels.
    x_test, y_test :
        Test features and ground-truth labels.

    Returns
    -------
    tuple
        ``(weighted recall, accuracy, balanced accuracy, per-class accuracy)``.
        The three scalar scores are rounded to three decimals; the last item
        is the dict returned by ``accuracy_per_class``.
    """
    mod.fit(x_train, y_train)
    y_pred = mod.predict(x_test)
    per_class_acc = accuracy_per_class(y_test, y_pred)
    # The builtin ``round`` works whether the metric returns a NumPy scalar
    # or a plain Python float, which has no ``.round`` method.
    rec = round(recall_score(y_test, y_pred, average='weighted', zero_division=0), 3)
    # Metric signatures are (y_true, y_pred). Balanced accuracy is not
    # symmetric, so the ground truth must come first.
    acc = round(accuracy_score(y_test, y_pred), 3)
    b_acc = round(balanced_accuracy_score(y_test, y_pred), 3)
    return (rec, acc, b_acc, per_class_acc)

def append_testset_results(df, r, seed_name, dataset_name):
    """Append the hold-out test-set scores in ``r`` to the results table.

    ``r`` is the tuple returned by ``evaluate_against_test_set``:
    ``(weighted recall, accuracy, balanced accuracy, per-class accuracy)``.
    One ``'vs_testset'`` row is added per scalar metric. Returns the extended
    table.
    """
    rec, acc, b_acc = r[0], r[1], r[2]
    # Pair each label with the matching tuple position. The rows used to be
    # labelled positionally, which stored accuracy as 'balanced_acc' and
    # balanced accuracy as 'recall_weighted'. Row order is unchanged.
    df = append_result([seed_name, dataset_name, 'vs_testset', 'accuracy', acc], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'balanced_acc', b_acc], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'recall_weighted', rec], df)
    return df


def create_per_class_acc_df(class_acc, seed, type_dset):
    """Wrap a per-class accuracy mapping in a one-row DataFrame.

    Parameters
    ----------
    class_acc : dict
        Maps class label to accuracy.
    seed :
        Value stored in the ``'Seed'`` column.
    type_dset :
        Value stored in the ``'Dataset'`` column.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``'Seed'``, ``'Dataset'`` and then one
        column per class.
    """
    # ``DataFrame.append`` was removed in pandas 2.0, so build the row from
    # the mapping and put the identifying columns in front.
    d = pd.DataFrame([class_acc])
    d.insert(0, 'Seed', seed)
    d.insert(1, 'Dataset', type_dset)
    return d

def evaluate(seed, model, cv, mcf=False):
    """Evaluate ``model`` on every training-set variant of one train/test split.

    For each of the nine training files of split ``seed``, the model is
    cross-validated with ``cv`` and then refitted and scored against the
    split's hold-out test file.

    Parameters
    ----------
    seed :
        Split identifier used in the ``./data_family/seed{seed}_*.csv`` names.
    model : estimator
        Classifier to evaluate; it is refitted for every training set.
    cv :
        Cross-validation splitter passed to ``apply_CV``.
    mcf : bool
        Unused; kept for backward compatibility.

    Returns
    -------
    tuple
        ``(df, df_class)``. ``df`` is the long-format table with columns
        ``Seed, Dataset, Validation, Metric, Value``. ``df_class`` has one row
        per training set holding the per-class test-set accuracies.
    """
    # (dataset label, CSV path template, progress message), in the order the
    # variants are read, evaluated and stored.
    experiments = [
        ('35', './data_family/seed{}_35perc.csv',
         '2. Evaluate model with 35% data'),
        ('70', './data_family/seed{}_70perc.csv',
         '3. Evaluate model with 70% data'),
        ('70HC', './data_family/seed{}_70percHC.csv',
         '4. Evaluate model with 70% HC'),
        ('35_GSE', './data_family/seed{}_35perc_gse109379.csv',
         '5. Evaluate model with 35% data with GSE109379'),
        ('35_gseHC', './data_family/seed{}_35perc_HCgse109379.csv',
         '6. Evaluate model with 35% data with High confident GSE109379'),
        ('70_GSE', './data_family/seed{}_70perc_gse109379.csv',
         '7. Evaluate model with 70% + GSE90496 pseudo-labeled data'),
        ('70HC_GSE', './data_family/seed{}_70HC_gse109379.csv',
         '8. Evaluate model with 70% HC + GSE109379'),
        ('70_gseHC', './data_family/seed{}_70perc_gse109379HC.csv',
         '9. Evaluate model with 70% + HC GSE90496 '),
        ('HC', './data_family/seed{}_70HC_gse109379HC.csv',
         '10. Evaluate model with HC pseudo-labeled data'),
    ]
    hold_out_file = './data_family/seed{}_holdOutTest.csv'.format(seed)

    print('1. Read files (seed={})'.format(seed))
    training_sets = []
    for label, template, message in experiments:
        path = template.format(seed)
        print('\tReading', path)
        training_sets.append((label, message, process_csv(path)))
    print('\tReading', hold_out_file)
    xitest, yitest = process_csv(hold_out_file)

    outcomes = []
    for label, message, (X, y) in training_sets:
        print(message)
        print('\tcross validate', model, 'with', cv)
        cv_result = apply_CV(model, cv, X, y)
        print('\tvalidate against test set')
        ts_result = evaluate_against_test_set(model, X, y, xitest, yitest)
        outcomes.append((label, cv_result, ts_result))

    print('6. Store results.')
    df = pd.DataFrame(data=[], columns=['Seed', 'Dataset', 'Validation', 'Metric', 'Value'])
    class_frames = []
    for label, cv_result, ts_result in outcomes:
        df = append_cv_results(df, cv_result, seed, label)
        df = append_testset_results(df, ts_result, seed, label)
        # The per-class table uses the same label as the main table. It used
        # to say '35GSE' where the main table says '35_GSE'.
        class_frames.append(create_per_class_acc_df(ts_result[3], seed, label))

    # ``DataFrame.append`` was removed in pandas 2.0; concat stacks the
    # one-row frames and aligns class columns by name.
    df_class = pd.concat(class_frames)

    return (df, df_class)


def count_pseudo_label_per_family(seed):
    """Count the samples per label in every data file of one train/test split.

    Parameters
    ----------
    seed :
        Split identifier used in the ``./data_family/seed{seed}_*.csv`` names.

    Returns
    -------
    pandas.DataFrame
        One row per data file (nine training variants plus the hold-out set).
        Columns are ``'Dataset'``, ``'Seed'`` and then one column per label
        holding that label's count. Labels absent from a file are NaN.
    """
    # (dataset name, CSV path template), in output row order.
    datasets = [
        ('35', './data_family/seed{}_35perc.csv'),
        ('70', './data_family/seed{}_70perc.csv'),
        ('70HC', './data_family/seed{}_70percHC.csv'),
        ('35gse', './data_family/seed{}_35perc_gse109379.csv'),
        ('35gseHC', './data_family/seed{}_35perc_HCgse109379.csv'),
        ('70gse', './data_family/seed{}_70perc_gse109379.csv'),
        ('70HCgse', './data_family/seed{}_70HC_gse109379.csv'),
        ('70gseHC', './data_family/seed{}_70perc_gse109379HC.csv'),
        ('HC', './data_family/seed{}_70HC_gse109379HC.csv'),
        ('holdOut', './data_family/seed{}_holdOutTest.csv'),
    ]

    print('1. Read files (seed={})'.format(seed))
    names = []
    counts = []
    for name, template in datasets:
        path = template.format(seed)
        print('\tReading', path)
        # Only the labels are needed; the feature matrix is dropped at once.
        _, y = process_csv(path)
        names.append(name)
        counts.append(get_freq_series(y).to_dict())

    print('2. Store pseudo_label_counts')
    # ``DataFrame.append`` was removed in pandas 2.0 and the bare name
    # ``pandas`` is never imported here. A list of dicts yields one row per
    # file, with columns in order of first appearance.
    df = pd.DataFrame(counts)
    df.insert(0, 'Dataset', names)
    df.insert(1, 'Seed', [seed] * len(names))

    return df
   

def combine_input_label2(seed):
    """Count the samples per label in every data file of split ``seed``.

    This function was a line-for-line copy of
    ``count_pseudo_label_per_family``. It now delegates to it so the two
    cannot drift apart, and is kept as an alias for existing callers.

    Returns the DataFrame produced by ``count_pseudo_label_per_family(seed)``.
    """
    return count_pseudo_label_per_family(seed)

def evaluate_short(seed, model, cv, mcf=False):
    """Evaluate ``model`` on the 35% and 70% training sets of one split.

    A reduced variant of ``evaluate``: each of the two training sets is
    cross-validated with ``cv`` and then scored against the hold-out test
    file.

    Parameters
    ----------
    seed :
        Split identifier used in the ``./data_family/seed{seed}_*.csv`` names.
    model : estimator
        Classifier to evaluate; it is refitted for every training set.
    cv :
        Cross-validation splitter passed to ``apply_CV``.
    mcf : bool
        Unused; kept for backward compatibility.

    Returns
    -------
    tuple
        ``(df, df_class_acc)``: the long-format results table and the
        per-class test-set accuracy table (one row per training set).
    """
    print('1. Read files (seed={})'.format(seed))

    file35 = './data_family/seed{}_35perc.csv'.format(seed)
    file70 = './data_family/seed{}_70perc.csv'.format(seed)
    file70HC = './data_family/seed{}_70percHC.csv'.format(seed)
    fileHoldOut = './data_family/seed{}_holdOutTest.csv'.format(seed)

    print('\tReading', file35)
    X35, y35 = process_csv(file35)
    print('\tReading', file70)
    X70, y70 = process_csv(file70)
    print('\tReading', file70HC)
    # The 70% HC file is loaded but not evaluated below. The read is kept so
    # the progress output and file requirements stay as before.
    process_csv(file70HC)

    print('\tReading', fileHoldOut)
    xitest, yitest = process_csv(fileHoldOut)

    print('2. Evaluate model with 35% data')
    print('\tcross validate', model, 'with', cv)
    result_cv_35 = apply_CV(model, cv, X35, y35)
    print('\tvalidate against test set')
    result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)

    print('3. Evaluate model with 70% data')
    print('\tcross validate', model, 'with', cv)
    result_cv_70 = apply_CV(model, cv, X70, y70)
    print('\tvalidate against test set')
    result_ts_70 = evaluate_against_test_set(model, X70, y70, xitest, yitest)

    print('6. Store results.')
    df = pd.DataFrame(data=[], columns=['Seed', 'Dataset', 'Validation', 'Metric', 'Value'])
    df = append_cv_results(df, result_cv_35, seed, '35')
    df = append_testset_results(df, result_ts_35, seed, '35')

    df = append_cv_results(df, result_cv_70, seed, '70')
    df = append_testset_results(df, result_ts_70, seed, '70')

    # ``DataFrame.append`` was removed in pandas 2.0; concat stacks the
    # one-row frames and aligns class columns by name.
    df_class_acc = pd.concat([
        create_per_class_acc_df(result_ts_35[3], seed, '35'),
        create_per_class_acc_df(result_ts_70[3], seed, '70'),
    ])

    return (df, df_class_acc)

def run_all_seeds(rand):
    """Run the full evaluation for every train/test split and save the results.

    For each seed in ``[1, 2, 20, 40, 80, 160, 320]`` a random forest (50
    trees, ``max_depth=100``, ``random_state=rand``) is evaluated with
    ``evaluate``. The metric table is written to the working directory and
    the per-class accuracy table to ``./output_acc_perFamily/``.

    Parameters
    ----------
    rand : int
        Random state of the classifier; also part of the output file names.
    """
    import os  # local import: only needed to create the output folders

    model = RandomForestClassifier(n_estimators=50, max_depth=100, random_state=rand)
    # NOTE(review): the splitter has no random_state, so cross-validation
    # scores differ between runs even for a fixed ``rand``.
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5)
    for seed in [1, 2, 20, 40, 80, 160, 320]:
        output_file = 'result_family_seed{}_GSE109379_{}.csv'.format(seed, rand)
        output_per_family_file = './output_acc_perFamily/result_acc_perFamily_seed{}_GSE109379_{}.csv'.format(seed, rand)
        # Create missing output folders before the long evaluation, so the
        # run cannot fail at the save step because a directory is absent.
        for path in (output_file, output_per_family_file):
            folder = os.path.dirname(path)
            if folder:
                os.makedirs(folder, exist_ok=True)
        result = evaluate(seed, model, cv)
        result[0].to_csv(output_file, index=False)
        result[1].to_csv(output_per_family_file, index=False)
        print('Result saved to', output_file)

def run_one_seeds(rand):
    """Run the short evaluation for split 2 only and save the results.

    A random forest (50 trees, ``max_depth=100``, ``random_state=rand``) is
    evaluated with ``evaluate_short`` using a single repeat of 3-fold
    stratified cross-validation. The metric table is written to ``./output/``
    and the per-class accuracy table to ``./output_acc_perFamily/``.

    Parameters
    ----------
    rand : int
        Random state of the classifier; also part of the output file names.
    """
    import os  # local import: only needed to create the output folders

    model = RandomForestClassifier(n_estimators=50, max_depth=100, random_state=rand)
    # NOTE(review): the splitter has no random_state, so cross-validation
    # scores differ between runs even for a fixed ``rand``.
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=1)
    for seed in [2]:
        output_file = './output/result_family_seed{}_GSE109379_{}.csv'.format(seed, rand)
        output_per_family_file = './output_acc_perFamily/result_acc_perFamily_seed{}_GSE109379_{}.csv'.format(seed, rand)
        # Create missing output folders before evaluating, so the run cannot
        # fail at the save step because a directory is absent.
        for path in (output_file, output_per_family_file):
            folder = os.path.dirname(path)
            if folder:
                os.makedirs(folder, exist_ok=True)
        result = evaluate_short(seed, model, cv)
        result[0].to_csv(output_file, index=False)
        result[1].to_csv(output_per_family_file, index=False)
        print('Result saved to', output_file)
        
def create_pseudo_labels_file():
    """Write the per-label sample counts of every split to a CSV file.

    For each seed the table from ``count_pseudo_label_per_family`` is saved
    as ``pseudo_label_family_seed{seed}.csv`` in the working directory, with
    the row index included.
    """
    seeds = [1, 2, 20, 40, 80, 160, 320]
    for seed in seeds:
        counts_table = count_pseudo_label_per_family(seed)
        target = 'pseudo_label_family_seed{}.csv'.format(seed)
        counts_table.to_csv(target, index=True)

In [4]:
# Path to the label CSV (judging by the file name: family-level class labels
# of the GSE90496 methylation data set).
class_label = './raw_data/GSE90496_methylation_family_label.csv'

In [13]:
# Load the label file. ``header=0`` with ``names`` discards the file's own
# header row and names the data column 'y'.
labels = pd.read_csv(class_label, header=0, names=['y'])

# Show the loaded table as the cell output.
labels

Unnamed: 0,y
GSM2402854_5684819014_R03C02,"GBM, G34"
GSM2402863_5684819013_R04C01,"DMG, K27"
GSM2402953_6164621144_R03C02,MCF ATRT
GSM2403853_9305651003_R06C02,"EPN, RELA"
GSM2403854_9305651005_R03C02,"EPN, PF B"
...,...
GSM2403848_9305651037_R02C02,"CPH, PAP"
GSM2403849_9305651037_R06C01,"CPH, PAP"
GSM2403850_9305651037_R04C01,"EPN, MPE"
GSM2403851_9305651037_R01C02,PITUI


In [20]:
# Remove leading and trailing whitespace from every label string.
labels.y = [x.strip() for x in labels.y.values]

In [22]:
# Show the table again after stripping whitespace.
labels

Unnamed: 0,y
GSM2402854_5684819014_R03C02,"GBM, G34"
GSM2402863_5684819013_R04C01,"DMG, K27"
GSM2402953_6164621144_R03C02,MCF ATRT
GSM2403853_9305651003_R06C02,"EPN, RELA"
GSM2403854_9305651005_R03C02,"EPN, PF B"
...,...
GSM2403848_9305651037_R02C02,"CPH, PAP"
GSM2403849_9305651037_R06C01,"CPH, PAP"
GSM2403850_9305651037_R04C01,"EPN, MPE"
GSM2403851_9305651037_R01C02,PITUI


In [24]:
# Run the full evaluation over all seeds, with 123456 as the random forest's
# random_state.
run_all_seeds(123456)

1. Read files (seed=1)
	Reading ./data_family/seed1_35perc.csv
	Reading ./data_family/seed1_70perc.csv
	Reading ./data_family/seed1_70percHC.csv
	Reading ./data_family/seed1_35perc_gse109379.csv
	Reading ./data_family/seed1_35perc_HCgse109379.csv
	Reading ./data_family/seed1_70perc_gse109379.csv
	Reading ./data_family/seed1_70HC_gse109379.csv
	Reading ./data_family/seed1_70perc_gse109379HC.csv
	Reading ./data_family/seed1_70HC_gse109379HC.csv
	Reading ./data_family/seed1_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Store results.
Result saved to result_family_seed1_GSE109379_123456.csv
1. Read files (seed=2)
	Reading ./data_family/seed2_35perc.csv
	Reading ./data_family/seed2_70perc.csv
	Reading ./data_family/seed2_70percHC.csv
	Reading ./data_family/seed2_35perc_gse109379.csv
	Reading ./data_family/seed2_35perc_HCgse109379.csv
	Reading ./data_family/seed2_70perc_gse109379.csv
	Reading ./data_family/seed2_70HC_gse109379.csv
	Reading ./data_family/seed2_70perc_gse109379HC.csv
	Reading ./data_family/seed2_70HC_gse109379HC.csv
	Reading ./data_family/seed2_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Store results.
Result saved to result_family_seed2_GSE109379_123456.csv
1. Read files (seed=20)
	Reading ./data_family/seed20_35perc.csv
	Reading ./data_family/seed20_70perc.csv
	Reading ./data_family/seed20_70percHC.csv
	Reading ./data_family/seed20_35perc_gse109379.csv
	Reading ./data_family/seed20_35perc_HCgse109379.csv
	Reading ./data_family/seed20_70perc_gse109379.csv
	Reading ./data_family/seed20_70HC_gse109379.csv
	Reading ./data_family/seed20_70perc_gse109379HC.csv
	Reading ./data_family/seed20_70HC_gse109379HC.csv
	Reading ./data_family/seed20_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Store results.
Result saved to result_family_seed20_GSE109379_123456.csv
1. Read files (seed=40)
	Reading ./data_family/seed40_35perc.csv
	Reading ./data_family/seed40_70perc.csv
	Reading ./data_family/seed40_70percHC.csv
	Reading ./data_family/seed40_35perc_gse109379.csv
	Reading ./data_family/seed40_35perc_HCgse109379.csv
	Reading ./data_family/seed40_70perc_gse109379.csv
	Reading ./data_family/seed40_70HC_gse109379.csv
	Reading ./data_family/seed40_70perc_gse109379HC.csv
	Reading ./data_family/seed40_70HC_gse109379HC.csv
	Reading ./data_family/seed40_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Store results.
Result saved to result_family_seed40_GSE109379_123456.csv
1. Read files (seed=80)
	Reading ./data_family/seed80_35perc.csv
	Reading ./data_family/seed80_70perc.csv
	Reading ./data_family/seed80_70percHC.csv
	Reading ./data_family/seed80_35perc_gse109379.csv
	Reading ./data_family/seed80_35perc_HCgse109379.csv
	Reading ./data_family/seed80_70perc_gse109379.csv
	Reading ./data_family/seed80_70HC_gse109379.csv
	Reading ./data_family/seed80_70perc_gse109379HC.csv
	Reading ./data_family/seed80_70HC_gse109379HC.csv
	Reading ./data_family/seed80_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Store results.
Result saved to result_family_seed80_GSE109379_123456.csv
1. Read files (seed=160)
	Reading ./data_family/seed160_35perc.csv
	Reading ./data_family/seed160_70perc.csv
	Reading ./data_family/seed160_70percHC.csv
	Reading ./data_family/seed160_35perc_gse109379.csv
	Reading ./data_family/seed160_35perc_HCgse109379.csv
	Reading ./data_family/seed160_70perc_gse109379.csv
	Reading ./data_family/seed160_70HC_gse109379.csv
	Reading ./data_family/seed160_70perc_gse109379HC.csv
	Reading ./data_family/seed160_70HC_gse109379HC.csv
	Reading ./data_family/seed160_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Store results.
Result saved to result_family_seed160_GSE109379_123456.csv
1. Read files (seed=320)
	Reading ./data_family/seed320_35perc.csv
	Reading ./data_family/seed320_70perc.csv
	Reading ./data_family/seed320_70percHC.csv
	Reading ./data_family/seed320_35perc_gse109379.csv
	Reading ./data_family/seed320_35perc_HCgse109379.csv
	Reading ./data_family/seed320_70perc_gse109379.csv
	Reading ./data_family/seed320_70HC_gse109379.csv
	Reading ./data_family/seed320_70perc_gse109379HC.csv
	Reading ./data_family/seed320_70HC_gse109379HC.csv
	Reading ./data_family/seed320_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with 70% HC
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Evaluate model with 35% data with High confident GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




6. Evaluate model with 70% + GSE90496 pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




7. Evaluate model with 70% HC + GSE109379
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




8. Evaluate model with 70% + HC GSE90496 
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




9. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set
6. Store results.
Result saved to result_family_seed320_GSE109379_123456.csv




In [3]:
# Input files for the seed-2 split: the 35% training .Mat file, the 70%
# semi-supervised-label CSV, and the high-confidence (HC) label CSV.
file1, file2, file3 = (
    './data/seed2_35perc_train.Mat',
    './data/seed2_70perc_SSLabel.csv',
    './data/seed2_SSLabel_HC.csv',
)

# read_Mat yields four values; presumably train features/labels followed by
# the held-out test features/labels -- confirm against read_Mat.
(X35, y35, xitest, yitest) = read_Mat(file1)

# process_csv splits each CSV into (features, `y` column).
(X70, y70) = process_csv(file2)
(XHC, yHC) = process_csv(file3)

In [7]:
# Baseline on the 35% split (X35, y35): first the cross-validation helper with
# the shared `model` and `cv` objects, then the held-out test-set helper using
# (xitest, yitest). Both helpers are defined elsewhere in the notebook;
# presumably apply_CV returns per-metric CV scores -- confirm there.
result35 = apply_CV(model, cv, X35, y35)
result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)



In [8]:
# Same two-step evaluation on the data loaded from the 70% SSLabel CSV
# (X70, y70), scored against the same held-out set (xitest, yitest).
# NOTE(review): the same `model` object is passed to both helpers; whether it
# is refit or cloned inside them is not visible here.
result70 = apply_CV(model, cv, X70, y70)
result_ts_70 = evaluate_against_test_set(model, X70, y70, xitest, yitest)



In [9]:
# Same two-step evaluation on the data loaded from the SSLabel_HC CSV
# (XHC, yHC) -- presumably the high-confidence pseudo-label subset -- scored
# against the same held-out set (xitest, yitest).
resultHC = apply_CV(model, cv, XHC, yHC)
result_ts_HC = evaluate_against_test_set(model, XHC, yHC, xitest, yitest)



In [10]:
# Start from an empty long-format results table; each later row records one
# metric value for a given seed, dataset and validation scheme.
result_df = pandas.DataFrame(
    columns=[
        'Seed',
        'Dataset',
        'Validation',
        'Metric',
        'Value',
    ],
    data=[],
)

In [11]:
# Fold the seed-2 results into the table one dataset at a time, in the order
# 35% -> 70% -> HC: cross-validation rows first, then test-set rows.
for _label, _cv_scores, _test_scores in (
    ('35%', result35, result_ts_35),
    ('70%', result70, result_ts_70),
    ('HC', resultHC, result_ts_HC),
):
    result_df = append_cv_results(result_df, _cv_scores, 2, _label)
    result_df = append_testset_results(result_df, _test_scores, 2, _label)

In [15]:
# Bare expression as the last statement of the cell: the notebook renders the
# accumulated results table (Seed / Dataset / Validation / Metric / Value).
result_df

Unnamed: 0,Seed,Dataset,Validation,Metric,Value
0,2,35%,cross_val,balanced_acc,0.71
1,2,35%,cross_val,f1,0.81
2,2,35%,cross_val,roc_auc_ovr,0.99
3,2,35%,vs_testset,prec,0.86
4,2,35%,vs_testset,rec,0.88
5,2,35%,vs_testset,acc,0.88
6,2,35%,vs_testset,balanced_acc,0.903
7,2,70%,cross_val,balanced_acc,0.83
8,2,70%,cross_val,f1,0.9
9,2,70%,cross_val,roc_auc_ovr,1.0
