# Evaluate Pseudo Labels

Questions:
1. Do labels obtained from semisupervised classifier improve a supervised classifier performance?
2. Do high confident labels from a SSL model improve a supervised classifier performance?

## Design

Original data set has 2801 samples x 5072 features. This data were split into 70% train and 30% test. This process was repeated 7 times to create 7 different train-test splits.

For each split:
1. Create a random forest model
2. Evaluate the model on 35% data with itest
3. Evaluate the model on 35% data  + SSL GSE109379 with itest
4. Evaluate the model on 35% data + 35% pseudo-label data + GSE109379 pseudo-label data with itest
5. Evaluate the model on 35% data + 35% pseudo-label data + HC GSE1093790 pseudo-label data with itest
6. Evaluate the model on 35% data + high confident pseudo-label +  GSE109379 pseudo-label data with itest
7. Evaluate the model on 35% data + high confident pseudo-label +  HC GSE109379 pseudo-label data with itest


Box plots for each model (1 box plot show the average accuracies of each model over the 7 train-test splits)

In [154]:
import scipy.io
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score,f1_score,precision_score,recall_score 
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import collections
  # it returns a dictionary data structure whose 
# keys are array elements and values are their 
# corresponding frequencies {1: 4, 2: 4, 3: 2, 
# 5: 2, 4: 1}
def CountFrequency(arr):
    return collections.Counter(arr)  

def combine(y, str):
    freq = CountFrequency(y)
    d = pd.DataFrame(freq.items(), columns=['Sample', 'pseudo_class_label'] )    
    schema = [str] * len(d)
    d['Dataset'] = schema
    return d

def get_freq_series(y):
    freq = CountFrequency(y)
    s = pd.Series(freq.values(), index = freq.keys())
    return s
    
def read_Mat(filename):
    d = scipy.io.loadmat(filename)
    xtrain = d['xtrain']
    ytrain = [x[0][0].strip() for x in d['ytrain']]
    xitest = d['xitest']
    yitest = [x[0][0].strip() for x in d['yitest']]
    return(xtrain, ytrain, xitest, yitest)

def get_beta_in_select_file(beta, filename):
    X_temp, y = process_csv(filename)
    X = beta[ X_temp.index ]
    return X.T, y
 
def process_csv(filename):
    df = pd.read_csv(filename, index_col=0)
    y = df.y
    X = df.drop(columns=['y'])
    return (X, y)

def append_result(p, df):
    s = pd.Series(p, index=df.columns)
    return df.append(s, ignore_index=True)

def select_probes(data, sd_cutoff):
    probes = data.T
    probes['STD'] = probes.std(axis=1)
    above_threshold = probes[probes["STD"] > sd_cutoff]
    print("shape in select_probes", above_threshold.T.shape)
    return above_threshold.drop(columns='STD').T

def match_probes(ref_data, data_to_be_matched):
    df = data_to_be_matched.loc[:, ref_data.columns.values]
    return df

def apply_CV(mod, cv, X, y):
    return cross_validate(mod, X, y, cv=cv, scoring=['balanced_accuracy', 'accuracy', 'recall_weighted'])

def accuracy_per_class(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    classes = np.unique(y_true)
    per_class_accuracies = cm.diagonal()/cm.sum(axis=1)
    per_class_acc_wKeys = {}
    for idx, cls in enumerate(classes):
        per_class_acc_wKeys[cls] = per_class_accuracies[idx]
    return per_class_acc_wKeys

def evaluate_against_test_set(mod, x_train, y_train, x_test, y_test, cls_weight):
    mod.fit(x_train, y_train)
    y_pred = mod.predict(x_test)
    test_sample_weights = class_weight.compute_sample_weight(cls_weight, y_test)
    acc=accuracy_score(y_test,y_pred).round(3)
    bal_acc=balanced_accuracy_score(y_test,y_pred, sample_weight = test_sample_weights).round(3)
    rec=recall_score(y_test,y_pred, average='weighted', zero_division=0).round(3)
    prec=precision_score(y_test,y_pred, average='weighted', zero_division=0).round(3)
    f1=f1_score(y_test,y_pred, average='weighted').round(3)
    per_class_acc = accuracy_per_class(y_test, y_pred)
    return (acc, bal_acc, rec, prec, f1, per_class_acc)

def evaluate_against_cv_test_set(mod, x_test, y_test, cls_weight):
    y_pred = mod.predict(x_test)
    test_sample_weights = class_weight.compute_sample_weight(cls_weight, y_test)
    acc=accuracy_score(y_test,y_pred).round(3)
    bal_acc=balanced_accuracy_score(y_test,y_pred, sample_weight = test_sample_weights).round(3)
    rec=recall_score(y_test,y_pred, average='weighted', zero_division=0).round(3)
    prec=precision_score(y_test,y_pred, average='weighted', zero_division=0).round(3)
    f1=f1_score(y_test,y_pred, average='weighted').round(3)
    return (acc, bal_acc, rec, prec, f1)

def cross_validate_withSD(mod, data_x, data_y, cv, sd_cutoff, cls_weight):
    '''
    features are selectiving for the training data set after each CV split
    '''
    #cvscores = []
    cv_balanced_scores = []
    fold = 1
    #model = RandomForestClassifier(n_estimators = 50, max_depth=100)
    for train_index,test_index in cv.split(data_x, data_y):
        #print("X_cv shape = ", data_x.shape, "; y_cv shape = ", data_y.shape)
        x_train = select_probes(data_x.iloc[train_index, :], sd_cutoff)
        x_test = match_probes(x_train, data_x.iloc[test_index, :])
        
        y_train,y_test = data_y[train_index], data_y[test_index]
        
        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold} ...')
        mod.fit(x_train, y_train)
        scores = evaluate_against_cv_test_set(mod, x_test, y_test, cls_weight)
    
        cv_balanced_scores.append(scores)
        
        print("Scores = ", scores)
        
        fold = fold+1
    return model, pd.DataFrame(cv_balanced_scores, columns= ['acc', 'bal_acc', 'weighted_recall','weighted_precision', 'weighted_F1'])


def append_cv_results(df, r, seed_name, dataset_name):
    df = append_result([seed_name, dataset_name, 'cross_val', 'accuracy', r['acc'].mean()], df)
    df = append_result([seed_name, dataset_name, 'cross_val', 'balanced_acc', r['bal_acc'].mean()], df)
    df = append_result([seed_name, dataset_name, 'cross_val', 'recall_weighted', r['weighted_recall'].mean()], df)
    df = append_result([seed_name, dataset_name, 'cross_val', 'precision_weighted', r['weighted_precision'].mean()], df)
    return df

def accuracy_per_class(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    classes = np.unique(y_true)
    per_class_accuracies = cm.diagonal()/cm.sum(axis=1)
    per_class_acc_wKeys = {}
    for idx, cls in enumerate(classes):
        per_class_acc_wKeys[cls] = per_class_accuracies[idx]
    return per_class_acc_wKeys

def append_testset_results(df, r, seed_name, dataset_name):
    df = append_result([seed_name, dataset_name, 'vs_testset', 'accuracy', r[0]], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'balanced_acc', r[1]], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'recall_weighted', r[2]], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'precision_weighted', r[3]], df)
    return df

def create_per_class_acc_df(class_acc, seed, dset):
    d = pd.DataFrame(columns = ['Seed', 'Dataset'])
    d = d.append(class_acc, ignore_index=True)
    d['Seed'] = seed
    d['Dataset'] = dset
    return d



   


In [155]:

def plot_cm(y_true, y_pred):
    # Creating  a confusion matrix,which compares the y_test and y_pred
    cm = confusion_matrix(y_true, y_pred)
    # Creating a dataframe for a array-formatted Confusion matrix.
    cm_df = pd.DataFrame(cm,
                     index = np.unique(y_true), 
                     columns = np.unique(y_true))
    #Plotting the confusion matrix
    plt.figure(figsize=(19,25))
    sns.heatmap(cm_df, annot=True, cmap="YlGnBu", cbar_kws={"orientation": "horizontal"})
    plt.title('Confusion Matrix')
    plt.ylabel('Actual Labels')
    plt.xlabel('Predicted Labels')
    plt.show()

def evaluate(seed, model, cv, mcf=False):
    print('1. Read files (seed={})'.format(seed))
       
    file35 = './data_class/seed{}_35perc.csv'.format(seed)
    file70='./data_class/seed{}_70perc.csv'.format(seed)
    file70HC= './data_class/seed{}_70percHC.csv'.format(seed)
    
    file35_GSE='./data_class/seed{}_35perc_gse109379.csv'.format(seed)
    file35_GSEHC ='./data_class/seed{}_35perc_HCgse109379.csv'.format(seed)
    file70_GSE = './data_class/seed{}_70perc_gse109379.csv'.format(seed)
    
    file70HC_GSE = './data_class/seed{}_70HC_gse109379.csv'.format(seed)
    file70_GSEHC = './data_class/seed{}_70perc_gse109379HC.csv'.format(seed)
    file70HC_GSEHC = './data_class/seed{}_70HC_gse109379HC.csv'.format(seed)
    
    
    fileHoldOut = './data_class/seed{}_holdOutTest.csv'.format(seed)
    #########
    print('\tReading', file35)
    X35, y35 = process_csv(file35)
    print('\tReading', file70)
    X70, y70 = process_csv(file70)
    print('\tReading', file70HC)
    X70HC, y70HC = process_csv(file70HC)
    
    print('\tReading', file35_GSE)
    X35gse, y35gse = process_csv(file35_GSE)
    print('\tReading', file35_GSEHC)
    X35gseHC, y35gseHC = process_csv(file35_GSEHC)
    print('\tReading', file70_GSE)
    X70gse, y70gse = process_csv(file70_GSE)
    
    print('\tReading', file70HC_GSE)
    X70HCgse, y70HCgse = process_csv(file70HC_GSE)
    print('\tReading', file70_GSEHC)
    X70gseHC, y70gseHC = process_csv(file70_GSEHC)
    print('\tReading', file70HC_GSEHC)
    XHC, yHC = process_csv(file70HC_GSEHC)
    
    print('\tReading', fileHoldOut)
    xitest, yitest = process_csv(fileHoldOut)
    ###################
    
    
    print('2. Evaluate model with 35% data')
    print('\tcross validate', model, 'with', cv)
    result_cv_35 = apply_CV(model, cv, X35, y35)
    print('\tvalidate against test set')
    result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)
          
    print('3. Evaluate model with 70% data')
    print('\tcross validate', model, 'with', cv)
    result_cv_70 = apply_CV(model, cv, X70, y70)
    print('\tvalidate against test set')
    result_ts_70 = evaluate_against_test_set(model, X70, y70, xitest, yitest)

    print('4. Evaluate model with 70% HC')    
    print('\tcross validate', model, 'with', cv)
    result_cv_70HC = apply_CV(model, cv, X70HC, y70HC)
    print('\tvalidate against test set')
    result_ts_70HC = evaluate_against_test_set(model, X70HC, y70HC, xitest, yitest)
    
    #########
    
    print('5. Evaluate model with 35% data with GSE109379')
    print('\tcross validate', model, 'with', cv)
    result_cv_35gse = apply_CV(model, cv, X35gse, y35gse)
    print('\tvalidate against test set')
    result_ts_35gse = evaluate_against_test_set(model, X35gse, y35gse, xitest, yitest)
    
    print('5. Evaluate model with 35% data with High confident GSE109379')
    print('\tcross validate', model, 'with', cv)
    result_cv_35gseHC = apply_CV(model, cv, X35gseHC, y35gseHC)
    print('\tvalidate against test set')
    result_ts_35gseHC = evaluate_against_test_set(model, X35gseHC, y35gseHC, xitest, yitest)
    
    print('6. Evaluate model with 70% + GSE90496 pseudo-labeled data')    
    print('\tcross validate', model, 'with', cv)
    result_cv_70gse = apply_CV(model, cv, X70gse, y70gse)
    print('\tvalidate against test set')
    result_ts_70gse = evaluate_against_test_set(model, X70gse, y70gse, xitest, yitest)
    
    ##########    
             
    print('7. Evaluate model with 70% HC + GSE109379')
    print('\tcross validate', model, 'with', cv)
    result_cv_70HCgse = apply_CV(model, cv, X70HCgse, y70HCgse)
    print('\tvalidate against test set')
    result_ts_70HCgse = evaluate_against_test_set(model, X70HCgse, y70HCgse, xitest, yitest)

    print('8. Evaluate model with 70% + HC GSE90496 ')    
    print('\tcross validate', model, 'with', cv)
    result_cv_70gseHC = apply_CV(model, cv, X70gseHC, y70gseHC)
    print('\tvalidate against test set')
    result_ts_70gseHC = evaluate_against_test_set(model, X70gseHC, y70gseHC, xitest, yitest)
      
    print('9. Evaluate model with HC pseudo-labeled data')    
    print('\tcross validate', model, 'with', cv)
    result_cv_HC = apply_CV(model, cv, XHC, yHC)
    print('\tvalidate against test set')
    result_ts_HC = evaluate_against_test_set(model, XHC, yHC, xitest, yitest)

    
    print('6. Store results.')
    df = pd.DataFrame(data=[], columns=['Seed','Dataset','Validation','Metric','Value'])
    df = append_cv_results(df, result_cv_35, seed, '35')
    df = append_testset_results(df, result_ts_35, seed, '35')
    
    df = append_cv_results(df, result_cv_70, seed, '70')
    df = append_testset_results(df, result_ts_70, seed, '70')
    
    df = append_cv_results(df, result_cv_70HC, seed, '70HC')
    df = append_testset_results(df, result_ts_70HC, seed, '70HC')
    
    df = append_cv_results(df, result_cv_35gse, seed, '35_GSE')
    df = append_testset_results(df, result_ts_35gse, seed, '35_GSE')
    
    df = append_cv_results(df, result_cv_35gseHC, seed, '35_gseHC')
    df = append_testset_results(df, result_ts_35gseHC, seed, '35_gseHC')
    
    df = append_cv_results(df, result_cv_70gse, seed, '70_GSE')
    df = append_testset_results(df, result_ts_70gse, seed, '70_GSE')
    
    df = append_cv_results(df, result_cv_70HCgse, seed, '70HC_GSE')
    df = append_testset_results(df, result_ts_70HCgse, seed, '70HC_GSE')
    df = append_cv_results(df, result_cv_70gseHC, seed, '70_gseHC')
    df = append_testset_results(df, result_ts_70gseHC, seed, '70_gseHC')
    df = append_cv_results(df, result_cv_HC, seed, 'HC')
    df = append_testset_results(df, result_ts_HC, seed, 'HC')
    
    #df_class_acc = pd.DataFrame(columns = ['Seed', 'Dataset'])
    
    df_class = create_per_class_acc_df(result_ts_35[3], seed, '35')
    
    df_class = df_class.append(create_per_class_acc_df(result_ts_70[3], seed, '70'))
    df_class = df_class.append(create_per_class_acc_df(result_ts_70HC[3], seed, '70HC'))
    
    df_class = df_class.append(create_per_class_acc_df(result_ts_35gse[3], seed, '35GSE'))
    df_class = df_class.append(create_per_class_acc_df(result_ts_35gseHC[3], seed, '35_gseHC'))
    df_class = df_class.append(create_per_class_acc_df(result_ts_70gse[3], seed, '70_GSE'))
    
    df_class = df_class.append(create_per_class_acc_df(result_ts_70HCgse[3], seed, '70HC_GSE'))
    df_class = df_class.append(create_per_class_acc_df(result_ts_70gseHC[3], seed, '70_gseHC'))
    df_class = df_class.append(create_per_class_acc_df(result_ts_HC[3], seed, 'HC'))
    
    return (df, df_class)

def evaluate_short(seed, probe_data, model, cv, mcf=False):
    print('1. Read files (seed={})'.format(seed))
       
    file35 = './data_class/seed{}_35perc.csv'.format(seed)
    file70='./data_class/seed{}_70perc.csv'.format(seed)
    file70HC= './data_class/seed{}_70percHC.csv'.format(seed)
    
    fileHoldOut = './data_class/seed{}_holdOutTest.csv'.format(seed)
    #########
    print('\tReading', file35)
    X35, y35 = get_beta_in_select_file(probe_data, file35)
    print('\tReading', file70)
    X70, y70 = get_beta_in_select_file(probe_data, file70)
    print('\tReading', file70HC)
    X70HC, y70HC = get_beta_in_select_file(probe_data, file70HC)
    
        
    print('\tReading', fileHoldOut)
    xitest, yitest = get_beta_in_select_file(probe_data, fileHoldOut)
    ###################
    
    
    print('2. Evaluate model with 35% data')
    print('\tcross validate', model, 'with', cv)
    mod35, result_cv_35 = cross_validate_withSD(model, X35, y35, cv, sd_cutoff = 0.3, cls_weight = cls_weight_dict)
    print('\tvalidate against test set')
    result_ts_35 = evaluate_against_test_set(mod35, X35, y35, xitest, yitest, cls_weight=cls_weight_dict)
          
    print('3. Evaluate model with 70% data')
    print('\tcross validate', model, 'with', cv)
    mod70, result_cv_70 = cross_validate_withSD(model, X70, y70, cv, sd_cutoff = 0.3, cls_weight = cls_weight_dict)
    print('\tvalidate against test set')
    result_ts_70 = evaluate_against_test_set(mod70, X70, y70, xitest, yitest, cls_weight=cls_weight_dict)

    print('4. Evaluate model with 70% HC')    
    print('\tcross validate', model, 'with', cv)
    mod70HC, result_cv_70HC = cross_validate_withSD(model, X70HC, y70HC, cv, sd_cutoff = 0.3, cls_weight = cls_weight_dict)
    print('\tvalidate against test set')
    result_ts_70HC = evaluate_against_test_set(mod70HC, X70HC, y70HC, xitest, yitest, cls_weight=cls_weight_dict)
    
    #########
    
        
    print('6. Store results.')
    df = pd.DataFrame(data=[], columns=['Seed','Dataset','Validation','Metric','Value'])
    
    df = append_cv_results(df, result_cv_35, seed, '35')
    df = append_testset_results(df, result_ts_35, seed, '35')
    
    df = append_cv_results(df, result_cv_70, seed, '70')
    df = append_testset_results(df, result_ts_70, seed, '70')
    
    df = append_cv_results(df, result_cv_70HC, seed, '70HC')
    df = append_testset_results(df, result_ts_70HC, seed, '70HC')
    
    #df_class_acc = pd.DataFrame(columns = ['Seed', 'Dataset'])
    
    df_class_acc = create_per_class_acc_df(result_ts_35[5], seed, '35')
    df_class_acc = df_class_acc.append(create_per_class_acc_df(result_ts_70[5], seed, '70'))
    df_class_acc = df_class_acc.append(create_per_class_acc_df(result_ts_70HC[5], seed, '70HC'))
       
    return (df, df_class_acc)

In [151]:
####Combine all the pseudo_label_inputs into 1 file 
def count_pseudo_label_per_class(seed):
    print('1. Read files (seed={})'.format(seed))
       
    file35 = './data/seed{}_35perc.csv'.format(seed)
    file70='./data/seed{}_70perc.csv'.format(seed)
    file70HC= './data/seed{}_70percHC.csv'.format(seed)
    
    file35_GSE='./data/seed{}_35perc_gse109379.csv'.format(seed)
    file35_GSEHC ='./data/seed{}_35perc_HCgse109379.csv'.format(seed)
    file70_GSE = './data/seed{}_70perc_gse109379.csv'.format(seed)
    
    file70HC_GSE = './data/seed{}_70HC_gse109379.csv'.format(seed)
    file70_GSEHC = './data/seed{}_70perc_gse109379HC.csv'.format(seed)
    file70HC_GSEHC = './data/seed{}_70HC_gse109379HC.csv'.format(seed)
    
    fileHoldOut = './data/seed{}_holdOutTest.csv'.format(seed)
    
    #########
    
    print('\tReading', file35)
    X35, y35 = process_csv(file35)
    f35 = get_freq_series(y35)
    print('\tReading', file70)
    X70, y70 = process_csv(file70)
    f70 = get_freq_series(y70)
    print('\tReading', file70HC)
    X70HC, y70HC = process_csv(file70HC)
    f70HC = get_freq_series(y70HC)
    
    print('\tReading', file35_GSE)
    X35gse, y35gse = process_csv(file35_GSE)
    f35gse = get_freq_series(y35gse)
    print('\tReading', file35_GSEHC)
    X35gseHC, y35gseHC = process_csv(file35_GSEHC)
    f35gseHC = get_freq_series(y35gseHC)
    print('\tReading', file70_GSE)
    X70gse, y70gse = process_csv(file70_GSE)
    f70gse = get_freq_series(y70gse)
    
    print('\tReading', file70HC_GSE)
    X70HCgse, y70HCgse = process_csv(file70HC_GSE)
    f70HCgse = get_freq_series(y70HCgse)
    print('\tReading', file70_GSEHC)
    X70gseHC, y70gseHC = process_csv(file70_GSEHC)
    f70gseHC = get_freq_series(y70gseHC)
    print('\tReading', file70HC_GSEHC)
    XHC, yHC = process_csv(file70HC_GSEHC)
    fHC = get_freq_series(yHC)
    
    print('\tReading', fileHoldOut)
    xitest, yitest = process_csv(fileHoldOut)
    fitest = get_freq_series(yitest)
    
 
    print('2. Store pseudo_label_counts')
    df = pandas.DataFrame(data=[], columns = f35.keys())
    df = df.append(f35, ignore_index=True)
    df = df.append(f70, ignore_index=True)  
    df = df.append(f70HC, ignore_index=True)
    df = df.append(f35gse, ignore_index=True)
    df = df.append(f35gseHC, ignore_index=True)
    df = df.append(f70gse, ignore_index=True)
    df = df.append(f70HCgse, ignore_index=True)
    df = df.append(f70gseHC, ignore_index=True)
    df = df.append(fHC, ignore_index=True)
    df = df.append(fitest, ignore_index=True)
    
    df.insert(0, 'Dataset', ['35', '70', '70HC', '35gse', '35gseHC', '70gse', '70HCgse', '70gseHC', 'HC', 'holdOut'])
    df.insert(1, 'Seed', [seed]*10)
    
    return df

In [152]:
def run_all_seeds(rand):
    model = RandomForestClassifier(n_estimators = 50, max_depth=100)
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5, random_state=rand)
    for seed in [2, 20, 40, 80, 160, 320]:
        result = evaluate(seed, beta, model, cv)
        output_file = './output/result_subClass_seed{}_GSE109379_{}.csv'.format(seed, rand)
        output_per_class_file = './output_acc_perClass/result_acc_perClass_seed{}_GSE109379_{}.csv'.format(seed, rand)
        result[0].to_csv(output_file, index=False)
        result[1].to_csv(output_per_class_file, index=False)
        print('Result saved to', output_file)
        
def run_one_seeds(rand):
    model = RandomForestClassifier(n_estimators = 50, max_depth=100, random_state=rand)
    cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1)
    for seed in [ 2]:
        result = evaluate_short(seed, beta, model, cv)
        output_file = './output/result_RF_subClass_seed{}_GSE109379_{}.csv'.format(seed, rand)
        output_per_class_file = './output_acc_perClass/result_RF_acc_perClass_seed{}_GSE109379_{}.csv'.format(seed, rand)
        result[0].to_csv(output_file, index=False)
        result[1].to_csv(output_per_class_file, index=False)
        print('Result saved to', output_file)
        
def create_pseudo_labels_file():
    for seed in [1, 2, 20, 40, 80, 160, 320]:
        combined_labels = count_pseudo_label_per_class(seed)
        output_file = 'pseudo_label_subClass_seed{}.csv'.format(seed)
        combined_labels.to_csv(output_file, index=True)

In [2]:
beta_file = './raw_data/top32K_GSE90496.txt'
beta = pd.read_csv(beta_file, index_col=0)

In [3]:
seed=1
file35 = './data_class/seed{}_35perc.csv'.format(seed)
file70='./data_class/seed{}_70perc.csv'.format(seed)
X35, y35 = get_beta_in_select_file(beta, file35)
X70, y70 = get_beta_in_select_file(beta, file35)
fileHoldOut = './data_class/seed{}_holdOutTest.csv'.format(seed)
xitest, yitest = get_beta_in_select_file(beta, fileHoldOut)

          


In [94]:
class_label = './raw_data/GSE90496_methylation_class_label.csv'

labels = pd.read_csv(class_label, header=0, names=['y'])
print(labels)
cls_weights = class_weight.compute_class_weight("balanced", classes=np.unique(labels), y=labels['y'])

cls_weight_dict = {}
i = 0
for cls in np.unique(labels):
    cls_weight_dict[cls] = cls_weights[i].round(3)
    i = i+1
print(cls_weight_dict)

                                      y
GSM2402854_5684819014_R03C02   GBM, G34
GSM2402863_5684819013_R04C01   DMG, K27
GSM2402953_6164621144_R03C02  ATRT, TYR
GSM2403853_9305651003_R06C02  EPN, RELA
GSM2403854_9305651005_R03C02  EPN, PF B
...                                 ...
GSM2403848_9305651037_R02C02   CPH, PAP
GSM2403849_9305651037_R06C01   CPH, PAP
GSM2403850_9305651037_R04C01   EPN, MPE
GSM2403851_9305651037_R01C02      PITUI
GSM2403852_9305651037_R06C02       SCHW

[2801 rows x 1 columns]
{'A IDH': 0.395, 'A IDH, HG': 0.669, 'ANA PA': 1.466, 'ATRT, MYC': 1.061, 'ATRT, SHH': 0.669, 'ATRT, TYR': 0.832, 'CHGL': 2.565, 'CHORDM': 3.42, 'CN': 1.466, 'CNS NB, FOXR2': 0.789, 'CONTR, ADENOPIT': 3.42, 'CONTR, CEBM': 3.848, 'CONTR, HEMI': 2.368, 'CONTR, HYPTHAL': 3.42, 'CONTR, INFLAM': 1.283, 'CONTR, PINEAL': 2.565, 'CONTR, PONS': 2.565, 'CONTR, REACT': 1.338, 'CONTR, WM': 3.42, 'CPH, ADM': 1.231, 'CPH, PAP': 1.539, 'DLGNT': 3.848, 'DMG, K27': 0.395, 'EFT, CIC': 2.368, 'ENB, A': 1.338,

In [156]:
rand = 123456
model = RandomForestClassifier(n_estimators = 100, max_depth=100)
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5, random_state=rand)

#y_pred = model.predict(xitest)
#result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)

run_one_seeds(rand)

1. Read files (seed=2)
	Reading ./data_class/seed2_35perc.csv
	Reading ./data_class/seed2_70perc.csv
	Reading ./data_class/seed2_70percHC.csv
	Reading ./data_class/seed2_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=1, n_splits=2, random_state=None)
shape in select_probes (488, 4081)
------------------------------------------------------------------------
Training for fold 1 ...
Scores =  (0.795, 0.648, 0.795, 0.764, 0.765)
shape in select_probes (488, 4176)
------------------------------------------------------------------------
Training for fold 2 ...
Scores =  (0.778, 0.596, 0.778, 0.734, 0.736)
	validate against test set
3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=1, n_splits=2, random_state=None)
shape in select_probes (981, 4162)
--

In [118]:
mod, cv_result = cross_validate_withSD(model, X35, y35, cv, sd_cutoff = 0.3, cls_weight = cls_weight_dict)

shape in select_probes (650, 3873)
------------------------------------------------------------------------
Training for fold 1 ...
Scores =  (0.874, 0.733, 0.874, 0.839, 0.843)
shape in select_probes (650, 3945)
------------------------------------------------------------------------
Training for fold 2 ...
Scores =  (0.862, 0.751, 0.862, 0.836, 0.834)
shape in select_probes (651, 4230)
------------------------------------------------------------------------
Training for fold 3 ...
Scores =  (0.836, 0.738, 0.836, 0.805, 0.811)
shape in select_probes (650, 4087)
------------------------------------------------------------------------
Training for fold 4 ...
Scores =  (0.883, 0.82, 0.883, 0.871, 0.866)
shape in select_probes (650, 4038)
------------------------------------------------------------------------
Training for fold 5 ...
Scores =  (0.834, 0.701, 0.834, 0.791, 0.797)
shape in select_probes (651, 3955)
------------------------------------------------------------------------
Tra

In [112]:
model

RandomForestClassifier(max_depth=100, n_estimators=50, random_state=1235)

In [7]:
result_cv_35 = apply_CV(model, cv, X35, y35)
print('\tvalidate against test set')
model.fit(x_train, y35)
y_pred = model.predict(xitest)
result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)



	validate against test set




In [19]:
cross_validate()

{'fit_time': array([8.97381186, 8.54711795, 8.71060991]),
 'score_time': array([0.14942813, 0.14501619, 0.14842224]),
 'test_balanced_accuracy': array([0.7355868 , 0.68504777, 0.65920009]),
 'test_accuracy': array([0.85846154, 0.81538462, 0.80246914]),
 'test_recall_weighted': array([0.85846154, 0.81538462, 0.80246914])}

In [18]:
set(np.unique(y35)).difference(np.unique(y_pred))

{'DLGNT',
 'GBM, MYCN',
 'GBM, RTK III',
 'LGG, DIG DIA',
 'LGG, RGNT',
 'PIN T, PB A',
 'PITAD, PRL',
 'PITAD, STH DNS A',
 'PTPR, A',
 'SCHW, MEL'}

In [30]:
cv_result, test_result = evaluate_short(seed, beta, model, cv, mcf=False)

1. Read files (seed=1)
	Reading ./data_class/seed1_35perc.csv
	Reading ./data_class/seed1_70perc.csv
	Reading ./data_class/seed1_70percHC.csv
	Reading ./data_class/seed1_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=1235) with RepeatedStratifiedKFold(n_repeats=1, n_splits=3, random_state=None)
	validate against test set


ValueError: Number of features of the model must match the input. Model n_features is 32000 and input n_features is 5072 