# Evaluate Pseudo Labels

Questions:
1. tba
2. tba

## Design

Original data set has 2801 samples x 5072 features. This data were split into 70% train and 30% test. This process was repeated 7 times to create 7 different train-test splits.

For each split:
1. Create a random model
2. Evaluate the model on 35% data  with itest
3. Evaluate the model on 35% data + 35% pseudo-label data with itest
4. Evaluate the model on 35% data + high confident pseudo-label data with itest

Box plots for each model (1 box plot show the average accuracies of each model over the 7 train-test splits)

In [4]:
import scipy.io
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score
import pandas
import numpy as np

def read_Mat(filename):
    d = scipy.io.loadmat(filename)
    xtrain = d['xtrain']
    ytrain = [x[0][0].strip() for x in d['ytrain']]
    xitest = d['xitest']
    yitest = [x[0][0].strip() for x in d['yitest']]
    return(xtrain, ytrain, xitest, yitest)

def process_csv(filename):
    df = pandas.read_csv(filename, index_col=0)
    y = df.y
    X = df.drop(columns=['y'])
    return (X, y)

def append_result(p, df):
    s = pandas.Series(p, index=df.columns)
    return df.append(s, ignore_index=True)

def apply_CV(mod, cv, X, y):
    return cross_validate(mod, X, y, cv=cv, scoring=['balanced_accuracy', 'accuracy', 'recall_weighted'])

def append_cv_results(df, r, seed_name, dataset_name):
    df = append_result([seed_name, dataset_name, 'cross_val', 'balanced_acc', r['test_balanced_accuracy'].mean().round(2)], df)
    df = append_result([seed_name, dataset_name, 'cross_val', 'accuracy', r['test_accuracy'].mean().round(2)], df)
    df = append_result([seed_name, dataset_name, 'cross_val', 'recall_weighted', r['test_recall_weighted'].mean().round(2)], df)
    return df

def evaluate_against_test_set(mod, x_train, y_train, x_test, y_test):
    mod.fit(x_train, y_train)
    y_pred = mod.predict(x_test)
    rec = recall_score(y_test, y_pred,  average='weighted', zero_division=0).round(3)
    acc = accuracy_score(y_pred, y_test).round(3)
    b_acc = balanced_accuracy_score(y_pred, y_test).round(3)
    return (rec, acc, b_acc)

def append_testset_results(df, r, seed_name, dataset_name):
    df = append_result([seed_name, dataset_name, 'vs_testset', 'accuracy', r[0]], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'balanced_acc', r[1]], df)
    df = append_result([seed_name, dataset_name, 'vs_testset', 'recall_weighted', r[2]], df)
    return df

def evaluate(seed, model, cv):
    print('1. Read files (seed={})'.format(seed))
    file1 = './data/seed{}_35perc_train.csv'.format(seed)
    file2='./data/seed{}_70perc_SSLabel.csv'.format(seed)
    file3='./data/seed{}_SSLabel_HC.csv'.format(seed)
    file4 = './data/seed{}_holdOutTest.csv'.format(seed)
    print('\tReading', file1)
    X35, y35 = process_csv(file1)
    print('\tReading', file2)
    X70, y70 = process_csv(file2)
    print('\tReading', file3)
    XHC, yHC = process_csv(file3)
    print('\tReading', file4)
    xitest, yitest = process_csv(file4)
    
    print('2. Evaluate model with 35% data')
    print('\tcross validate', model, 'with', cv)
    result_cv_35 = apply_CV(model, cv, X35, y35)
    print('\tvalidate against test set')
    result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)
          
    print('3. Evaluate model with 70% data')
    print('\tcross validate', model, 'with', cv)
    result_cv_70 = apply_CV(model, cv, X70, y70)
    print('\tvalidate against test set')
    result_ts_70 = evaluate_against_test_set(model, X70, y70, xitest, yitest)

    print('4. Evaluate model with HC pseudo-labeled data')    
    print('\tcross validate', model, 'with', cv)
    result_cv_HC = apply_CV(model, cv, XHC, yHC)
    print('\tvalidate against test set')
    result_ts_HC = evaluate_against_test_set(model, XHC, yHC, xitest, yitest)

    print('5. Store results.')
    df = pandas.DataFrame(data=[], columns=['Seed','Dataset','Validation','Metric','Value'])
    df = append_cv_results(df, result_cv_35, seed, '35%')
    df = append_testset_results(df, result_ts_35, seed, '35%')
    df = append_cv_results(df, result_cv_70, seed, '70%')
    df = append_testset_results(df, result_ts_70, seed, '70%')
    df = append_cv_results(df, result_cv_HC, seed, 'HC')
    df = append_testset_results(df, result_ts_HC, seed, 'HC')
    
    return df
def run_all_seeds(rand):
    model = RandomForestClassifier(n_estimators = 50, max_depth=100, random_state=rand)
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5)
    for seed in [1, 2, 20, 40, 80, 160, 320]:
        result = evaluate(seed, model, cv)
        output_file = 'result_seed{}_{}.csv'.format(seed, rand)
        result.to_csv(output_file, index=False)
        print('Result saved to', output_file)

In [5]:
run_all_seeds(123456)

1. Read files (seed=1)
	Reading ./data/seed1_35perc_train.csv
	Reading ./data/seed1_70perc_SSLabel.csv
	Reading ./data/seed1_SSLabel_HC.csv
	Reading ./data/seed1_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed1_123456.csv
1. Read files (seed=2)
	Reading ./data/seed2_35perc_train.csv
	Reading ./data/seed2_70perc_SSLabel.csv
	Reading ./data/seed2_SSLabel_HC.csv
	Reading ./data/seed2_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed2_123456.csv
1. Read files (seed=20)
	Reading ./data/seed20_35perc_train.csv
	Reading ./data/seed20_70perc_SSLabel.csv
	Reading ./data/seed20_SSLabel_HC.csv
	Reading ./data/seed20_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed20_123456.csv
1. Read files (seed=40)
	Reading ./data/seed40_35perc_train.csv
	Reading ./data/seed40_70perc_SSLabel.csv
	Reading ./data/seed40_SSLabel_HC.csv
	Reading ./data/seed40_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed40_123456.csv
1. Read files (seed=80)
	Reading ./data/seed80_35perc_train.csv
	Reading ./data/seed80_70perc_SSLabel.csv
	Reading ./data/seed80_SSLabel_HC.csv
	Reading ./data/seed80_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed80_123456.csv
1. Read files (seed=160)
	Reading ./data/seed160_35perc_train.csv
	Reading ./data/seed160_70perc_SSLabel.csv
	Reading ./data/seed160_SSLabel_HC.csv
	Reading ./data/seed160_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed160_123456.csv
1. Read files (seed=320)
	Reading ./data/seed320_35perc_train.csv
	Reading ./data/seed320_70perc_SSLabel.csv
	Reading ./data/seed320_SSLabel_HC.csv
	Reading ./data/seed320_holdOutTest.csv
2. Evaluate model with 35% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




3. Evaluate model with 70% data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




4. Evaluate model with HC pseudo-labeled data
	cross validate RandomForestClassifier(max_depth=100, n_estimators=50, random_state=123456) with RepeatedStratifiedKFold(n_repeats=5, n_splits=3, random_state=None)
	validate against test set




5. Store results.
Result saved to result_seed320_123456.csv


In [3]:
file1 = './data/seed2_35perc_train.Mat'
file2='./data/seed2_70perc_SSLabel.csv'
file3='./data/seed2_SSLabel_HC.csv'
X35, y35, xitest, yitest = read_Mat(file1)
X70, y70 = process_csv(file2)
XHC, yHC = process_csv(file3)

In [7]:
result35 = apply_CV(model, cv, X35, y35)
result_ts_35 = evaluate_against_test_set(model, X35, y35, xitest, yitest)



In [8]:
result70 = apply_CV(model, cv, X70, y70)
result_ts_70 = evaluate_against_test_set(model, X70, y70, xitest, yitest)



In [9]:
resultHC = apply_CV(model, cv, XHC, yHC)
result_ts_HC = evaluate_against_test_set(model, XHC, yHC, xitest, yitest)



In [10]:
result_df = pandas.DataFrame(data=[], columns=['Seed','Dataset','Validation','Metric','Value'])

In [11]:
result_df = append_cv_results(result_df, result35, 2, '35%')
result_df = append_testset_results(result_df, result_ts_35, 2, '35%')
result_df = append_cv_results(result_df, result70, 2, '70%')
result_df = append_testset_results(result_df, result_ts_70, 2, '70%')
result_df = append_cv_results(result_df, resultHC, 2, 'HC')
result_df = append_testset_results(result_df, result_ts_HC, 2, 'HC')

In [15]:
result_df

Unnamed: 0,Seed,Dataset,Validation,Metric,Value
0,2,35%,cross_val,balanced_acc,0.71
1,2,35%,cross_val,f1,0.81
2,2,35%,cross_val,roc_auc_ovr,0.99
3,2,35%,vs_testset,prec,0.86
4,2,35%,vs_testset,rec,0.88
5,2,35%,vs_testset,acc,0.88
6,2,35%,vs_testset,balanced_acc,0.903
7,2,70%,cross_val,balanced_acc,0.83
8,2,70%,cross_val,f1,0.9
9,2,70%,cross_val,roc_auc_ovr,1.0
