In [None]:
import torch
import torchvision
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os 
from warnings import simplefilter
import pandas as pd
import ast
import models

In [None]:
# Load the results table that every later cell filters and plots.
df = pd.read_csv('results/convnet_aucs.csv')
# To reproduce the best results / the data used in the paper, load
# results/cleaned_convent_aucs.csv instead.
# NOTE(review): "convent" may be a typo for "convnet" -- confirm the real file name.

In [None]:
def _parse_literal(cell):
    """Turn a CSV string such as "(100, 1)" into the Python literal it spells.

    Anything that is not a string is returned unchanged. That keeps this cell
    safe to re-run: once a column has been parsed its cells are no longer
    strings, and calling ast.literal_eval on them again would raise ValueError.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# These columns are stored as text in the CSV; convert them to real Python
# objects so they can be compared against values such as `ratio` later on.
for column in ('ratio', 'classes_used'):
    df[column] = df[column].apply(_parse_literal)

In [None]:
# Configuration for this analysis: which class ratio and normalization setting to inspect.
ratio = (100, 1)
norm = False

NUM_CLASSES = len(ratio)


def select_runs(name, match_ratio=True, cap=None):
    """Return the rows of ``df`` for one experiment under the current configuration.

    Rows always have to match ``name``, ``NUM_CLASSES`` and ``norm``.

    Parameters
    ----------
    name : str
        Value required in the ``name`` column.
    match_ratio : bool, default True
        When True, additionally require the ``ratio`` column to equal the
        module-level ``ratio`` tuple.
    cap : float or None, default None
        When given, additionally require the ``cap`` column to equal this value.

    Returns
    -------
    pandas.DataFrame
        The matching rows (possibly empty).
    """
    mask = (
        (df['name'] == name)
        & (df['num_classes'] == NUM_CLASSES)
        & (df['normalization'] == norm)
    )
    if match_ratio:
        # Compare cell by cell so the tuple is always treated as one value,
        # never as a sequence to be aligned with the column.
        mask = mask & df['ratio'].apply(lambda value: value == ratio).astype(bool)
    if cap is not None:
        mask = mask & (df['cap'] == cap)
    return df.loc[mask]


# Baselines. 'normal' is selected without a ratio constraint.
normal_df = select_runs('normal', match_ratio=False)
ratio_df = select_runs('ratio')
oversampled_df = select_runs('oversampled')
weighted_df = select_runs('weighted')
undersampled_df = select_runs('undersampled')

smote_df = select_runs('smote')

# Capped SMOTE, one frame per cap value.
capped_smote10_df = select_runs('capped_smote', cap=10.0)
capped_smote5_df = select_runs('capped_smote', cap=5.0)
capped_smote_df = select_runs('capped_smote', cap=1.0)

distance_capped_smote_df = select_runs('distance_capped_smote', cap=1.0)
distance_capped_smote5_df = select_runs('distance_capped_smote', cap=5.0)
distance_capped_smote10_df = select_runs('distance_capped_smote', cap=10.0)

cosine_distance_capped_smote_df = select_runs('cosine_distance_capped_smote', cap=1.0)
cosine_distance_capped_smote10_df = select_runs('cosine_distance_capped_smote', cap=10.0)
cosine_distance_capped_smote5_df = select_runs('cosine_distance_capped_smote', cap=5.0)
# The following three variants are selected without a cap constraint.
cosine_distance_capped_smote_avg_df = select_runs('cosine_distance_capped_smote_avg')
cosine_distance_capped_smote_with_triplet_loss_df = select_runs('cosine_distance_capped_smote_with_triplet_loss')
cosine_distance_capped_smote_with_smote_triplet_loss_df = select_runs('cosine_distance_capped_smote_with_smote_triplet_loss')

# 'triplet_loss' is selected without a ratio constraint, like 'normal'.
triplet_loss_df = select_runs('triplet_loss', match_ratio=False)
triplet_loss_capped_smote_df = select_runs('triplet_loss_capped_smote', cap=1.0)
triplet_loss_capped_smote5_df = select_runs('triplet_loss_capped_smote', cap=5.0)
triplet_loss_capped_smote10_df = select_runs('triplet_loss_capped_smote', cap=10.0)


In [None]:
# One entry per sweep below, in sweep order; an entry is an empty Series when
# the sweep had no usable rows. Later cells filter on len(row) > 0.
best_rows = []

epochs = [0, 10, 20, 30]
mean_cols = ["mean_" + str(epoch) for epoch in epochs]
variance_cols = ["variance_" + str(epoch) for epoch in epochs]


def plot_learning_rate_sweep(runs, title, prefix_ratio=True, name_suffix=""):
    """Plot every row of one experiment and return the row with the best final mean.

    Each row of ``runs`` becomes one error-bar curve over ``epochs``, labelled
    with that row's ``learning_rate``. The "best" row is the one with the
    highest value in the last mean column; ties go to the later row, and only
    values >= 0 are considered.

    Parameters
    ----------
    runs : pandas.DataFrame
        Rows to plot; must contain ``mean_cols``, ``variance_cols``,
        ``learning_rate``, ``name`` and (if ``prefix_ratio``) ``ratio``.
    title : str
        Figure title, or the part that follows the ratio when ``prefix_ratio``.
    prefix_ratio : bool, default True
        Prepend ``str(row['ratio'])`` to ``title``.
    name_suffix : str, default ""
        Appended to the returned row's ``name`` so that variants sharing a
        ``name`` in the table (e.g. different caps) stay distinguishable.

    Returns
    -------
    pandas.Series
        The best row (renamed if ``name_suffix`` is set), or an empty Series
        when ``runs`` is empty or no row qualified. Nothing is drawn for an
        empty ``runs``.
    """
    best_row = pd.Series(dtype=object)
    if runs.empty:
        # Nothing to draw: skip the figure instead of showing blank axes and
        # building a title from a row that belongs to some other sweep.
        return best_row

    best_mean = 0
    fig, ax = plt.subplots()
    for _, run in runs.iterrows():
        means = [run[col] for col in mean_cols]
        variances = [run[col] for col in variance_cols]
        # NOTE(review): the variance_* columns are used directly as error-bar
        # half-lengths; confirm whether a standard deviation was intended.
        ax.errorbar(epochs, means, yerr=variances, label=run['learning_rate'])
        if run[mean_cols[-1]] >= best_mean:
            best_row = run
            best_mean = run[mean_cols[-1]]

    # `run` is the last plotted row here; runs is non-empty so it is always bound.
    ax.set_title((str(run['ratio']) if prefix_ratio else "") + title)
    ax.legend()
    plt.show()

    if name_suffix and len(best_row) > 0:
        # Work on a copy so the rename cannot touch anything else.
        best_row = best_row.copy()
        best_row['name'] = best_row['name'] + name_suffix
    return best_row


# (runs, title, prefix_ratio, name_suffix)
sweeps = [
    (normal_df, "Normal", False, ""),
    (ratio_df, " Ratio", True, ""),
    (oversampled_df, " Oversampling", True, ""),
    (undersampled_df, " Undersampling", True, ""),
    (weighted_df, " Weighted Loss", True, ""),
    (smote_df, " SMOTE", True, ""),
    (capped_smote10_df, " Cap 10 SMOTE", True, "10"),
    (capped_smote5_df, " Cap 5 SMOTE", True, "5"),
    (capped_smote_df, " Cap 1 SMOTE", True, "1"),
    (distance_capped_smote_df, " Capped SMOTE with Euclidean Distance", True, ""),
    (distance_capped_smote5_df, " Capped SMOTE with Euclidean Distance (5)", True, "5"),
    (distance_capped_smote10_df, " Capped SMOTE with Euclidean Distance (10)", True, "10"),
    (cosine_distance_capped_smote_df, " Capped SMOTE with Cosine Distance", True, ""),
    (cosine_distance_capped_smote5_df, " Capped SMOTE with Cosine Distance (5)", True, "5"),
    (cosine_distance_capped_smote10_df, " Capped SMOTE with Cosine Distance (10)", True, "10"),
    (cosine_distance_capped_smote_avg_df, " Capped SMOTE with Cosine Distance Average", True, ""),
    (cosine_distance_capped_smote_with_smote_triplet_loss_df, " Capped SMOTE with Cosine Distance Average and SMOTE Triplet Loss", True, ""),
    (cosine_distance_capped_smote_with_triplet_loss_df, " Capped SMOTE with Cosine Distance Average and Triplet Loss", True, ""),
    (triplet_loss_df, "Triplet Loss (no ratio)", False, ""),
    (triplet_loss_capped_smote_df, " Capped SMOTE Using Triplet Loss", True, ""),
    (triplet_loss_capped_smote5_df, " Capped SMOTE Using Triplet Loss (5)", True, "5"),
    (triplet_loss_capped_smote10_df, " Capped SMOTE Using Triplet Loss (10)", True, "10"),
]

for sweep_runs, sweep_title, sweep_prefix_ratio, sweep_suffix in sweeps:
    best_rows.append(
        plot_learning_rate_sweep(
            sweep_runs,
            sweep_title,
            prefix_ratio=sweep_prefix_ratio,
            name_suffix=sweep_suffix,
        )
    )

In [None]:
# narrow down which data to graph: keep exactly one rows_to_graph line active

# rows_to_graph = ['normal', 'ratio', 'oversampled', 'undersampled', 'weighted']
# rows_to_graph = ['smote', 'capped_smote1', 'capped_smote5', 'capped_smote10']
# rows_to_graph = ['smote', 'distance_capped_smote5', 'cosine_distance_capped_smote', 'cosine_distance_capped_smote5', 'cosine_distance_capped_smote_avg']
# rows_to_graph = ['smote', 'triplet_loss_capped_smote5', 'cosine_distance_capped_smote_with_triplet_loss', 'cosine_distance_capped_smote_with_smote_triplet_loss']
rows_to_graph = ['normal', 'ratio', 'oversampled', 'smote', 'cosine_distance_capped_smote_with_smote_triplet_loss']

# rename experiments to appear on graph
# Every selected experiment defaults to its own name; the overrides below
# supply nicer legend labels where one has been written.
names = {name: name for name in rows_to_graph}
names.update({
    'normal': 'no class imbalance',
    'ratio': 'vanilla',
    'capped_smote1': 'constant capped SMOTE (1)',
    'capped_smote5': 'constant capped SMOTE (5)',
    # Keyed on 'capped_smote10' so the cap-10 label no longer overwrites the cap-1 entry.
    'capped_smote10': 'constant capped SMOTE (10)',
    'cosine_distance_capped_smote_avg': 'cosine distance capped SMOTE using average tensor',
    'cosine_distance_capped_smote_with_triplet_loss': 'cosine distance capped SMOTE with triplet loss',
    'cosine_distance_capped_smote_with_smote_triplet_loss': 'cosine distance capped SMOTE with SMOTE triplet loss',
})

# Keep only the best rows of the selected experiments; empty placeholders
# (sweeps that had no data) are dropped by the length check.
rows = [row for row in best_rows if len(row) > 0 and row['name'] in rows_to_graph]


In [None]:
# graphing best AUCs for experiments

# A ratio tuple with one entry of 1 per class, e.g. (1, 1) for two classes.
# Defined for any NUM_CLASSES; it is not read anywhere else in this cell.
balanced = (1,) * NUM_CLASSES

fig, ax = plt.subplots()
for row in rows:
    means = [row['mean_' + str(epoch)] for epoch in epochs]
    variances = [row['variance_' + str(epoch)] for epoch in epochs]
    # The legend shows the display name followed by the best mean over all epochs.
    ax.errorbar(epochs, means, yerr=variances, label=names[row['name']] + " " + str(round(max(means), 4)))
ax.set_title("Baseline Test AUCs")
ax.set_ylabel("AUC")
ax.set_xlabel("Epoch")
ax.legend(loc="lower right")
plt.show()