In [None]:
# Notebook setup (IPython magics only).
# Load the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
from Trainers.DatasetBC import *
from ExperimentsBC import *
from common_code.plotting import *

In [None]:
# Human-readable labels for the plots, keyed by the short dataset identifier.
dataset_names = {
    'sst': 'SST',
    'imdb': 'IMDB',
    'agnews': 'AG News',
    'tweet': 'ADR',
    'snli': 'SNLI',
    '20News_sports': '20 News Sports',
    'diab': 'Diabetes',
    'cnn': 'CNN',
    'anemia': 'Anemia',
    'babi_1': 'bAbI 1',
    'babi_2': 'bAbI 2',
    'babi_3': 'bAbI 3',
}

# Labels in the order the bar plots list the datasets.
dataset_order = [
    dataset_names[key]
    for key in (
        'sst', 'imdb', 'tweet', 'agnews', '20News_sports', 'diab',
        'anemia', 'cnn', 'babi_1', 'babi_2', 'babi_3', 'snli',
    )
]

In [None]:
import os
import pandas as pd
import seaborn as sns

# Gather the 'Overall' row of every result table whose file name contains the
# model tag, keyed by the dataset prefix (text before the first '+').
model = 'lstm+tanh'
models = [x for x in os.listdir('graph_outputs/CorrGL-AG_kendalltau/') if model in x]
dfs = {
    d.split('+')[0]: pd.read_csv('graph_outputs/CorrGL-AG_kendalltau/' + d, index_col=0).loc['Overall']
    for d in models
}

# One row per dataset, relabelled with the display names.
dfs = pd.DataFrame(dfs).T
dfs.index = dfs.index.map(lambda key : dataset_names[key])

# Horizontal bars of the per-dataset 'mean' column.
fig = sns.barplot(x=dfs['mean'], y=dfs.index, order=dataset_order)

# Vertical line at the mean over datasets; xmin/xmax hold the y-axis limits.
xmin, xmax = fig.axes.get_ylim()
fig.axes.vlines(dfs['mean'].mean(), xmin, xmax, colors='#222222')
sns.despine()

plt.xlabel("Mean Difference between Correlations", fontsize=25)
plt.ylabel("Dataset", fontsize=25)
plt.ylim(xmin, xmax)
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
# plt.gcf().set_size_inches(15, 15)
plt.savefig('graph_outputs/CorrGL-AG_summary.pdf', bbox_inches='tight', pad_inches=0)

In [None]:
import os

import pandas as pd
import seaborn as sns

# Compare the 'ag' row of the CorrStats tables between the average+tanh and
# lstm+tanh models, one bar per dataset (average minus lstm).
listing = sorted(os.listdir('graph_outputs/CorrStats_kendalltau/'))
models = [x for x in listing if 'lstm+tanh' in x]
models_avg = [x for x in listing if 'average+tanh' in x]

# Pair the two result files by dataset prefix (text before the first '+').
# Pairing by position with zip() combines files from different datasets as
# soon as the two lists differ in length or order, and os.listdir() gives no
# ordering guarantee.
lstm_files = {x.split('+')[0]: x for x in models}
avg_files = {x.split('+')[0]: x for x in models_avg}
unpaired = sorted(set(lstm_files) ^ set(avg_files))
if unpaired :
    print('Skipping datasets without both lstm+tanh and average+tanh results:', unpaired)

dfs = {}
for name in sorted(set(lstm_files) & set(avg_files)) :
    df = pd.read_csv('graph_outputs/CorrStats_kendalltau/' + lstm_files[name], index_col=0).rename(columns=lambda x : x + '_lstm')
    df_1 = pd.read_csv('graph_outputs/CorrStats_kendalltau/' + avg_files[name], index_col=0).rename(columns=lambda x : x + '_avg')
    # Side-by-side columns: <col>_lstm and <col>_avg for the same dataset.
    df = pd.concat([df, df_1], axis=1)
    dfs[name] = df.loc['ag']

# One row per dataset, relabelled with the display names.
dfs = pd.DataFrame(dfs).transpose()
dfs.index = dfs.index.map(lambda x : dataset_names[x])

mean_diff = dfs['mean_avg'] - dfs['mean_lstm']
fig = sns.barplot(y=dfs.index, x=mean_diff, order=dataset_order)

# xmin/xmax hold the y-axis limits; the vertical line marks the mean
# difference over datasets.
xmin, xmax = fig.axes.get_ylim()
fig.axes.vlines(mean_diff.mean(), xmin + 0.5, xmax - 0.5, colors='#222222')
sns.despine()
plt.xlabel("Mean Difference between Correlations", fontsize=25)
plt.ylabel("Dataset", fontsize=25)
plt.ylim(xmin, xmax)
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
# plt.savefig('graph_outputs/CorrAL(avg-lstm)_summary.pdf', bbox_inches='tight')
plt.show()

In [None]:
from Trainers.DatasetQA import *
from ExperimentsQA import *

In [None]:
# For every entry in `datasets`: build the dataset, point it at the
# 'outputs_dev' base path and generate graphs for the six listed encoders.
# NOTE(review): `datasets` is not defined in this notebook; presumably it
# comes from the star import of ExperimentsQA - confirm.
for k in datasets :
    print(k)
    dataset = datasets[k]()
    dataset.basepath = 'outputs_dev'
    generate_graphs_on_encoders(dataset, ['cnn', 'lstm', 'average', 'cnn_dot', 'lstm_dot', 'average_dot'])
    # Long separator line between datasets in the cell output.
    print('+'*700)

In [None]:
# Instantiate the 'mortality' entry of `datasets_ehr`; the following cells
# operate on this `dataset`.
dataset = datasets_ehr['mortality']()

In [None]:
from Trainers.PlottingBC import *

In [None]:
# Run the evaluator on the latest 'lstm' model, load its stored
# 'logodds_attention' results and plot the attention differences on the test
# data. The three returned sequences are indexed per test example by the
# cells below.
evaluator = run_evaluator_on_latest_model(dataset, config='lstm')
logodds_results = pload(evaluator.model, 'logodds_attention')
emax_jds, emax_adv_attn, emax_ad_y = plot_attn_diff(dataset, dataset.test_data, logodds_results, 
                                                    save_name='logodds_subs', dirname=evaluator.model.dirname)

In [None]:
# Inspect one test example: format its words with the model attention
# (attn_hat) and the alternative attention (emax_adv_attn) as LaTeX.
n = 308
a = print_adversarial_example(
    dataset.vec.map2words(dataset.test_data.X[n]),
    dataset.test_data.attn_hat[n],
    emax_adv_attn[n],
    latex=True,
)
# A bare expression is only displayed when it is the last line of a cell, so
# print the two predictions explicitly instead of discarding them.
print(dataset.test_data.yt_hat[n], emax_ad_y[n])
print(a[2])

In [None]:
# Largest value of each entry of emax_adv_attn, plotted against emax_jds.
ent = list(map(np.max, emax_adv_attn))
plt.scatter(ent, emax_jds)

In [None]:
# Example indices where `ent` exceeds 0.6 and `emax_jds` exceeds 0.4.
idx_1 = set(np.nonzero(np.array(ent) > 0.6)[0]).intersection(
    np.nonzero(np.array(emax_jds) > 0.4)[0]
)
idx_1

In [None]:
# Train on `dataset` with 'logodds_lstm_reg' (presumably a config name -
# confirm against train_dataset).
train_dataset(dataset, 'logodds_lstm_reg')

In [None]:
# Run the log-odds experiment on `dataset` for 'logodds_lstm_reg'.
run_logodds_experiment(dataset, 'logodds_lstm_reg')

In [None]:
# Run the experiments on the latest 'logodds_lstm_reg' model for `dataset`.
run_experiments_on_latest_model(dataset, 'logodds_lstm_reg')

In [None]:
# Generate the graphs for the latest 'logodds_lstm_reg' model on `dataset`.
generate_graphs_on_latest_model(dataset, 'logodds_lstm_reg')

In [None]:
# Run the log-odds substitution experiment on `dataset`.
# NOTE(review): the cells below read opp_* fields from dataset.test_data;
# presumably this call populates them - confirm.
run_logodds_substitution_experiment(dataset)

In [None]:
# Scatter the first column of yt_hat against the first column of opp_yt_hat
# for the test data.
test_data = dataset.test_data
plt.scatter(test_data.yt_hat[:, 0], test_data.opp_yt_hat[:, 0], s=5)
plt.show()

In [None]:
# Absolute gap between the two predictions (first column only), then the
# indices of the 15 examples with the smallest gap.
diff = np.abs(test_data.yt_hat[:, 0] - test_data.opp_yt_hat[:, 0])
np.argsort(diff)[:15]

In [None]:
# Distribution of the absolute prediction gaps, in 30 bins.
plt.hist(diff, bins=30)

In [None]:
# Print the words for every value in element 0 of test_data.logodds_combined.
test_data = dataset.test_data
for k, v in test_data.logodds_combined[0].items() :
    print(dataset.vec.map2words(v))

In [None]:
# For test example n, show the original words with attn_hat and its yt_hat,
# then the opp_X words with opp_attn and its opp_yt_hat.
# `test_data` and `dataset.test_data` are the same object when the previous
# cell has been run.
n = 4231
true_X = dataset.vec.map2words(test_data.X[n])
new_X = dataset.vec.map2words(test_data.opp_X[n])
print_attn(true_X, dataset.test_data.attn_hat[n])
print(test_data.yt_hat[n])
print_attn(new_X, dataset.test_data.opp_attn[n])
print(test_data.opp_yt_hat[n])

In [None]:
# Generate graphs on the latest model for every entry in `datasets_ehr`.
# After the loop `dataset` refers to the last one built.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
    generate_graphs_on_latest_model(dataset)

In [None]:
# Only instantiates every entry in `datasets_ehr`; nothing is done with the
# result beyond leaving the last one in `dataset`.
# NOTE(review): presumably kept to trigger dataset loading - confirm or remove.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()

In [None]:
# Train with 'logodds_lstm' on every entry in `datasets_ehr`.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
    train_dataset(dataset, 'logodds_lstm')

In [None]:
# Run the evaluator on the latest 'logodds_lstm' model of every entry in
# `datasets_ehr`; the returned evaluator is not kept.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
    run_evaluator_on_latest_model(dataset, 'logodds_lstm')

In [None]:
from model.LR import LR

In [None]:
# Train the LR model on the 'pheno' dataset only; all other entries are skipped.
for k in datasets_ehr :
    if k == 'pheno' :
        dataset = datasets_ehr[k]()
        train_lr_on_dataset(dataset)

In [None]:
# Call push_all_models for every entry in `datasets_ehr`, passing the
# dataset's own keys_to_use.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
    push_all_models(dataset, dataset.keys_to_use)

In [None]:
# Run the log-odds experiment for every entry in `datasets_ehr`.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
    run_logodds_experiment(dataset)

In [None]:
# Show the label headers of the current `dataset` with their positions.
list(enumerate(dataset.vec.label_headers))

In [None]:
# Compute top words for the 'lstm' config on the current `dataset`.
# NOTE(review): a later cell reads dataset.test_data.top_words_attn;
# presumably this call populates it - confirm.
get_top_words(dataset, config='lstm')

In [None]:
# First-axis indices of test predictions above 0.8.
y = np.array(dataset.test_data.yt_hat)
idx_y = np.nonzero(y > 0.8)[0]

In [None]:
from collections import defaultdict

# Sum each word's attention weight over the examples selected in idx_y.
top_words_dict = defaultdict(float)
for i in idx_y :
    for word, weight in dataset.test_data.top_words_attn[i].items() :
        top_words_dict[word] += weight

In [None]:
# Keep the 20 words with the largest accumulated weight, in ascending order.
top_words = {
    word: weight
    for word, weight in sorted(top_words_dict.items(), key=lambda item: item[1])[-20:]
}

In [None]:
# Display the selected top words and their accumulated weights.
top_words

In [None]:
# Print 40 features of the LR model.
# NOTE(review): `lr` is never assigned in the visible cells
# (train_lr_on_dataset's result is not stored); this only works with leftover
# kernel state or a star import - confirm where it comes from.
lr.print_all_features(n=40)

In [None]:
# Word sets to compare: attention top words vs. the LR model's 20 features.
top_words_attn = set(top_words)
top_words_lr = set(lr.get_features(n=20))

In [None]:
# Words that appear in both sets.
top_words_attn & top_words_lr

In [None]:
# Overlap ratio: size of the intersection divided by size of the union.
len(top_words_attn.intersection(top_words_lr)) / len(top_words_attn.union(top_words_lr))

In [None]:
# Display the keys_to_use attribute of the current `dataset`.
dataset.keys_to_use

Multi Adversarial Examples
==========================

In [None]:
# For every entry in `datasets_ehr`, push all models using the dataset's
# keys_to_use. The generation and graphing steps are currently disabled.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
#     generate_adversarial_examples(dataset, config='lstm')
#     generate_logodds_examples(dataset, config='lstm')
#     generate_graphs_on_latest_model(dataset, config='lstm')
    push_all_models(dataset, keys=dataset.keys_to_use)

In [None]:
# Display the label headers of the current `dataset`.
dataset.vec.label_headers

In [None]:
# Show the statistics of every entry in `datasets_ehr`.
for k in datasets_ehr :
    dataset = datasets_ehr[k]()
    dataset.display_stats()

In [None]:
# Number of test examples in the current `dataset`.
len(dataset.test_data.X)

In [None]:
# Run the evaluator on the latest model of the current `dataset`, with no
# config argument given.
evaluator = run_evaluator_on_latest_model(dataset)

In [None]:
# Load the stored 'multi_adversarial' outputs of the evaluated model.
multi_adversarial_outputs = pload(evaluator.model, 'multi_adversarial')

In [None]:
# Plot the multi-adversarial results for the test data; output goes to the
# current directory (dirname=".").
from Trainers.PlottingBC import *
test_data = dataset.test_data
emax_jds, emax_adv_attn, emax_ad_y = plot_multi_adversarial(test_data.X, test_data.yt_hat, 
                                                            test_data.attn_hat, multi_adversarial_outputs, dirname=".")

In [None]:
# Print adversarial examples from the three sequences returned by
# plot_multi_adversarial, with by_class=None and dirname='.'.
print_adversarial_examples(dataset, test_data.X, test_data.yt_hat, test_data.attn_hat, 
                           emax_jds, emax_adv_attn, emax_ad_y, by_class=None, dirname='.')