In [None]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

In [None]:
import pandas as pd, numpy as np, seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
import wandb
from travis_attack.utils import display_all
from travis_attack.config import Config
from pprint import pprint
from pandas.io.json._normalize import nested_to_record    


In [None]:
api = wandb.Api()

In [None]:
# Collect every W&B run tagged "final" and flatten the summary/config fields we analyse.
runs = api.runs("uts_nlp/travis_attack", filters={"tags": {"$in": ["final"]}})
summary_params = ['baseline_test', 'any_adv_example_proportion-test',  'is_adv_example-mean-test', 'n_pp-test']
config_params = ['decode_method_eval', 'dataset_name', 'seed', 'gen_params_train']

# The only flattened `baseline_test.*` entries that are kept in a summary record.
baseline_keep = ('baseline_test.any_adv_example_proportion', 'baseline_test.is_adv_example-mean')
# Flattened generation parameters that are dropped from every config record.
config_drop = ('gen_params_train.top_p', 'gen_params_train.do_sample')

summary_list, config_list, name_list = [], [], []
for run in runs:
    # run.summary holds the logged output metrics; ._json_dict is used to omit large files.
    raw_summary = run.summary._json_dict
    flat_summary = nested_to_record({key: raw_summary[key] for key in summary_params})
    summary_list.append({
        key: val for key, val in flat_summary.items()
        if 'baseline_test' not in key or key in baseline_keep
    })

    # run.config holds the hyperparameters; keep only the whitelisted ones, flattened.
    flat_config = nested_to_record({key: val for key, val in run.config.items() if key in config_params})
    for key in config_drop:
        flat_config.pop(key)  # KeyError here means the run never logged this parameter
    config_list.append(flat_config)

    # run.name is the human-readable name of the run.
    name_list.append(run.name)

# One row per run; `summary` and `config` hold the flattened dicts built above.
runs_df = pd.DataFrame({"summary": summary_list, "config": config_list, "name": name_list})

In [None]:
# One row per run: run name + flattened config columns + flattened summary columns.
config_cols = pd.DataFrame.from_records(runs_df["config"])
summary_cols = pd.DataFrame.from_records(runs_df["summary"])
res_df = pd.concat(objs=[runs_df['name'], config_cols, summary_cols], axis=1)

# Per-example success rate times number of paraphrases.
res_df['avg_num_successes'] = res_df['is_adv_example-mean-test'] * res_df['n_pp-test']

# Explicit category lists fix the order used when sorting and grouping below.
decode_order = ["sample","beam_search","diverse_beam_search_low_diversity","diverse_beam_search_high_diversity"]
dataset_order = ["rotten_tomatoes","financial"]
res_df['decode_method_eval'] = pd.Categorical(res_df['decode_method_eval'], categories=decode_order, ordered=False)
res_df['dataset_name'] = pd.Categorical(res_df['dataset_name'], categories=dataset_order, ordered=False)

res_df = res_df.sort_values(['dataset_name', 'decode_method_eval', 'gen_params_train.temperature'])

In [None]:
def replace_names(df): 
    """Swap raw dataset / decoding-method identifiers for display labels.

    The ``dataset_name`` and ``decode_method_eval`` columns of ``df`` are
    overwritten with categoricals whose categories are the display labels in
    presentation order. Values that are not raw identifiers are left as they
    are before the categorical conversion, so anything that is not one of the
    display labels ends up as NaN.

    Columns are reassigned explicitly instead of calling
    ``df[col].replace(..., inplace=True)``: that form updates a temporary
    Series and does not reliably write back to ``df`` (it never does under
    pandas copy-on-write).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``dataset_name`` and ``decode_method_eval`` columns
        (object or categorical dtype). Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    # dict order doubles as the category (display) order
    dataset_labels = {"rotten_tomatoes": "RT", "financial": "FP"}
    decode_labels = {"sample": "Sample", "beam_search": "Beam search",
                     "diverse_beam_search_low_diversity": "Diverse beam search (6 beam groups)",
                     "diverse_beam_search_high_diversity": "Diverse beam search (48 beam groups)"}
    for col, labels in (('dataset_name', dataset_labels), ('decode_method_eval', decode_labels)):
        # astype(object) lets categorical input be relabelled like plain strings
        renamed = df[col].astype(object).map(lambda v, labels=labels: labels.get(v, v))
        df[col] = pd.Categorical(renamed, categories=list(labels.values()), ordered=False)
    return df 

In [None]:
res_df = replace_names(res_df)

In [None]:
# Sanity check: every experiment condition should have 3 runs (one per seed).
condition_cols = ['dataset_name', 'decode_method_eval', 'gen_params_train.temperature']
grouped_df = res_df.groupby(condition_cols)
grouped_df['seed'].count()

dataset_name  decode_method_eval                    gen_params_train.temperature
RT            Sample                                0.85                            3
                                                    1.15                            3
              Beam search                           0.85                            3
                                                    1.15                            3
              Diverse beam search (6 beam groups)   0.85                            3
                                                    1.15                            3
              Diverse beam search (48 beam groups)  0.85                            3
                                                    1.15                            3
FP            Sample                                0.85                            3
                                                    1.15                            3
              Beam search                           0.85   

## Effect of changing temp 

In [None]:
# Baseline vs. trained attack success per experiment condition: mean, max and std
# over the runs in each group, expressed as percentages.
metric_cols = ['baseline_test.any_adv_example_proportion', 'any_adv_example_proportion-test']
# Select several columns with a list: tuple-style `grouped[a, b]` is deprecated/removed in pandas.
metric_vals = grouped_df[metric_cols].agg(['mean', 'max','std']).round(3)*100
# Flatten the (metric, statistic) column MultiIndex into "<metric>_<statistic>" names.
metric_vals.columns =  ["_".join(a) for a in metric_vals.columns.to_flat_index()]
metric_vals = metric_vals.reset_index()
# Display sorted by dataset, then best run first. Previously the sorted frame was
# computed and discarded, so the unsorted table was shown. `metric_vals` itself is
# left in its original order.
metric_vals.sort_values(['dataset_name', 'any_adv_example_proportion-test_max'], ascending=[False,False])


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,dataset_name,decode_method_eval,gen_params_train.temperature,baseline_test.any_adv_example_proportion_mean,baseline_test.any_adv_example_proportion_max,baseline_test.any_adv_example_proportion_std,any_adv_example_proportion-test_mean,any_adv_example_proportion-test_max,any_adv_example_proportion-test_std
0,RT,Sample,0.85,29.3,30.6,1.2,38.1,41.8,3.9
1,RT,Sample,1.15,29.3,30.6,1.2,38.3,46.8,8.2
2,RT,Beam search,0.85,14.5,14.5,0.0,61.9,85.5,21.3
3,RT,Beam search,1.15,14.5,14.5,0.0,60.8,85.5,26.3
4,RT,Diverse beam search (6 beam groups),0.85,20.6,20.6,0.0,39.9,43.2,4.9
5,RT,Diverse beam search (6 beam groups),1.15,20.6,20.6,0.0,52.6,57.7,8.8
6,RT,Diverse beam search (48 beam groups),0.85,24.5,24.5,0.0,52.9,66.0,12.0
7,RT,Diverse beam search (48 beam groups),1.15,24.5,24.5,0.0,37.4,41.5,4.0
8,FP,Sample,0.85,19.1,20.8,1.6,66.0,72.3,5.4
9,FP,Sample,1.15,19.1,20.8,1.6,62.7,83.6,33.6


### Mean and std  

In [None]:
# Mean and std of the headline metrics per training temperature.
# Columns are selected with a list: tuple-style `grouped[a, b, c]` is deprecated/removed in pandas.
res_df.groupby(['gen_params_train.temperature'])[['any_adv_example_proportion-test', 'is_adv_example-mean-test', 'n_pp-test']].agg(['mean', 'std'])


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,any_adv_example_proportion-test,any_adv_example_proportion-test,is_adv_example-mean-test,is_adv_example-mean-test,n_pp-test,n_pp-test
Unnamed: 0_level_1,mean,std,mean,std,mean,std
gen_params_train.temperature,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.85,0.579376,0.199876,0.226472,0.164152,39.870633,10.591294
1.15,0.571996,0.224978,0.230275,0.186917,39.412261,11.296676


No difference in mean. Std is a bit higher for 1.15 but that's about all. I don't know if it would be significant. 

### Does the agent see more different n-grams during training?

* pick 3 runs from each with same variables (just for now)
* get training step table 
* limit to same epochs that all have 
* get distinct n-grams of each
* plot them

In [None]:
from travis_attack.insights import get_training_dfs, _prepare_df_concat
from nltk import ngrams
from itertools import chain


In [None]:
def get_num_unigue_ngrams(df, n=1): 
    """Return how many distinct word n-grams occur in the ``pp`` column of ``df``.

    Each non-missing entry of ``df['pp']`` is split on whitespace and turned
    into n-grams of length ``n``; missing entries contribute nothing. The
    result is the size of the union of all those n-grams.
    """
    distinct = set()
    for sentence in df['pp']:
        if pd.isna(sentence):
            continue
        distinct.update(ngrams(sentence.split(), n))
    return len(distinct)

In [None]:

# For every run, count distinct 1-/2-/3-grams in its training_step.csv:
# once over the whole file and once per epoch.
cfg = Config()
results_d = dict()

for run_name in res_df['name'].tolist():
    print(run_name)
    df_training_step = pd.read_csv(f"{cfg.path_checkpoints}{run_name}/training_step.csv")
    by_epoch = df_training_step.groupby(['epoch'])
    count_d = dict()
    for n in (1, 2, 3):
        count_d[f"{n}gram_total"] = get_num_unigue_ngrams(df_training_step, n=n)
        count_d[f"{n}gram_series"] = by_epoch.apply(get_num_unigue_ngrams, n=n)
    results_d[run_name] = count_d

polished-sweep-31
divine-sweep-32
glowing-sweep-31
rose-sweep-34
wild-sweep-33
lemon-sweep-32
rare-sweep-43
stoic-sweep-42
crisp-sweep-41
copper-yogurt-806
crimson-sweep-46
royal-sweep-45
devoted-sweep-37
comfy-sweep-36
gentle-sweep-35
usual-sweep-40
polar-sweep-39
woven-sweep-38
avid-sweep-27
unique-sweep-26
revived-sweep-25
cosmic-sweep-30
morning-sweep-29
apricot-sweep-28
zany-surf-809
absurd-sweep-8
vital-sweep-7
faithful-dust-807
rich-sweep-11
flowing-sweep-10
ancient-sweep-21
woven-sweep-20
happy-sweep-19
grateful-sweep-24
restful-sweep-23
mild-sweep-22
devout-sweep-15
silvery-sweep-14
balmy-sweep-13
crisp-sweep-18
dashing-sweep-17
sandy-sweep-16
proud-sweep-3
ruby-sweep-2
earthy-sweep-1
magic-sweep-6
deep-sweep-5
fallen-sweep-4


In [None]:
import pickle

# Reload the cached n-gram counts instead of recomputing them.
# The cache was written from `results_d` with:
#   with open('./results/ngram_temp.pkl', 'wb') as handle:
#       pickle.dump(results_d, handle, protocol=pickle.HIGHEST_PROTOCOL)
# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
with open('./results/ngram_temp.pkl', 'rb') as cache_file:
    results_d = pickle.load(cache_file)

In [None]:
results_d

{'polished-sweep-31': {'1gram_total': 18796,
  '1gram_series': epoch
  1      4475
  2      4122
  3      3766
  4      4002
  5      4161
         ... 
  149    4300
  150    4345
  151    4346
  152    4344
  153    4299
  Length: 153, dtype: int64,
  '2gram_total': 121705,
  '2gram_series': epoch
  1      11273
  2      10036
  3       8492
  4       9338
  5      10089
         ...  
  149    10028
  150    10151
  151    10225
  152    10203
  153    10064
  Length: 153, dtype: int64,
  '3gram_total': 265835,
  '3gram_series': epoch
  1      13199
  2      11450
  3       9297
  4      10408
  5      11524
         ...  
  149    11963
  150    12108
  151    12216
  152    12172
  153    11970
  Length: 153, dtype: int64},
 'divine-sweep-32': {'1gram_total': 21288,
  '1gram_series': epoch
  1      4522
  2      4406
  3      4348
  4      4260
  5      4200
         ... 
  196    4276
  197    4227
  198    4238
  199    4310
  200    4284
  Length: 200, dtype: int64,
  '2gram_to

In [None]:
# Gather each run's per-epoch bigram counts as a column named after the run.
bigram_series = []
for run_name, count_d in results_d.items():
    series = count_d['2gram_series']
    series.name = run_name
    bigram_series.append(series)

# Wide (epoch x run) -> long (epoch, run_name, count), then attach run metadata.
df1 = pd.concat(bigram_series, axis=1).reset_index()
df_long = (
    df1.melt(id_vars='epoch', var_name='run_name', value_name='num_distinct_ngrams')
       .merge(res_df[['name', 'dataset_name',"gen_params_train.temperature"]],
              how='left', left_on='run_name', right_on='name')
)

# Average over runs sharing dataset, epoch and training temperature.
df_agg = (
    df_long.groupby(['dataset_name','epoch', "gen_params_train.temperature"])['num_distinct_ngrams']
           .mean()
           .reset_index()
)
px.line(data_frame=df_agg, x='epoch', y='num_distinct_ngrams', color="gen_params_train.temperature", line_group='dataset_name')

We have a small difference but not much more than that. 

In [None]:
df_long

Unnamed: 0,epoch,run_name,value,name,gen_params_train.temperature
0,1,polished-sweep-31,11273.0,polished-sweep-31,0.85
1,2,polished-sweep-31,10036.0,polished-sweep-31,0.85
2,3,polished-sweep-31,8492.0,polished-sweep-31,0.85
3,4,polished-sweep-31,9338.0,polished-sweep-31,0.85
4,5,polished-sweep-31,10089.0,polished-sweep-31,0.85
...,...,...,...,...,...
3195,196,usual-sweep-40,,usual-sweep-40,1.15
3196,197,usual-sweep-40,,usual-sweep-40,1.15
3197,198,usual-sweep-40,,usual-sweep-40,1.15
3198,199,usual-sweep-40,,usual-sweep-40,1.15


### How does the generated text change between them? (Perplexity)

Leaving this for now. I think focusing on the eval methods would be better. 

## Effect of eval decoding methods

### Mean and std

In [None]:
res_df.groupby(['dataset_name','decode_method_eval'])[['any_adv_example_proportion-test', 'is_adv_example-mean-test', 'n_pp-test']].agg(['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,any_adv_example_proportion-test,any_adv_example_proportion-test,is_adv_example-mean-test,is_adv_example-mean-test,n_pp-test,n_pp-test
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
dataset_name,decode_method_eval,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
RT,Sample,0.381616,0.057681,0.084857,0.048018,30.721448,6.827976
RT,Beam search,0.613742,0.214276,0.21583,0.149223,47.731198,0.141011
RT,Diverse beam search (6 beam groups),0.462396,0.094248,0.090053,0.032808,45.902507,0.849251
RT,Diverse beam search (48 beam groups),0.451718,0.116878,0.076136,0.030877,42.221913,1.341338
FP,Sample,0.643606,0.216216,0.425788,0.169798,15.403564,2.176364
FP,Beam search,0.748428,0.214501,0.411074,0.167591,47.016771,0.425199
FP,Diverse beam search (6 beam groups),0.791405,0.031426,0.343331,0.032811,44.52935,0.443268
FP,Diverse beam search (48 beam groups),0.512579,0.261646,0.179919,0.13758,43.604822,1.567214


In [None]:
res_df['success_percent'] = res_df['any_adv_example_proportion-test']* 100

In [None]:
# Box plot of attack success rate per dataset, one box per eval decoding method.
fig = px.box(res_df, x= 'dataset_name', y='success_percent', color='decode_method_eval', range_y=[0,100],
            labels = {'success_percent': "Attack Success %"})
fig.update_layout(xaxis_title=None)  # the dataset tick labels need no axis title

fig.update_layout(
    font_size=18,
    # Horizontal legend placed just above the plotting area.
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0, font_size=15),
    legend_title_text=None,
    # `titlefont` is deprecated in plotly; `title_font` addresses xaxis.title.font.
    xaxis=dict(title_font=dict(size=18)),
)
# Tight margins so the exported PDF has no padding around the figure.
fig.update_layout(margin_l=5, margin_t=5, margin_b=7, margin_r=5)

fig.write_image("./results/eval_performance.pdf",  width=1.5*450, height=0.75*600, engine='kaleido')
fig.show()

Observations 
* Lots of variance on these runs, some more than others 
* Beam search performed the best on each dataset
* Many runs just "fail" for some seeds (lots of variability) 

In [None]:
px.box(res_df, x= 'dataset_name', y='n_pp-test', color='decode_method_eval')

Sample generates far fewer unique paraphrases than the other methods. Surprisingly, the high-diversity beam search also loses some efficiency. Beam search gets the most "attempts" at finding a solution. 

In [None]:
px.box(res_df, x= 'dataset_name', y='is_adv_example-mean-test', color='decode_method_eval')

High diversity beam search had a low chance of actually generating an adversarial example. Beam search did the best in this area. Sample did very well for the financial dataset, but overall performance was low because it didn't generate enough adversarial examples. 

That is indicated in this graph. Beam search clearly generates the highest quantity of examples. 

In [None]:
px.box(res_df, x= 'dataset_name', y='avg_num_successes', color='decode_method_eval')

### N-grams during training

When calculating eval metrics...these are some big dataframes. Let's use one as a sample. 
We can't use test set because that is only for start and end epoch, so let's try valid. Can also use train but valid is smaller.

In [None]:
def get_num_unigue_ngrams(df, n=1): 
    """Return how many distinct word n-grams occur in the ``pp`` column of ``df``.

    Each non-missing entry of ``df['pp']`` is split on whitespace and turned
    into n-grams of length ``n``; missing entries contribute nothing. The
    result is the size of the union of all those n-grams.
    """
    distinct = set()
    for sentence in df['pp']:
        if pd.isna(sentence):
            continue
        distinct.update(ngrams(sentence.split(), n))
    return len(distinct)

In [None]:
# For every run, count distinct bigrams in its train.csv:
# once over the whole file and once per epoch.
cfg = Config()
results_d = dict()
# The `for` keyword was missing on this line, which made the whole cell a SyntaxError.
for run_name in res_df['name'].tolist():
    print(run_name)
    count_d = dict()
    path_run = f"{cfg.path_checkpoints}{run_name}/"
    fname = f"{path_run}train.csv"
    df_training_step = pd.read_csv(fname)
    for n in [2]: 
        count_d[f"{n}gram_total"] =  get_num_unigue_ngrams(df_training_step, n=n)
        count_d[f"{n}gram_series"]  = df_training_step.groupby(['epoch']).apply(get_num_unigue_ngrams,  n=n)
    results_d[run_name] = count_d

In [None]:
import pickle

# Reload the cached bigram counts instead of recomputing them.
# The cache was written from `results_d` with:
#   with open('./results/ngram_eval.pkl', 'wb') as handle:
#       pickle.dump(results_d, handle, protocol=pickle.HIGHEST_PROTOCOL)
# NOTE: pickle.load executes arbitrary code; only load files produced by this notebook.
with open('./results/ngram_eval.pkl', 'rb') as cache_file:
    results_d = pickle.load(cache_file)

In [None]:
# Gather each run's per-epoch bigram counts as a column named after the run.
bigram_series = []
for run_name, count_d in results_d.items():
    series = count_d['2gram_series']
    series.name = run_name
    bigram_series.append(series)

# Wide (epoch x run) -> long (epoch, run_name, count), then attach run metadata.
df1 = pd.concat(bigram_series, axis=1).reset_index()
df_long = (
    df1.melt(id_vars='epoch', var_name='run_name', value_name='num_distinct_ngrams')
       .merge(res_df[['name', 'dataset_name',"decode_method_eval"]],
              how='left', left_on='run_name', right_on='name')
)

# Average over runs sharing dataset, epoch and eval decoding method; drop groups
# with no value and keep the first 100 epochs only.
df_agg = (
    df_long.groupby(['dataset_name','epoch', "decode_method_eval"])['num_distinct_ngrams']
           .mean()
           .reset_index()
           .dropna()
           .query("epoch <=100")
)

In [None]:
df_agg = replace_names(df_agg)

In [None]:
# Distinct bigrams per epoch, one line per eval decoding method, one facet row per dataset.
fig = px.line(data_frame=df_agg, x='epoch', y='num_distinct_ngrams', color="decode_method_eval", facet_row='dataset_name', range_y=[0,28000],
       labels={"epoch": "Epoch", "num_distinct_ngrams": "# Distinct Bigrams", "dataset_name": "Dataset"})
# Keep only the part of each facet annotation after the "=" sign.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(
    # Horizontal legend placed just above the plotting area.
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0, font_size=15),
    legend_title_text=None,
    # `titlefont` is deprecated in plotly; `title_font` addresses xaxis.title.font.
    xaxis=dict(title_font=dict(size=18)),
)

# Blank the per-facet y titles and add a single rotated label in paper coordinates.
fig.for_each_yaxis(lambda y: y.update(title = ''))
fig.add_annotation(x=-0.065,y=0.41,
                   text="# Distinct Bigrams", textangle=-90,
                    xref="paper", yref="paper", font_size=18)
fig.update_traces(line=dict(width=3))

fig.update_annotations(font_size=18)

# Tight margins so the exported PDF has no padding around the figure.
fig.update_layout(font_size=14,
                  margin_l=5, margin_t=5, margin_b=7, margin_r=5)

fig.write_image("./results/eval_ngrams.pdf",  width=1.5*450, height=0.75*800, engine='kaleido')
fig.show()

In [None]:
fig.layout

Layout({
    'annotations': [{'font': {},
                     'showarrow': False,
                     'text': 'Rotten Tomatoes',
                     'x': 0.245,
                     'xanchor': 'center',
                     'xref': 'paper',
                     'y': 1.0,
                     'yanchor': 'bottom',
                     'yref': 'paper'},
                    {'font': {},
                     'showarrow': False,
                     'text': 'Financial PhraseBank',
                     'x': 0.755,
                     'xanchor': 'center',
                     'xref': 'paper',
                     'y': 1.0,
                     'yanchor': 'bottom',
                     'yref': 'paper'}],
    'legend': {'title': {}, 'tracegroupgap': 0, 'x': 0.99, 'xanchor': 'right', 'y': 0.99, 'yanchor': 'top'},
    'margin': {'t': 60},
    'template': '...',
    'xaxis': {'anchor': 'y', 'domain': [0.0, 0.49], 'title': {'text': 'Epoch'}},
    'xaxis2': {'anchor': 'y2', 'domain': [0.51, 1.0],

### Adv example quality + diversity 

We have established that beam search finds the most adv examples and has the highest success rate. But what do these examples look like?
Would a real-world adversary pick a different method which produces fewer but higher-quality examples?
How do you measure quality in this area?

Metrics: 
* how diverse or interesting are the examples? 
* are they local or global? Does the adversary find common phrases that trip up the classifier? What are these phrases?
* How fluent are they? 


#### Diversity 

Cluster sentences and see how many clusters you get. 
Higher number of clusters indicate more diversity. 

In [None]:
import hdbscan, umap, swifter
from travis_attack.models import _prepare_sts_model

In [None]:
def get_clusters(x): 
    """Assign a cluster label to every paraphrase in ``x`` and return the labelled frame.

    Paraphrases (column ``pp``) are embedded with the module-level ``sts_model``,
    reduced with UMAP and clustered with HDBSCAN; the labels returned by
    ``fit_predict`` are stored in a new ``cluster`` column. Groups of four rows
    or fewer skip the pipeline entirely and get the single label ``1``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``pp`` column; intended for use with ``groupby(...).apply``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with an added ``cluster`` column. The input is not
        modified, because mutating the object handed over by ``groupby.apply``
        is not supported by pandas.
    """
    x = x.copy()
    # Check the size first: tiny groups previously paid for an embedding pass
    # whose result was thrown away.
    if len(x) <= 4: 
        x['cluster'] = [1] * len(x)
        return x
    embeddings = sts_model.encode(x['pp'].tolist())
    # Reduce to 6 dimensions before density clustering; fixed random_state for repeatability.
    clusterable_embedding = umap.UMAP(
        n_neighbors=15,
        min_dist=0.0,
        n_components=6,
        random_state=1000,
    ).fit_transform(embeddings)

    labels = hdbscan.HDBSCAN(
        min_samples=None,
        min_cluster_size=10,
        allow_single_cluster=True
    ).fit_predict(clusterable_embedding)
    x['cluster'] = labels
    return x

In [None]:
cfg = Config()
sts_model = _prepare_sts_model(cfg)

In [None]:
# Take the two runs with the highest test success rate for each
# (dataset, eval decoding method) pair and load their test-set paraphrases.
df_topfew = res_df.groupby(['dataset_name', 'decode_method_eval']).apply(lambda x: x.nlargest(2, "any_adv_example_proportion-test"))
df_l = list()
for run_name in df_topfew['name'].tolist():
    print(run_name)
    path_run = f"{cfg.path_checkpoints}{run_name}/"
    fname = f"{path_run}test.csv"
    df= pd.read_csv(fname)
    # Rows at epoch 0 are labelled "untrained"; only the "trained" rows are kept.
    df['model'] = ["untrained" if o == 0 else "trained" for o in df['epoch']]
    # .copy() so the column assignment below writes to an independent frame,
    # not to a slice of the frame that query() filtered.
    df = df.query("model=='trained'").copy()
    df['name'] = run_name
    df_l.append(df)

deep-sweep-5
earthy-sweep-1
zany-surf-809
flowing-sweep-10
dashing-sweep-17
sandy-sweep-16
ancient-sweep-21
woven-sweep-20
morning-sweep-29
apricot-sweep-28
wild-sweep-33
polished-sweep-31
comfy-sweep-36
polar-sweep-39
copper-yogurt-806
rare-sweep-43


In [None]:
# Stack the per-run test frames and attach each run's dataset and decoding method.
run_labels = res_df[['name', 'dataset_name', 'decode_method_eval' ]]
df_all = pd.concat(df_l).merge(run_labels, how='left', on='name')

In [None]:
df_all2 = df_all.groupby(["dataset_name", "idx"]).apply(get_clusters)

In [None]:
# Reload the cached clustering result instead of recomputing it.
# The cache was written with: df_all2.to_csv(fname)
# (A stray bare `df_all2` expression was removed: it displayed nothing and made
# this cell fail unless the expensive clustering cell had been run first.)
fname = './results/diversity_eval.csv'
df_all2 = pd.read_csv(fname)
df_all2 = replace_names(df_all2)

In [None]:
# Number of distinct cluster labels per (dataset, original example, decoding method).
df_n_clusters = (
    df_all2
    .groupby(['dataset_name','idx','decode_method_eval', ])['cluster']
    .agg(lambda labels: len(set(labels)))
    .reset_index()
    .dropna()
    .rename(columns={'cluster': 'n_clusters'})
)

In [None]:
# Distribution of the number of clusters per original example, by dataset and decoding method.
fig = px.violin(data_frame=df_n_clusters, x='dataset_name', y='n_clusters', color="decode_method_eval", 
            labels={"n_clusters": "# Clusters", "dataset_name": "Dataset"})
fig.update_layout(xaxis_title=None)  # the dataset tick labels need no axis title

# The layout was previously applied twice with overlapping settings; this single
# call produces the same final figure.
fig.update_layout(
    font_size=18,
    # Horizontal legend placed just above the plotting area.
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0, font_size=15),
    legend_title_text=None,
    # `titlefont` is deprecated in plotly; `title_font` addresses <axis>.title.font.
    xaxis=dict(title_font=dict(size=18)),
    yaxis=dict(title_font=dict(size=18)),
    # Tight margins so the exported PDF has no padding around the figure.
    margin_l=5, margin_t=5, margin_b=5, margin_r=5,
)
fig.update_annotations(font_size=15)

fig.write_image("./results/eval_diversity.pdf",  width=1.5*450, height=0.75*600, engine='kaleido')

fig.show()

In [None]:
px.histogram(df_n_clusters, x="n_clusters", color="decode_method_eval", facet_row = 'dataset_name')


#### Common phrases 

LEAVE FOR NOW 
* what % of examples have a common phrase? 
* what phrases are they?


#### Fluency 

Perplexity is the only fluency measurement I can think of here. 

In [None]:
from evaluate import load

In [None]:
# Score every paraphrase with the `perplexity` metric, using GPT-Neo 1.3B on the GPU,
# and store the per-sentence values next to the paraphrases.
perplexity = load("perplexity", module_type="metric")

predictions = df_all2['pp'].tolist()
results = perplexity.compute(
    predictions=predictions,
    model_id='EleutherAI/gpt-neo-1.3B',
    device='cuda',
)
df_all2['perplexity'] = results['perplexities']

In [None]:
# Reload the cached per-paraphrase perplexities instead of recomputing them.
# The cache was written with: df_all2.to_csv(fname)
# Labels are normalised in the next cell rather than with replace_names().
fname = './results/fluency_eval.csv'
df_all2 = pd.read_csv(fname)

In [None]:
# Normalise the labels stored in the fluency cache to the short display labels
# and fix the category order. Columns are reassigned explicitly:
# `df[col].replace(..., inplace=True)` updates a temporary Series and does not
# reliably write back to the frame (it never does under pandas copy-on-write).
dataset_labels = {"Rotten Tomatoes": "RT", "Financial PhraseBank": "FP"}
decode_labels = {"sample": "Sample", "Beam Search": "Beam search",
                 "Diverse Beam Search (6 beams)": "Diverse beam search (6 beam groups)",
                 "Diverse Beam Search (48 beams)": "Diverse beam search (48 beam groups)"}
for col, labels in (('dataset_name', dataset_labels), ('decode_method_eval', decode_labels)):
    # Values that are not keys of the mapping pass through unchanged.
    renamed = df_all2[col].astype(object).map(lambda v, labels=labels: labels.get(v, v))
    # dict order doubles as the category (display) order
    df_all2[col] = pd.Categorical(renamed, categories=list(labels.values()), ordered=False)

In [None]:
# Median perplexity per (dataset, original example, decoding method).
avg_perplexity_per_idx = (
    df_all2
    .groupby(['dataset_name', 'idx', 'decode_method_eval'])['perplexity']
    .median()
    .reset_index()
    .dropna()
    .rename(columns={'perplexity': 'median_perplexity'})
)

In [None]:
# Box plot of median perplexity per original example, by dataset and decoding method.
# The y range is clipped to [0, 200].
fig = px.box(data_frame=avg_perplexity_per_idx, x='dataset_name', y='median_perplexity', color="decode_method_eval", range_y=[0, 200],
            labels={"dataset_name": "Dataset", "median_perplexity": "Median Perplexity"})
fig.update_layout(xaxis_title=None)  # the dataset tick labels need no axis title

fig.update_layout(
    font_size=18,
    # Horizontal legend placed just above the plotting area.
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0, font_size=15),
    legend_title_text=None,
    # `titlefont` is deprecated in plotly; `title_font` addresses <axis>.title.font.
    xaxis=dict(title_font=dict(size=18)),
    yaxis=dict(title_font=dict(size=18)),
    # Tight margins so the exported PDF has no padding around the figure.
    margin_l=5, margin_t=5, margin_b=5, margin_r=5,
)
fig.update_annotations(font_size=15)

fig.write_image("./results/eval_perplexity.pdf",  width=1.5*450, height=0.75*600, engine='kaleido')
fig.show()

ValueError: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido 


In [None]:
import plotly.io as pio
# Scratch export sized as 1.5 x 0.75 inches at 300 px per inch.
# The engine is named explicitly, as in the other export cells, so plotly does not
# fall back to an `orca` binary (here /usr/bin/orca is an unrelated screen reader).
pio.write_image(fig, "test.pdf", width=1.5*300, height=0.75*300, engine="kaleido")

ValueError: 
The orca executable is required in order to export figures as static images,
but the executable that was found at '/usr/bin/orca'
does not seem to be a valid plotly orca executable. Please refer to the end of
this message for details on what went wrong.

If you haven't installed orca yet, you can do so using conda as follows:

    $ conda install -c plotly plotly-orca

Alternatively, see other installation methods in the orca project README at
https://github.com/plotly/orca

After installation is complete, no further configuration should be needed.

If you have installed orca, then for some reason plotly.py was unable to
locate it. In this case, set the `plotly.io.orca.config.executable`
property to the full path of your orca executable. For example:

    >>> plotly.io.orca.config.executable = '/path/to/orca'

After updating this executable property, try the export operation again.
If it is successful then you may want to save this configuration so that it
will be applied automatically in future sessions. You can do this as follows:

    >>> plotly.io.orca.config.save()

If you're still having trouble, feel free to ask for help on the forums at
https://community.plot.ly/c/api/python

The error encountered is that unexpected output was returned by the command
    $ /usr/bin/xvfb-run --auto-servernum --server-args -screen 0 640x480x24 +extension RANDR +extension GLX /usr/bin/orca --help

b"\n(process:7157): dconf-WARNING **: 12:12:14.460: unable to open file '/run/user/52339/dconf-service/keyfile/user': Failed to open file ?/run/user/52339/dconf-service/keyfile/user?: open() failed: No such file or directory; expect degraded performance\n/usr/lib64/python2.7/site-packages/orca/orca.py:52: PyGIWarning: Gtk was imported without specifying a version first. Use gi.require_version('Gtk', '3.0') before import to ensure that the right version gets loaded.\n  from gi.repository import Gtk\n/usr/lib64/python2.7/site-packages/orca/mouse_review.py:31: PyGIWarning: Wnck was imported without specifying a version first. Use gi.require_version('Wnck', '3.0') before import to ensure that the right version gets loaded.\n  from gi.repository import Wnck\nusage: orca [-h] [-v] [-r] [-t] [-l] [-e OPTION] [-d OPTION] [-p NAME]\n            [-u DIR] [--debug-file FILE] [--debug]\n\norca - scriptable screen reader\n\noptional arguments:\n  -h, --help                   show this help message and exit\n  -v, --version                3.6.3\n  -r, --replace                Replace a currently running Orca\n  -t, --text-setup             Set up user preferences (text version)\n  -l, --list-apps              Print the known running applications\n  -e OPTION, --enable OPTION   Force use of option\n  -d OPTION, --disable OPTION  Prevent use of option\n  -p NAME, --profile NAME      Load profile\n  -u DIR, --user-prefs DIR     Use alternate directory for user preferences\n  --debug-file FILE            Send debug output to the specified file\n  --debug                      Send debug output to debug-YYYY-MM-DD-\n                               HH:MM:SS.out\n\nReport bugs to orca-list@gnome.org.\n"


In [None]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[K     |████████████████████████████████| 79.9 MB 61.0 MB/s eta 0:00:01             | 1.8 MB 563 kB/s eta 0:02:19kB/s eta 0:00:25
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1
