In [1]:
import pandas as pd
import json
import pandasql as ps
import exploration_utils as data_manip
import eval_grouping_queries as queries

In [2]:
# Location of the generation-result CSVs and the files read from it.
result_dir = "../data/generation_result_data/"
data_files = [
    "material_perf_10.csv",
    "physical_perf_10.csv",
    "social_perf_10.csv",
]

# Location of the truism configuration JSONs. The later transform loops pair
# the i-th loaded config with the i-th loaded results frame, so this list must
# stay in the same order as data_files.
config_dir = "../data/truism_data/"
config_files = [
    "material_data_2.json",
    "physical_data_2.json",
    "social_data_2.json",
]

In [3]:
# One results DataFrame per entry of data_files, in the same order.
data = [pd.read_csv(result_dir + file_name) for file_name in data_files]

# One parsed configuration per entry of config_files, in the same order.
configs = []
for file_name in config_files:
    # JSON text is UTF-8 by specification; pass the encoding explicitly so the
    # result does not depend on the platform's default locale encoding.
    with open(config_dir + file_name, encoding="utf-8") as config_file:
        configs.append(json.load(config_file))

In [4]:
# Preview the first results frame (loaded from data_files[0]) to check its columns.
data[0].head()

Unnamed: 0,truism_number,perturbation,premise,avg_binary_score,avg_ratio_score
0,0,original,original,1.0,0.016736
1,0,original,asymmetric_premise,0.0,-0.010222
2,0,original,asymmetric_conclusion,0.0,-0.010865
3,0,negation,original,0.0,-0.019636
4,0,negation,asymmetric_premise,1.0,0.018121


In [5]:
# Ordered perturbation labels; passed to create_table below as the column order
# of the pivoted tables.
# NOTE(review): 24 presumably is the number of perturbation combinations
# (the outputs show 8 linguistic operators x 3 asymmetric variants) and 0 an
# option selector — confirm against exploration_utils.get_perturbation_order.
perturbation_order = data_manip.get_perturbation_order(data[0], 24, 0)

In [6]:
# Column-oriented accumulator for the "separate operator" view: one list per
# output column, with the linguistic and asymmetric operators kept apart.
accuracy_sep_dict = {
                      "template" : [],
                      "set_number" : [],
                      "linguistic_operator" : [],
                      "asymetric_operator" : [],
                      "stat" : []
                    }

# Offset handed to the transform as ``last_num``; advanced once per results file.
last_num = 0
for i, d in enumerate(data):
    accuracy_sep_dict = data_manip.transform_results_to_seperate_sql_dict(d,
                                                                          configs[i],
                                                                          accuracy_sep_dict,
                                                                          stat_name="avg_binary_score",
                                                                          last_num=last_num,
                                                                          option=0)
    # Advance the offset the same way the merged-dict loop does. Previously it
    # was never incremented, so every file was transformed with last_num=0.
    # NOTE(review): 20 is presumably the number of truisms per results file
    # (the grouped output shows set numbers 0-59 across three files) — confirm.
    last_num += 20

In [7]:
# Column-oriented accumulator for the "merged" view, where the perturbation is
# kept as a single label column.
accuracy_merged_dict = {
    column: [] for column in ("template", "set_number", "perturbation", "stat")
}

# Offset handed to the transform as ``last_num``; bumped by 20 after each file.
# NOTE(review): 20 is presumably the number of truisms per results file — confirm.
last_num = 0
for index, results in enumerate(data):
    accuracy_merged_dict = data_manip.transform_results_to_merged_sql_dict(
        results,
        configs[index],
        accuracy_merged_dict,
        stat_name="avg_binary_score",
        last_num=last_num,
        option=0,
    )
    last_num = last_num + 20

In [8]:
# Turn both column-oriented accumulators into DataFrames (dict keys become columns).
sep_df = pd.DataFrame(accuracy_sep_dict)
merged_df = pd.DataFrame(accuracy_merged_dict)

In [9]:
# Preview the separate-operator frame built from accuracy_sep_dict.
sep_df.head()

Unnamed: 0,template,set_number,linguistic_operator,asymetric_operator,stat
0,1,0,original,original,1.0
1,1,0,original,asymmetric_premise,0.0
2,1,0,original,asymmetric_conclusion,0.0
3,1,0,negation,original,0.0
4,1,0,negation,asymmetric_premise,1.0


In [10]:
# Run the template / separate-operator grouping query on sep_df.
# NOTE(review): judging by the displayed result columns this averages ``stat``
# per (template, linguistic_operator, asymetric_operator) — confirm in
# eval_grouping_queries.
grouped_template_sep_operator_accuracy_data = queries.run_template_seperate_operator_grouping_query(sep_df)

In [11]:
# Display the result of the template / separate-operator grouping query.
grouped_template_sep_operator_accuracy_data

Unnamed: 0,template,linguistic_operator,asymetric_operator,avg_stat
0,1,antonym,asymmetric_conclusion,0.535000
1,1,antonym,asymmetric_premise,0.520000
2,1,antonym,original,0.485000
3,1,negation,asymmetric_conclusion,0.510000
4,1,negation,asymmetric_premise,0.515000
...,...,...,...,...
91,4,paraphrase,asymmetric_premise,0.469231
92,4,paraphrase,original,0.542308
93,4,paraphrase_inversion,asymmetric_conclusion,0.450000
94,4,paraphrase_inversion,asymmetric_premise,0.453846


In [12]:
# Run the template / merged-operator grouping query on merged_df; this frame
# is later pivoted into template_table.
# NOTE(review): the displayed result columns suggest an average of ``stat`` per
# (template, perturbation) — confirm in eval_grouping_queries.
grouped_template_merged_operator_accuracy_data = queries.run_template_merged_operator_grouping_query(merged_df)

In [13]:
# Display the result of the template / merged-operator grouping query.
grouped_template_merged_operator_accuracy_data

Unnamed: 0,template,perturbation,avg_stat
0,1,antonym-asymmetric_conclusion,0.535000
1,1,antonym-asymmetric_premise,0.520000
2,1,antonym-original,0.485000
3,1,negation-asymmetric_conclusion,0.510000
4,1,negation-asymmetric_premise,0.515000
...,...,...,...
91,4,paraphrase-asymmetric_premise,0.469231
92,4,paraphrase-original,0.542308
93,4,paraphrase_inversion-asymmetric_conclusion,0.450000
94,4,paraphrase_inversion-asymmetric_premise,0.453846


In [14]:
# Run the set / merged-operator grouping query on merged_df; this frame is
# later pivoted into set_table.
# NOTE(review): the displayed result columns suggest an average of ``stat`` per
# (set_number, perturbation) — confirm in eval_grouping_queries.
grouped_set_merged_operator_accuracy_data = queries.run_set_merged_operator_grouping_query(merged_df)

In [15]:
# Display the result of the set / merged-operator grouping query.
grouped_set_merged_operator_accuracy_data

Unnamed: 0,set_number,perturbation,avg_stat
0,0,antonym-asymmetric_conclusion,1.0
1,0,antonym-asymmetric_premise,1.0
2,0,antonym-original,0.0
3,0,negation-asymmetric_conclusion,1.0
4,0,negation-asymmetric_premise,1.0
...,...,...,...
1435,59,paraphrase-asymmetric_premise,0.0
1436,59,paraphrase-original,1.0
1437,59,paraphrase_inversion-asymmetric_conclusion,1.0
1438,59,paraphrase_inversion-asymmetric_premise,1.0


In [16]:
# Run the per-template grouping query on merged_df.
# NOTE(review): the displayed result has one row per template with ``avg_stat``
# and ``count`` columns — confirm the aggregation in eval_grouping_queries.
grouped_template_accuracy_data = queries.run_template_grouping_query(merged_df)

In [17]:
# Display the result of the per-template grouping query.
grouped_template_accuracy_data

Unnamed: 0,template,avg_stat,count
0,1,0.49,480
1,2,0.474583,240
2,3,0.5,96
3,4,0.505288,624


In [18]:
# Run the per-perturbation grouping query on merged_df.
# NOTE(review): the displayed result has one row per perturbation label with
# ``avg_stat`` and ``count`` columns — confirm the aggregation in
# eval_grouping_queries.
grouped_merged_operator_accuracy_data = queries.run_perturbation_grouping_query(merged_df)

In [19]:
# Display the result of the per-perturbation grouping query.
grouped_merged_operator_accuracy_data

Unnamed: 0,perturbation,avg_stat,count
0,original-asymmetric_conclusion,0.493333,60
1,original-asymmetric_premise,0.503333,60
2,original-original,0.49,60
3,negation-asymmetric_conclusion,0.526667,60
4,negation-asymmetric_premise,0.535,60
5,negation-original,0.483333,60
6,negation_antonym-asymmetric_conclusion,0.461667,60
7,negation_antonym-asymmetric_premise,0.46,60
8,negation_antonym-original,0.541667,60
9,negation_paraphrase-asymmetric_conclusion,0.513333,60


In [20]:
def create_table(grouped_numbers, columns_in_order, column_name, stat):
    """Pivot a long grouped frame into a wide table with one column per key.

    Every row of ``grouped_numbers`` contributes its ``stat`` value to the
    output column named by that row's ``column_name`` value. Values are
    appended in row order, so output row ``i`` holds the ``i``-th occurrence
    of each key.

    Parameters
    ----------
    grouped_numbers : pandas.DataFrame
        Long-format frame containing at least the columns ``column_name``
        and ``stat``.
    columns_in_order : iterable
        Keys expected in ``grouped_numbers[column_name]``; also the column
        order of the returned frame.
    column_name : str
        Column whose values select the output column.
    stat : str
        Column whose values fill the table.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``columns_in_order``.

    Raises
    ------
    KeyError
        If ``column_name`` or ``stat`` is not a column of ``grouped_numbers``,
        or a row's key is not listed in ``columns_in_order``.
    ValueError
        Raised by pandas when the keys do not all occur the same number of
        times (the collected columns then have unequal lengths).
    """
    output = {column: [] for column in columns_in_order}

    # Walk the two relevant columns directly instead of using ``iterrows``:
    # ``iterrows`` builds a Series per row, which is slow and upcasts the row's
    # values to a common dtype (e.g. ints become floats in a numeric frame).
    for key, value in zip(grouped_numbers[column_name], grouped_numbers[stat]):
        output[key].append(value)

    return pd.DataFrame.from_dict(output)

In [21]:
# Pivot the (template, perturbation) averages into a wide table: one column per
# perturbation label, in perturbation_order.
template_table = create_table(grouped_template_merged_operator_accuracy_data, perturbation_order, "perturbation", "avg_stat")


In [22]:
# Display the pivoted table. The template id is not kept as a column: row i
# corresponds to the i-th template in the grouped frame's row order.
template_table

Unnamed: 0,original-original,original-asymmetric_premise,original-asymmetric_conclusion,negation-original,negation-asymmetric_premise,negation-asymmetric_conclusion,antonym-original,antonym-asymmetric_premise,antonym-asymmetric_conclusion,paraphrase-original,...,paraphrase_inversion-asymmetric_conclusion,negation_antonym-original,negation_antonym-asymmetric_premise,negation_antonym-asymmetric_conclusion,negation_paraphrase-original,negation_paraphrase-asymmetric_premise,negation_paraphrase-asymmetric_conclusion,negation_paraphrase_inversion-original,negation_paraphrase_inversion-asymmetric_premise,negation_paraphrase_inversion-asymmetric_conclusion
0,0.465,0.52,0.51,0.51,0.515,0.51,0.485,0.52,0.535,0.515,...,0.345,0.56,0.42,0.435,0.535,0.43,0.455,0.36,0.62,0.635
1,0.57,0.48,0.44,0.48,0.5,0.49,0.55,0.42,0.42,0.67,...,0.4,0.56,0.46,0.45,0.4,0.6,0.6,0.47,0.47,0.44
2,0.25,0.725,0.7,0.5,0.5,0.5,0.75,0.25,0.25,0.25,...,0.55,0.5,0.5,0.5,0.5,0.375,0.35,0.5,0.5,0.5
3,0.515385,0.465385,0.469231,0.461538,0.569231,0.557692,0.430769,0.573077,0.576923,0.542308,...,0.45,0.526923,0.484615,0.480769,0.45,0.561538,0.55,0.457692,0.542308,0.534615


In [23]:
# Pivot the (set_number, perturbation) averages into a wide table: one column
# per perturbation label, in perturbation_order.
set_table = create_table(grouped_set_merged_operator_accuracy_data, perturbation_order, "perturbation", "avg_stat")


In [24]:
# set_table

In [25]:
# Persist both pivoted tables as CSV.
# NOTE(review): these paths are relative to the working directory without the
# "../" prefix used by the input directories — confirm this is the intended
# output location.
template_table.to_csv("data/analyzed_generation_data/template_acc_table.csv")
set_table.to_csv("data/analyzed_generation_data/set_acc_table.csv")