In [1]:
import pandas as pd
import json
import pandasql as ps
import data_manipulation_functions as data_manip
import eval_grouping_queries as queries

In [2]:
# Directory that holds the entailment result CSVs.
result_dir = "data/entailment_result_data/"

# One result file per category (material, physical, social).
data_files = [
    "material_entail_perf_2_10.csv",
    "physical_entail_perf_2_10.csv",
    "social_entail_perf_2_10.csv",
]

# Directory that holds the JSON config files.
config_dir = "data/truism_data/"

# Config files, listed in the same category order as data_files so that
# the i-th config can be paired with the i-th result file.
config_files = [
    "material_data_2.json",
    "physical_data_2.json",
    "social_data_2.json",
]

In [3]:
# Read every result CSV into a DataFrame, preserving the order of data_files.
data = [pd.read_csv(result_dir + file_name) for file_name in data_files]

# Parse every JSON config, preserving the order of config_files so that
# configs[i] lines up with data[i].
configs = []
for file_name in config_files:
    with open(config_dir + file_name) as config_handle:
        configs.append(json.load(config_handle))

In [4]:
# Preview the first rows of the first result file to check its columns.
data[0].head()

Unnamed: 0,set_number,perturbation,asym_perturbs,avg_accuracy_score
0,0,original,original,0.0
1,0,original,asymmetric_premise,0.0
2,0,original,asymmetric_conclusion,0.0
3,0,negation,original,0.0
4,0,negation,asymmetric_premise,0.0


In [5]:
# Column order later passed to create_table as columns_in_order.
# NOTE(review): 24 is presumably the number of distinct perturbation
# combinations and 1 presumably an option selector (cf. option=1 in the
# transform calls) — both are defined by get_perturbation_order; TODO confirm.
perturbation_order = data_manip.get_perturbation_order(data[0], 24, 1)

In [6]:
# Column-oriented accumulator: every key becomes a DataFrame column and the
# helper below is handed the dict once per result file and returns the updated
# version. The key "asymetric_operator" keeps its existing spelling because the
# column appears under that exact name in the downstream query output.
accuracy_sep_dict = {
    "template": [],
    "set_number": [],
    "linguistic_operator": [],
    "asymetric_operator": [],
    "stat": [],
}

# Running offset handed to the helper as last_num.
last_num = 0
for i, d in enumerate(data):
    accuracy_sep_dict = data_manip.transform_results_to_seperate_sql_dict(
        d,
        configs[i],
        accuracy_sep_dict,
        stat_name="avg_accuracy_score",
        last_num=last_num,
        option=1,
    )
    # Fix: advance the offset after each file, exactly as the merged-dict loop
    # does. Previously last_num stayed 0 for every file, so all three files were
    # processed with the same offset (presumably producing colliding set
    # numbers; 20 is presumably the number of sets per file — confirm).
    last_num += 20

In [7]:
# Column-oriented accumulator for the merged-perturbation view; each key
# starts with its own empty list and becomes a DataFrame column later.
accuracy_merged_dict = {
    column: [] for column in ("template", "set_number", "perturbation", "stat")
}

# Offset handed to the helper as last_num; bumped by 20 after every file.
last_num = 0
for file_index, result_frame in enumerate(data):
    accuracy_merged_dict = data_manip.transform_results_to_merged_sql_dict(
        result_frame,
        configs[file_index],
        accuracy_merged_dict,
        stat_name="avg_accuracy_score",
        last_num=last_num,
        option=1,
    )
    last_num += 20

In [8]:
# Turn both accumulators into DataFrames; dict keys become the columns.
sep_df = pd.DataFrame(accuracy_sep_dict)
merged_df = pd.DataFrame(accuracy_merged_dict)

In [9]:
# Preview the first rows of the separated-operator frame.
sep_df.head()

Unnamed: 0,template,set_number,linguistic_operator,asymetric_operator,stat
0,1,0,original,original,0.0
1,1,0,original,asymmetric_premise,0.0
2,1,0,original,asymmetric_conclusion,0.0
3,1,0,negation,original,0.0
4,1,0,negation,asymmetric_premise,0.0


In [10]:
# Aggregate sep_df with the project's grouping query. Judging by the displayed
# result, it yields one avg_stat per (template, linguistic_operator,
# asymetric_operator); the query itself lives in eval_grouping_queries.
grouped_template_sep_operator_accuracy_data = queries.run_template_seperate_operator_grouping_query(sep_df)

In [11]:
# Display the per-template, separated-operator aggregation.
grouped_template_sep_operator_accuracy_data

Unnamed: 0,template,linguistic_operator,asymetric_operator,avg_stat
0,1,antonym,asymmetric_conclusion,0.000000
1,1,antonym,asymmetric_premise,0.000000
2,1,antonym,original,0.000000
3,1,negation,asymmetric_conclusion,0.000000
4,1,negation,asymmetric_premise,0.000000
...,...,...,...,...
91,4,paraphrase,asymmetric_premise,0.076923
92,4,paraphrase,original,0.073077
93,4,paraphrase_inversion,asymmetric_conclusion,0.015385
94,4,paraphrase_inversion,asymmetric_premise,0.019231


In [12]:
# Aggregate merged_df with the project's grouping query. Judging by the
# displayed result, it yields one avg_stat per (template, perturbation); the
# query itself lives in eval_grouping_queries.
grouped_template_merged_operator_accuracy_data = queries.run_template_merged_operator_grouping_query(merged_df)

In [13]:
# Display the per-template, merged-perturbation aggregation.
grouped_template_merged_operator_accuracy_data

Unnamed: 0,template,perturbation,avg_stat
0,1,antonym-asymmetric_conclusion,0.000000
1,1,antonym-asymmetric_premise,0.000000
2,1,antonym-original,0.000000
3,1,negation-asymmetric_conclusion,0.000000
4,1,negation-asymmetric_premise,0.000000
...,...,...,...
91,4,paraphrase-asymmetric_premise,0.076923
92,4,paraphrase-original,0.073077
93,4,paraphrase_inversion-asymmetric_conclusion,0.015385
94,4,paraphrase_inversion-asymmetric_premise,0.019231


In [23]:
# Aggregate merged_df with the project's grouping query. Judging by the
# displayed result, it yields one avg_stat per (set_number, perturbation); the
# query itself lives in eval_grouping_queries.
grouped_set_merged_operator_accuracy_data = queries.run_set_merged_operator_grouping_query(merged_df)

In [24]:
# Display the per-set, merged-perturbation aggregation.
grouped_set_merged_operator_accuracy_data

Unnamed: 0,set_number,perturbation,avg_stat
0,0,antonym-asymmetric_conclusion,0.0
1,0,antonym-asymmetric_premise,0.0
2,0,antonym-original,0.0
3,0,negation-asymmetric_conclusion,0.0
4,0,negation-asymmetric_premise,0.0
...,...,...,...
1435,59,paraphrase-asymmetric_premise,0.0
1436,59,paraphrase-original,0.0
1437,59,paraphrase_inversion-asymmetric_conclusion,0.0
1438,59,paraphrase_inversion-asymmetric_premise,0.0


In [25]:
# Aggregate merged_df with the project's grouping query. Judging by the
# displayed result, it yields avg_stat and a row count per template; the query
# itself lives in eval_grouping_queries.
grouped_template_accuracy_data = queries.run_template_grouping_query(merged_df)

In [26]:
# Display the per-template aggregation.
grouped_template_accuracy_data

Unnamed: 0,template,avg_stat,count
0,1,0.003958,480
1,2,0.019583,240
2,3,0.164583,96
3,4,0.071154,624


In [27]:
# Aggregate merged_df with the project's grouping query. Judging by the
# displayed result, it yields avg_stat and a row count per perturbation; the
# query itself lives in eval_grouping_queries.
grouped_merged_operator_accuracy_data = queries.run_perturbation_grouping_query(merged_df)

In [28]:
# Display the per-perturbation aggregation.
grouped_merged_operator_accuracy_data

Unnamed: 0,perturbation,avg_stat,count
0,original-asymmetric_conclusion,0.046667,60
1,original-asymmetric_premise,0.048333,60
2,original-original,0.065,60
3,negation-asymmetric_conclusion,0.06,60
4,negation-asymmetric_premise,0.066667,60
5,negation-original,0.088333,60
6,negation_antonym-asymmetric_conclusion,0.058333,60
7,negation_antonym-asymmetric_premise,0.06,60
8,negation_antonym-original,0.045,60
9,negation_paraphrase-asymmetric_conclusion,0.075,60


In [29]:
def create_table(grouped_numbers, columns_in_order, column_name, stat):
    """Spread the rows of a grouped frame into one column per label.

    Args:
        grouped_numbers: DataFrame containing the ``column_name`` and ``stat``
            columns.
        columns_in_order: Labels that become the output columns, in this order.
        column_name: Column of ``grouped_numbers`` whose value picks the output
            column each row contributes to.
        stat: Column of ``grouped_numbers`` whose value is appended to the
            picked output column.

    Returns:
        DataFrame with one column per entry of ``columns_in_order``; each
        column lists the ``stat`` values of its matching rows, in the row order
        of ``grouped_numbers``.

    Raises:
        KeyError: If a row carries a label that is not in ``columns_in_order``.
    """
    # Start every requested column with its own empty list.
    buckets = {label: [] for label in columns_in_order}

    # Route each row's statistic into the bucket named by its label.
    for _, record in grouped_numbers.iterrows():
        label = record[column_name]
        buckets[label].append(record[stat])

    return pd.DataFrame(buckets)

In [30]:
# Pivot the per-template aggregation: one column per perturbation, ordered by
# perturbation_order.
template_table = create_table(
    grouped_numbers=grouped_template_merged_operator_accuracy_data,
    columns_in_order=perturbation_order,
    column_name="perturbation",
    stat="avg_stat",
)


In [31]:
# Display the pivoted per-template table.
template_table

Unnamed: 0,original-original,original-asymmetric_premise,original-asymmetric_conclusion,negation-original,negation-asymmetric_premise,negation-asymmetric_conclusion,antonym-original,antonym-asymmetric_premise,antonym-asymmetric_conclusion,paraphrase-original,...,paraphrase_inversion-asymmetric_conclusion,negation_antonym-original,negation_antonym-asymmetric_premise,negation_antonym-asymmetric_conclusion,negation_paraphrase-original,negation_paraphrase-asymmetric_premise,negation_paraphrase-asymmetric_conclusion,negation_paraphrase_inversion-original,negation_paraphrase_inversion-asymmetric_premise,negation_paraphrase_inversion-asymmetric_conclusion
0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.025,0.04,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.08,0.07,0.12,0.11,0.09,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.25,0.0,0.0,0.25,0.25,0.25,0.2,0.25,0.25,0.25,...,0.225,0.0,0.25,0.25,0.15,0.2,0.25,0.0,0.25,0.225
3,0.111538,0.080769,0.080769,0.096154,0.073077,0.065385,0.069231,0.05,0.038462,0.073077,...,0.015385,0.103846,0.080769,0.065385,0.1,0.123077,0.134615,0.057692,0.038462,0.038462


In [32]:
# Pivot the per-set aggregation: one column per perturbation, ordered by
# perturbation_order.
set_table = create_table(
    grouped_numbers=grouped_set_merged_operator_accuracy_data,
    columns_in_order=perturbation_order,
    column_name="perturbation",
    stat="avg_stat",
)


In [33]:
# Display of set_table is disabled; uncomment the next line to inspect it.
# set_table

In [34]:
# Persist both pivoted tables as CSV files.
template_table.to_csv("data/analyzed_entailment_data/template_acc_table.csv")
set_table.to_csv("data/analyzed_entailment_data/set_acc_table.csv")