In [None]:
import json
import sys
import os
from tqdm import notebook
from collections import Counter
import importlib
import pandas as pd
import tqdm

sys.path.append("")#path to code

from split_logic.grammar.sparql_parser import SPARQLParser
from split_logic.grammar.sql_parser import SQLParser
from split_logic.grammar import atom_and_compound_cache

importlib.reload(atom_and_compound_cache)

In [None]:
# Placeholder paths: both must be filled in before the notebook can run.
# PROJECT_PATH is joined with the 'dataset/...' sub-paths used in the cells below.
PROJECT_PATH = ''#path to project
# EXPERIEMENTS_DATAPATH is joined with FOLDER_NAME and FILE_NAME to locate a predictions file.
# NOTE(review): the constant name is misspelled ("EXPERIEMENTS"); it is kept because a later cell references it.
EXPERIEMENTS_DATAPATH = ''#path to experiements folder

In [None]:
# Build the cached atom/compound parser for the SPARQL (lcquad) queries.
SPARQL_DATASET_PATH = ''  # TODO: path to the whole SPARQL dataset (JSON list of samples)

# Use a context manager so the dataset file is closed once it has been read.
with open(SPARQL_DATASET_PATH, 'r') as sparql_dataset_file:
    sparql_dataset = json.load(sparql_dataset_file)

sparql_list = [sample["masked_query"] for sample in sparql_dataset]
sparql_parser = SPARQLParser(sparql_list)
# A single parser is registered under the 'wikidata' key.
parser_dict = {'wikidata': sparql_parser}
compound_path = os.path.join(PROJECT_PATH, 'dataset/lcquad/tmcd_split')
cached_sparql_parser = atom_and_compound_cache.AtomAndCompoundCache(
    parser_dict,
    query_key_name=None,
    kb_id_key_name=None,
    return_compound_list_flag=False,
    compound_cache_path=compound_path,
)

In [None]:
# Build the cached atom/compound parser for the SQL (wikisql) queries.
SQL_DATASET_PATH = ''  # TODO: path to the whole SQL dataset (JSON list of samples)

# The original call was `open()` without a path, which raises TypeError.
with open(SQL_DATASET_PATH, 'r') as sql_dataset_file:
    sql_dataset = json.load(sql_dataset_file)

db_attrs_path = os.path.join(PROJECT_PATH,
                             "dataset/wikisql/table_id2new_attrs_for_parsing.json")
with open(db_attrs_path, 'r') as db_attrs_file:
    db2attr_dict = json.load(db_attrs_file)

compound_path = os.path.join(PROJECT_PATH, 'dataset/wikisql/tmcd_split')

# One SQLParser per knowledge-base id, created the first time the id is seen.
parser_dict = dict()
for sample in tqdm.tqdm(sql_dataset, total=len(sql_dataset)):
    db_id = sample['kb_id']
    db_attributes = db2attr_dict[db_id]
    if db_id not in parser_dict:
        parser_dict[db_id] = SQLParser(db_attributes)

cached_sql_parser = atom_and_compound_cache.AtomAndCompoundCache(
    parser_dict,
    query_key_name=None,
    kb_id_key_name=None,
    return_compound_list_flag=False,
    compound_cache_path=compound_path,
)

In [None]:
# Parser and experiment whose predictions are analysed in the rest of the notebook.
DATASET_PARSER = cached_sparql_parser
FOLDER_NAME = 'tmcd_sparql_hirm_control_v5'
FILE_NAME = 'epoch_94_tm_0.85_t5_predictions.json'

# Files are opened with context managers so the handles are closed after loading.
with open(os.path.join(EXPERIEMENTS_DATAPATH, FOLDER_NAME, FILE_NAME), 'r') as predictions_fh:
    prediction_file = json.load(predictions_fh)
true_questions = prediction_file['input_questions']
predicted_queries = prediction_file['predicted_queries']
true_queries = prediction_file['true_queries']

DATASET_FOLDER_PATH = 'dataset/lcquad/tmcd_split'
TEST_FILE_NAME = 'english_test_split_coef_0.1.json'
TRAIN_FILE_NAME = 'english_train_split_coef_0.1.json'

with open(os.path.join(PROJECT_PATH, DATASET_FOLDER_PATH, TEST_FILE_NAME), 'r') as test_fh:
    test_set = json.load(test_fh)
with open(os.path.join(PROJECT_PATH, DATASET_FOLDER_PATH, TRAIN_FILE_NAME), 'r') as train_fh:
    train_set = json.load(train_fh)

# Predictions are matched to test samples by position, so both the length and
# every gold query must agree with the test split.
assert len(test_set) == len(true_queries), (
    f'test split has {len(test_set)} samples but the prediction file has {len(true_queries)} true queries'
)
for i in range(len(test_set)):
    assert test_set[i]['masked_query'] == true_queries[i], (
        f'gold query mismatch between test split and prediction file at index {i}'
    )

# Knowledge-base id of every test sample, in the same order as the predictions.
true_queries_kb_list = [test_sample['kb_id'] for test_sample in test_set]

### Train/test and prediction parsing

In [None]:
# Decompose every training query into compounds and atoms with the selected parser.
train_queries_components = []
for train_sample in notebook.tqdm(train_set, total=len(train_set)):
    masked_query = train_sample['masked_query']
    train_question = train_sample['question']
    knowledge_base_id = train_sample['kb_id']
    # Compounds are requested before atoms, as in the rest of the notebook.
    train_queries_components.append({
        'question': train_question,
        'query': masked_query,
        'compound': DATASET_PARSER.get_compounds(masked_query, knowledge_base_id),
        'atoms': DATASET_PARSER.get_atoms(masked_query, knowledge_base_id),
    })

In [None]:
# Decompose every gold test query and its prediction into compounds and atoms.
# Both result lists are index-aligned with the prediction file.
test_true_queries_components = []
test_pred_queries_components = []
aligned_samples = zip(true_questions, true_queries, predicted_queries, true_queries_kb_list)
for question, gold_query, predicted_query, kb_id in notebook.tqdm(aligned_samples, total=len(true_queries)):
    # Gold query first, then the prediction; compounds before atoms in both cases.
    gold_compounds = DATASET_PARSER.get_compounds(gold_query, kb_id)
    gold_atoms = DATASET_PARSER.get_atoms(gold_query, kb_id)

    predicted_compounds = DATASET_PARSER.get_compounds(predicted_query, kb_id)
    predicted_atoms = DATASET_PARSER.get_atoms(predicted_query, kb_id)

    test_true_queries_components.append(
        dict(question=question, query=gold_query, compound=gold_compounds, atoms=gold_atoms)
    )
    test_pred_queries_components.append(
        dict(question=question, query=predicted_query, compound=predicted_compounds, atoms=predicted_atoms)
    )

### Predicted compound accuracy

In [None]:
def compare_compounds(true_comp_d, pred_comp_d):
    """Return, per compound type, the share of gold compounds that were predicted.

    Args:
        true_comp_d: mapping from compound type to the list of gold compounds.
        pred_comp_d: mapping from compound type to the list of predicted compounds.
            A type missing from this mapping is treated as "nothing predicted".

    Returns:
        dict mapping every key of ``true_comp_d`` to a value in [0, 1].
        A type with no gold compounds scores 1.0.
    """
    compound_hits_dict = dict()
    for key in true_comp_d:
        true_compound_counter = Counter(true_comp_d[key])
        pred_compound_counter = Counter(pred_comp_d.get(key, []))
        total_hits = sum(true_compound_counter.values())

        # Multiset intersection: a compound predicted more often than it occurs in
        # the gold query is credited only as many times as it occurs there, so the
        # ratio below can never exceed 1.0.
        compound_match_hits = sum((true_compound_counter & pred_compound_counter).values())

        if total_hits == 0:
            compound_acc = 1.0
        else:
            compound_acc = compound_match_hits / total_hits

        compound_hits_dict[key] = compound_acc
    return compound_hits_dict

def compare_atoms(true_atoms, pred_atoms):
    """Return the share of distinct gold atoms that also occur in the prediction.

    Args:
        true_atoms: iterable of gold atoms (duplicates are ignored).
        pred_atoms: iterable of predicted atoms (duplicates are ignored).

    Returns:
        A float in [0, 1]; 1.0 when there are no gold atoms.
    """
    true_atoms_set = set(true_atoms)
    pred_atoms_set = set(pred_atoms)
    if not true_atoms_set:
        # Nothing to recover; mirrors compare_compounds for empty gold input.
        return 1.0
    # Normalise by the number of *distinct* gold atoms: the numerator is a set
    # intersection, so dividing by the raw list length would make 1.0 unreachable
    # whenever the gold list contains duplicates.
    return len(true_atoms_set.intersection(pred_atoms_set)) / len(true_atoms_set)

def test():
    """Smoke-check the accuracy helpers on the first test sample.

    Prints the atom accuracy, the predicted and gold compounds, and the
    per-type compound accuracy. Gold data is passed as the first argument,
    matching the (true, pred) signatures of the helpers.
    """
    first_true = test_true_queries_components[0]
    first_pred = test_pred_queries_components[0]
    print(compare_atoms(first_true['atoms'], first_pred['atoms']))
    print(first_pred['compound'])
    print(first_true['compound'])
    print(compare_compounds(first_true['compound'], first_pred['compound']))

test()

In [None]:
# Average the per-sample compound and atom accuracies over the whole test set.
compound_result_dict = dict.fromkeys(DATASET_PARSER.parsers_env_list, 0)
atom_acc = 0
n_test_samples = len(test_true_queries_components)

for gold_entry, predicted_entry in zip(test_true_queries_components, test_pred_queries_components):
    per_type_acc = compare_compounds(gold_entry['compound'], predicted_entry['compound'])
    for env_name in compound_result_dict:
        compound_result_dict[env_name] = compound_result_dict[env_name] + per_type_acc[env_name]

    atom_acc = atom_acc + compare_atoms(gold_entry['atoms'], predicted_entry['atoms'])

# "EM match prob" is the product of the rounded per-type mean accuracies.
em_match_prob = 1
for env_name in compound_result_dict:
    mean_type_acc = round(compound_result_dict[env_name] / n_test_samples, 3)
    compound_result_dict[env_name] = mean_type_acc
    em_match_prob = em_match_prob * mean_type_acc

em_match_prob = round(em_match_prob, 3)
atom_acc = round(atom_acc / n_test_samples, 3)

print('Compound accuracy: ', compound_result_dict)
print('Atom accuracy: ', atom_acc)
print('EM Match prob: ', em_match_prob)

### Syntax correctness of predicted compounds

In [None]:
def compare_syntactic_structure(true_comp_d, pred_comp_d):
    """Return the share of expected compound types that the prediction produced.

    A compound type is "expected" when the gold query has at least one compound
    of that type; it counts as produced when the prediction has at least one
    compound of the same type (the compounds themselves need not match).

    Args:
        true_comp_d: mapping from compound type to the list of gold compounds.
        pred_comp_d: mapping from compound type to the list of predicted compounds.
            A type missing from this mapping counts as not produced.

    Returns:
        A float in [0, 1]; 1.0 when the gold query expects no compound type.
    """
    expected_compounds = [key for key in true_comp_d if len(true_comp_d[key]) > 0]
    if not expected_compounds:
        # Nothing is expected, so nothing can be missing (and no division by zero).
        return 1.0

    # Check the *prediction* here: testing the gold dict again would always
    # succeed and pin the score to 1.0.
    produced_compounds = [key for key in expected_compounds if len(pred_comp_d.get(key, [])) > 0]
    return len(produced_compounds) / len(expected_compounds)
    

def test():
    """Smoke-check compare_syntactic_structure on the first test sample.

    Prints the predicted and gold compounds and the resulting syntax score.
    Gold compounds are passed first, matching the (true, pred) signature.
    """
    first_true = test_true_queries_components[0]
    first_pred = test_pred_queries_components[0]
    print(first_pred['compound'])
    print(first_true['compound'])
    print(compare_syntactic_structure(first_true['compound'], first_pred['compound']))

test()

In [None]:
# Mean syntax score over all (gold, predicted) test pairs.
total_syntax_score = 0
paired_test_entries = zip(test_true_queries_components, test_pred_queries_components)
for gold_entry, predicted_entry in paired_test_entries:
    total_syntax_score = total_syntax_score + compare_syntactic_structure(
        gold_entry['compound'], predicted_entry['compound']
    )

mean_syntax_score = total_syntax_score / len(test_true_queries_components)
print('Syntax score: ', round(mean_syntax_score, 3))

### OOD compound accuracy

In [None]:
# Every compound structure that occurs anywhere in the training queries,
# pooled across all compound types.
train_structures = {
    structure
    for train_entry in train_queries_components
    for compound_type in train_entry['compound']
    for structure in train_entry['compound'][compound_type]
}

In [None]:
# Accuracy on gold test compounds that never occur in train, per compound type.
oov_comp_acc_dict = {
    key: {
        'hits': 0,
        'total': 0,
        'sample_idx_list': [],
        'hit_compound_example_set': set(),
        'total_compound_example_set': set(),
    }
    for key in compound_result_dict
}
for idx, (true_sample, pred_sample) in enumerate(zip(test_true_queries_components, test_pred_queries_components)):
    test_compound, pred_compound = true_sample['compound'], pred_sample['compound']
    for t_c in test_compound:
        test_c_l, pred_c_l = test_compound[t_c], pred_compound[t_c]
        for test_c in test_c_l:
            # Only gold compounds unseen during training are of interest here.
            if test_c not in train_structures:
                if test_c in pred_c_l:
                    oov_comp_acc_dict[t_c]['hits'] += 1
                    oov_comp_acc_dict[t_c]['sample_idx_list'].append(idx)
                    # Record the gold compound being examined; the original used
                    # `pred_c`, which is not defined in this loop.
                    oov_comp_acc_dict[t_c]['hit_compound_example_set'].add(test_c)
                oov_comp_acc_dict[t_c]['total'] += 1
                oov_comp_acc_dict[t_c]['total_compound_example_set'].add(test_c)

print('OOV compound accuracy per component: ')
for key in oov_comp_acc_dict:
    hits = oov_comp_acc_dict[key]['hits']
    print(f'total hits {key}: ', hits)
    total = oov_comp_acc_dict[key]['total']
    print(f'total ood {key}: ', total)
    # A type without unseen compounds is reported as 0 rather than dividing by zero.
    acc = 0 if total == 0 else hits/total
    print(f'Compound ood accuracy {key} = {round(acc, 3)}')
    


### Текущее распределение train/test по компаундам/атомам

In [None]:
# Reload the raw train/test splits. The paths are built from the notebook's
# configuration constants instead of a hard-coded absolute user directory, and
# the files are closed after reading.
with open(os.path.join(PROJECT_PATH, DATASET_FOLDER_PATH, TRAIN_FILE_NAME), 'r') as train_dataset_file:
    train_dataset = json.load(train_dataset_file)
with open(os.path.join(PROJECT_PATH, DATASET_FOLDER_PATH, TEST_FILE_NAME), 'r') as test_dataset_file:
    test_dataset = json.load(test_dataset_file)

len(train_dataset), len(test_dataset)

1) Насколько LM-задача сама по себе генерирует новые верные структуры?

### TMCD Environment Analysis

In [None]:
# Find predictions whose select_compound / triplet_compound are completely empty:
# such a prediction certainly does not follow the query grammar.

k = 0  # number of predictions that pass the check
failed_grammar = []  # [index, sample] pairs for predictions that fail it
for idx, sample in enumerate(test_pred_queries_components):
    # Prediction entries store their data under 'compound' and 'query'.
    compound_dict = sample['compound']
    query_tokens = sample['query'].split()
    # If the first generated token is "ask", only the triplet compound is required;
    # an empty prediction has no first token and is checked like a select query.
    if query_tokens and query_tokens[0] == 'ask':
        is_grammatical = len(compound_dict['triplet_compound']) > 0
    else:
        is_grammatical = (
            len(compound_dict['select_compound']) > 0
            and len(compound_dict['triplet_compound']) > 0
        )

    if is_grammatical:
        k += 1
    else:
        failed_grammar.append([idx, sample])

# Note from the original run: the ratio was reported as 68%, i.e. the model does not
# always generate a grammatically correct query, so that has to be addressed as well.
k / len(test_pred_queries_components)

In [None]:
# Display the [index, sample] pairs collected by the grammar check.
failed_grammar

In [None]:
# Display one gold test sample. The index 260 is hard-coded.
# NOTE(review): presumably an index taken from failed_grammar; confirm, and note it fails on shorter test sets.
test_true_queries_components[260]

In [None]:
# Displays failed_grammar again, same as an earlier cell.
# NOTE(review): looks like a leftover duplicate; consider removing.
failed_grammar

In [None]:
# failed_grammar holds [index, sample] pairs, so the sample is element 1 of the
# pair and its query text is stored under 'query'. Evaluates to False when no
# prediction failed the grammar check.
len(failed_grammar) > 0 and 'ask' in failed_grammar[0][1]['query']

In [None]:
import matplotlib.pylab as plt
import numpy as np

%pylab inline

In [None]:
# Zero-initialised mapping with one entry per compound type of the first training sample.
compound_dict = dict.fromkeys(train_queries_components[0]['compound'], 0)

In [None]:
# How much do the train, test and predicted structure sets overlap?

def _overlap_share(reference_set, other_set):
    """Share of `reference_set` that also occurs in `other_set` (0.0 for an empty reference)."""
    if not reference_set:
        return 0.0
    return len(reference_set.intersection(other_set)) / len(reference_set)


train_structures = set()
for sample in train_queries_components:
    train_compounds = sample['compound']
    for compound in train_compounds:
        train_structures.update(set(train_compounds[compound]))

# A bare expression in the middle of a cell is never displayed, so print the size.
print('train structures:', len(train_structures))

test_structures, pred_structures = set(), set()
for true_sample, pred_sample in zip(test_true_queries_components, test_pred_queries_components):
    # Prediction entries store their compounds under 'compound'.
    test_compound, pred_compound = true_sample['compound'], pred_sample['compound']
    # Walk each dict on its own; zipping the key views would silently drop
    # entries if the two dicts had a different number of keys.
    for t_c in test_compound:
        test_structures.update(set(test_compound[t_c]))
    for p_c in pred_compound:
        pred_structures.update(set(pred_compound[p_c]))

print('test / pred structures:', len(test_structures), len(pred_structures))


print('Пересечение трейн структур и теста:', _overlap_share(train_structures, test_structures))

print('Пересечение трейн структур и предикта:', _overlap_share(train_structures, pred_structures))

print('Пересечение тест структур и предикта:', _overlap_share(test_structures, pred_structures))

In [None]:
# The predictions contain structures that are absent from train.
# How accurate is the model on those?


oov_comp_acc_dict = {
    key: {
        'hits': 0,
        'total': 0,
        'sample_idx_list': [],
        'hit_compound_example_set': set(),
        'total_compound_example_set': set(),
    }
    for key in compound_dict
}
for idx, (true_sample, pred_sample) in enumerate(zip(test_true_queries_components, test_pred_queries_components)):
    # Prediction entries store their compounds under 'compound'.
    test_compound, pred_compound = true_sample['compound'], pred_sample['compound']
    # for every compound type of the query
    for t_c in test_compound:
        test_c_l, pred_c_l = test_compound[t_c], pred_compound[t_c]
        # look at the predicted structures
        for pred_c in pred_c_l:
            # the model never saw this structure during training
            if pred_c not in train_structures:
                # and it was nevertheless predicted correctly
                if pred_c in test_c_l:
                    oov_comp_acc_dict[t_c]['hits'] += 1
                    oov_comp_acc_dict[t_c]['sample_idx_list'].append(idx)
                    oov_comp_acc_dict[t_c]['hit_compound_example_set'].add(pred_c)
                oov_comp_acc_dict[t_c]['total'] += 1
                oov_comp_acc_dict[t_c]['total_compound_example_set'].add(pred_c)

print('OOV compound accuracy per component: ')
for key in oov_comp_acc_dict:
    hits = oov_comp_acc_dict[key]['hits']
    total = oov_comp_acc_dict[key]['total']
    acc = 0 if total == 0 else hits/total
    print(f'{key} = {round(acc, 3)}')
    print(f'Total compounds for {key}: ', total)
    print()


# Notes from the original run: the model cannot generalise to select structures it
# did not see during training, but it can generalise to triplet structures.
# filter and order structures recur in both train and test, which is why they are 0 here.

# Original note: out of ~2k invented structures only about 30% are generated correctly.
triplet_stats = oov_comp_acc_dict['triplet_compound']
n_unseen_triplets = len(triplet_stats['total_compound_example_set'])
# Report 0 instead of dividing by zero when no unseen triplet structure was predicted.
print(f"{len(triplet_stats['hit_compound_example_set']) / n_unseen_triplets if n_unseen_triplets else 0}")

### Инженерия environment





In [None]:
# Average number of compounds of each type per training query.
train_components_counter = {key: 0 for key in train_queries_components[0]['compound']}
for sample in train_queries_components:
    sample_compounds = sample['compound']
    for key in sample_compounds:
        train_components_counter[key] += len(sample_compounds[key])

# Normalise over the counter's own keys instead of relying on the loop variable
# left over from the last sample.
for key in train_components_counter:
    train_components_counter[key] /= len(train_queries_components)

In [None]:
# Average number of compounds of each type per gold test query.
# The compound types are taken from the first training sample.
test_components_counter = {key: 0 for key in train_queries_components[0]['compound']}
for sample in test_true_queries_components:
    sample_compounds = sample['compound']
    for key in sample_compounds:
        test_components_counter[key] += len(sample_compounds[key])

# Normalise over the counter's own keys instead of relying on the loop variable
# left over from the last sample.
for key in test_components_counter:
    test_components_counter[key] /= len(test_true_queries_components)

In [None]:
# Average number of compounds of each type per predicted test query.
# The compound types are taken from the first training sample.
test_pred_components_counter = {key: 0 for key in train_queries_components[0]['compound']}
for sample in test_pred_queries_components:
    # Prediction entries store their compounds under 'compound'.
    sample_compounds = sample['compound']
    for key in sample_compounds:
        test_pred_components_counter[key] += len(sample_compounds[key])

# Normalise over the counter's own keys instead of relying on the loop variable
# left over from the last sample.
for key in test_pred_components_counter:
    test_pred_components_counter[key] /= len(test_pred_queries_components)

In [None]:
# X_axis is otherwise first assigned in the plotting cell further down, so a bare
# `X_axis` here fails on a fresh kernel. Compute it with the same expression
# before displaying it.
X_axis = np.arange(len(train_components_counter))
X_axis

In [None]:
# Display the per-type average compound counts computed for the gold test queries.
test_components_counter

In [None]:
  
# Grouped bar chart: average number of compounds per query for each compound type,
# shown for train, gold test and predicted test queries.
fig, ax = plt.subplots(figsize=(15, 6), dpi=80)
X_axis = np.arange(len(train_components_counter))
X_keys = list(train_components_counter.keys())
train_vals = list(train_components_counter.values())
test_vals = list(test_components_counter.values())
pred_vals = list(test_pred_components_counter.values())

width = 0.2
# Each series is shifted right by one bar width relative to the previous one.
labelled_series = [('Train', train_vals), ('Test', test_vals), ('Pred', pred_vals)]
for series_position, (series_label, series_vals) in enumerate(labelled_series):
    ax.bar(X_axis + width * series_position, series_vals, width, label=series_label)

ax.set_xticks(X_axis)
ax.set_xticklabels(X_keys)
ax.legend()
plt.show()