# Changing format of original datasets 

In [None]:
#turning original datasets into usable files for analysis
import json
import re
import  string

devset_path = "Electra_analysis/electra/finetuning_data/squad/newsqa_dev.json"
trainset_path = "Electra_analysis/electra/finetuning_data/squad/newsqa_train.json"
#or change the path to where the corresponding files can be found

#since there are no unanswerable questions, just take the preds generated by the model
pred_path = "Electra_analysis/electra/models/electra_large/results/squad_qa/squad_preds.json"

def normalize_answer(s):
    """Return a canonical form of an answer string for exact-match comparison.

    The steps run in this order: lowercase the text, strip every ASCII
    punctuation character, replace the standalone words "a", "an" and "the"
    with a space, then collapse all runs of whitespace to single spaces
    (which also trims both ends).
    """
    lowered = s.lower()
    # Delete all characters listed in string.punctuation in a single pass.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Articles are matched on word boundaries only, so "theater" is untouched.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct, flags=re.UNICODE)
    return ' '.join(without_articles.split())

import csv  # imported here because this cell runs before the cell that imports csv

# Load the dev set (SQuAD-style JSON with a top-level 'data' list).
with open(devset_path, encoding="utf8") as f:  # encoding may not be needed
    dev = json.load(f)
    dev_data = dev['data']

# Load the model predictions: a mapping from question id to predicted answer text.
with open(pred_path, encoding="utf8") as f:
    pred = json.load(f)

# Each result is kept as a list of fields (not a pre-joined string) so that
# tabs, newlines or quotes inside a context/question cannot shift columns.
has_answer_results = []
no_answer_results = []
for data_blob in dev_data:
    for paragraph in data_blob['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            pred_answer = normalize_answer(pred[qa['id']])
            # SQuAD-1.1-style files have no 'is_impossible' key; a missing
            # flag is treated as "answerable" instead of raising KeyError.
            if qa.get('is_impossible', False):
                # An unanswerable question is correct only when the prediction is empty.
                correct = pred_answer == ''
                no_answer_results.append(
                    [qa['id'], correct, pred_answer or '-', qa['question'], context, ''])
            else:
                all_answers = [normalize_answer(answer_dict['text'])
                               for answer_dict in qa['answers']]
                # Exact match against any of the normalized gold answers.
                correct = pred_answer in all_answers
                has_answer_results.append(
                    [context, qa['id'], correct, pred_answer or '-',
                     '[' + '|'.join(all_answers) + ']', qa['question'], ''])

# Group incorrect ('False') rows before correct ('True') ones; the sort is
# stable, so dataset order is kept within each group.
has_answer_results.sort(key=lambda row: str(row[2]))
no_answer_results.sort(key=lambda row: str(row[1]))

# csv.writer quotes any field containing a tab, newline or quote character, so
# the file can be parsed back with csv.reader(delimiter='\t'). The trailing
# empty field keeps the seventh (blank) column that every row ends with.
with open("elect-question-type-has-answer-stats.txt", 'w', encoding="utf8", newline='') as f:  # encoding may not be needed
    writer = csv.writer(f, delimiter='\t', lineterminator='\n')
    writer.writerow(['context', 'id', 'TF', 'pred_answer', 'possible_answers', 'question', ''])
    writer.writerows(has_answer_results)

# Creating csv file out of answerable questions

In [None]:
import csv

# Read the tab-separated stats file back as a list of rows, header row first.
# Despite its name, first_column_y holds every row of the file, not one column.
# newline='' leaves newline handling to the csv module (as its documentation
# requires), so quoted fields containing line breaks are parsed intact.
with open('elect-question-type-has-answer-stats.txt', 'r', encoding="utf8", newline='') as f:  # encoding may not be needed
    first_column_y = list(csv.reader(f, delimiter='\t'))
    

In [None]:
from pandas import DataFrame

# Build a frame from all parsed rows, using the first row (the header) as the
# column names. The header is still present as the first data row at this point.
df_y = DataFrame(data=first_column_y, columns=first_column_y[0])


In [None]:
# Drop the first row by position (all columns kept), then preview the first five rows.
df_y = df_y.iloc[1:, :]
df_y.head(5)

In [None]:
#df_y = df_y[df_y.TF != '']  # optional: uncomment to drop rows whose TF column is empty

In [None]:
# Export the answerable-question table as CSV in the working directory (index column included).
df_y.to_csv(path_or_buf='has_answer_newsqa.csv')