In [1]:
from suicide_squad.DataUtils import *
from suicide_squad.DataProcessing import *
from transformers import BertTokenizer, BertForQuestionAnswering,BertConfig
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize

import numpy as np
import re
import logging
from ast import literal_eval
from IPython.display import clear_output

In [2]:
import logging
# Raise the threshold of the 'transformers.tokenization_utils' logger to ERROR so
# that lower-severity records from that logger are not emitted while tokenizing.
# (`logging` is already imported in the first cell; the re-import is harmless.)
logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR)

In [3]:
# Module-level tokenizer shared by get_id() and to_features() below; loaded from
# the 'bert-base-uncased' pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
def get_id(text):
    """Return the vocabulary ids of ``text`` as produced by the module-level ``tokenizer``.

    The text is first split into tokens and the tokens are then mapped to ids.
    ``max_length=800`` is forwarded to ``tokenize()`` unchanged; whether it
    actually truncates depends on the tokenizer implementation -- TODO confirm.
    """
    tokens = tokenizer.tokenize(text, max_length=800)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

def _find_token_span(needle, haystack):
    """Return the index of the first occurrence of ``needle`` in ``haystack``, or -1.

    Both arguments are lists of token ids and are compared element by element,
    so a hit always starts and ends on a token boundary. An empty ``needle``
    is reported as not found.
    """
    width = len(needle)
    if width == 0:
        return -1
    for offset in range(len(haystack) - width + 1):
        if haystack[offset:offset + width] == needle:
            return offset
    return -1


def to_features(target,issues):
    """Add BERT question-answering labels to one question/document/answer row.

    Parameters
    ----------
    target : row-like object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must expose ``question_text``, ``doc_text`` and ``answer_text`` both as
        items and as attributes, plus a ``name`` attribute identifying the row.
    issues : list
        ``target.name`` is appended when the answer tokens cannot be found in
        the document tokens.

    Returns
    -------
    The same ``target`` object, with these fields set:

    * ``text`` -- ``'[CLS] question [SEP] doc [SEP]'``.
    * ``start_label`` / ``end_label`` -- position of the answer inside
      ``tokenizer.encode(question_text, doc_text)``; ``end_label`` is exclusive
      (``start_label + number of answer tokens``). Both are 0 when the answer
      is not found or an exception was logged.
    * ``extracted_answer`` / ``answers_match`` -- the text re-read from that
      span and whether its ids equal the answer ids (only set on a match).
    * ``total_tokens`` -- question tokens + document tokens + 3.

    The label fields are only set when all three texts are non-empty.

    The answer is located by comparing token-id lists rather than by running a
    regular expression over space-joined id strings: the string form has no
    token boundaries, so an id such as ``2001`` also matched inside ``12001``
    and produced a wrong start position.
    """
    target['text']='[CLS] '+target['question_text']+' [SEP] '+target['doc_text']+' [SEP]'

    # encode() is expected to wrap the ids in [CLS] ... [SEP]; [1:-1] drops that pair.
    doc_text_id_list=tokenizer.encode(target.doc_text)[1:-1]
    answer_id_list=tokenizer.encode(target.answer_text)[1:-1]
    question_id_list=tokenizer.encode(target.question_text)[1:-1]

    if target.answer_text and target.question_text and target.doc_text:
        try:
            offset=_find_token_span(answer_id_list,doc_text_id_list)
            if offset>=0:
                # +2 skips the leading [CLS] and the [SEP] that follows the
                # question (assumes the pair is encoded as
                # [CLS] question [SEP] doc [SEP]).
                target['start_label']=offset+len(question_id_list)+2
                target['end_label']=target['start_label']+len(answer_id_list)
                # Read the span back from the jointly encoded pair as a sanity check.
                answer_id_extracted=tokenizer.encode(target.question_text,target.doc_text)[target['start_label']:target['end_label']]
                target['extracted_answer']=tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_id_extracted))
                target['answers_match']=(list(answer_id_extracted)==list(answer_id_list))
            else:
                issues.append(target.name)
                target['start_label']=0
                target['end_label']=0
        except Exception as e:
            # Best effort: log the problem and fall back to empty labels.
            logging.warning(e)
            target['start_label']=0
            target['end_label']=0

        # +3 accounts for [CLS] and the two [SEP] markers.
        target['total_tokens']=len(doc_text_id_list)+len(question_id_list)+3
    return target

In [5]:
# `squad_to_csv`, `files` and `download_folder` are not defined in this notebook;
# presumably they come from the `suicide_squad` star imports -- verify.
# Per the recorded output, this downloads the SQuAD files, converts them to CSV
# and returns the list of CSV paths.
squad_to_csv(files,download_folder)

Downloading Files
Converting Files to CSV
Finished Downloading and Converting to CSV


['D:\\GitHub\\Suicide-SQuAD\\data\\original\\train-v1.1.csv',
 'D:\\GitHub\\Suicide-SQuAD\\data\\original\\dev-v1.1.csv',
 'D:\\GitHub\\Suicide-SQuAD\\data\\original\\train-v2.0.csv',
 'D:\\GitHub\\Suicide-SQuAD\\data\\original\\dev-v2.0.csv']

In [6]:
# Load every SQuAD split; the result is iterated with `.items()` in the next
# cell, so it is a mapping of split name -> DataFrame.
# NOTE(review): the recorded output suggests this call repeats the download and
# CSV conversion already done by squad_to_csv() above -- confirm whether both
# calls are needed.
squad_dfs=squad_to_df(files,download_folder)

Downloading Files
Downloading Files
Converting Files to CSV
Finished Downloading and Converting to CSV
Converting Files to DataFrame
Finished Downloading and Converting to DataFrame


In [7]:
# For every split: keep the answerable rows, compute BERT labels with
# to_features(), and write both the labelled rows and the rows whose answer
# could not be located to './/data/processed//'.
all_issues={}
selected_columns=['qa_id','start_label','end_label','text','total_tokens']
squad_final={}
for k,v in squad_dfs.items():

    print('Preprocessing: ',k)
    issues=[]
    temp=v.dropna(subset=['answer_text'])
    temp=temp[~temp.is_impossible]
    if('dev' in k):
        # In dev splits `answer_text` is parsed with literal_eval into a
        # collection of answers; emit one row per distinct answer, repeating
        # the other columns accordingly.
        parsed_answers=temp['answer_text'].apply(literal_eval)
        repeats=parsed_answers.apply(lambda refs:len(set(refs)))
        distinct_answers=parsed_answers.apply(lambda refs:np.unique(refs)).values
        other_columns=temp.columns.drop('answer_text')
        temp_new=pd.DataFrame({col:np.repeat(temp[col].values, repeats) for col in other_columns})
        temp_new['answer_text']=[answer for group in distinct_answers for answer in group]
        temp=temp_new
    # to_features() appends the index label of every unmatched row to `issues`.
    features=temp.apply(lambda row:to_features(row,issues),axis=1)

    issue_rows=temp[temp.index.isin(issues)]
    all_issues[k]=issue_rows
    issue_rows.to_csv('.//data/processed//'+k+'_issues.csv')
    features.dropna(how='all',inplace=True)
    final=features[~features.index.isin(issues)]
    squad_final[k]=final
    final.to_csv('.//data/processed//'+k+'_bert_preprocessed.csv')
    print('Finished Preprocessing: ',k)
    clear_output(wait=True)
clear_output(wait=True)

In [11]:
# Attach a `span_label` column (the token positions covered by each answer) to
# every preprocessed split and write it to './/data/processed//<name>_bert_ready.csv'.
#
# NOTE(review): to_features() uses `end_label` as an exclusive slice bound
# (start_label + number of answer tokens), so `end_label + 1` below makes each
# span one position longer than the answer. Kept as-is -- confirm it is intended.
for k,v in squad_final.items():
    print('Processing: ',k)
    name=k.replace('.','_').replace('-','_')
    temp=v.copy()
    if('dev' in k):
        # dev splits hold one row per distinct answer; merge the spans of all
        # answers to the same question into one sorted list of positions.
        qa_id_span_map={}
        # Group by the bare column name: grouping by a one-element list makes
        # newer pandas yield 1-tuple keys, which would never match the scalar
        # qa_id values in the .map() lookup below.
        for g_label,g in v.groupby('qa_id'):
            positions=set()
            for first,last in zip(g.start_label,g.end_label):
                positions.update(range(first,last+1))
            # range() yields plain Python ints, so the stored string is always
            # of the form '[5, 6, 7]' regardless of the NumPy scalar repr.
            qa_id_span_map[g_label]=str(sorted(positions))
        temp['span_label']=v['qa_id'].map(qa_id_span_map)
        temp=temp.drop_duplicates('qa_id')
    else:
        span=[list(range(first,last+1)) for first,last in zip(v.start_label,v.end_label)]
        temp.loc[:,'span_label']=span
    temp.to_csv('.//data/processed//'+name+'_bert_ready.csv')
    print('finished processing ',k)
    clear_output(wait=True)

clear_output(wait=True)
print('done')

done
