In [1]:
import json

def read_rr_file(filename):
    """Load a JSON file and return the parsed object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file's top-level
        element (the notebook indexes the loaded results as dicts).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is exchanged as UTF-8; naming the encoding keeps the read
    # independent of the platform's locale default.
    with open(filename, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [2]:
# Load the raw training split: the per-review study records (RR-train.json)
# and the column-oriented abstracts/summaries table.
# NOTE(review): these are absolute, user-specific paths; they resolve only on
# a machine with the same directory layout.
rr_train = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/RR-train.json')
abs_train = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/abstracts-summarization-train.json')

In [5]:
# Look up the review ID stored for row '57' of the raw abstracts table.
# This needs the raw, column-oriented `abs_train` (with a 'ReviewID' column),
# i.e. the value as loaded from disk, before any later cell re-keys it.
rid = abs_train['ReviewID']['57']
rid

'CD002240'

In [6]:
# Display the list of study records stored under one review ID in the raw
# RR-train data (the ID is hard-coded here).
rr_train['CD002240']

[{'sample_size': '134',
  'population': ['tertiary hospital in The Netherlands, 134 children aged 4 to 18 years with functional constipation',
   'children with functional fecal incontinence associated with constipation',
   'childhood constipation'],
  'population_mesh': [{'mesh_term': 'Tertiary Care Centers',
    'mesh_ui': 'D062606',
    'cui': 'C0587437'},
   {'mesh_term': 'Netherlands', 'mesh_ui': 'D009426', 'cui': 'C0027778'},
   {'mesh_term': 'Child', 'mesh_ui': 'D002648', 'cui': 'C0008059'},
   {'mesh_term': 'Aging', 'mesh_ui': 'D000375', 'cui': 'C0001811'},
   {'mesh_term': 'Constipation', 'mesh_ui': 'D003248', 'cui': 'C0009806'},
   {'mesh_term': 'Fecal Incontinence',
    'mesh_ui': 'D005242',
    'cui': 'C0015732'}],
  'interventions': ['Behavioral therapy',
   'laxative therapy',
   'behavioral therapy or conventional treatment',
   'conventional treatment',
   'Behavioral therapy with laxatives',
   'behavioral therapy with laxatives'],
  'interventions_mesh': [{'mesh_term

In [13]:
# List the top-level keys (columns) of the raw abstracts table.
abs_train.keys()

dict_keys(['ReviewID', 'SummaryBackground', 'SummaryConclusions', 'PMID', 'Title', 'Abstract'])

In [60]:

def make_abs_summ_data(abs_data):
    """Re-key the column-oriented abstracts table by review ID.

    Args:
        abs_data: Dict with the columns 'ReviewID', 'SummaryBackground',
            'SummaryConclusions', 'PMID', 'Title' and 'Abstract'. Each
            column is itself a dict keyed by the same row ids.

    Returns:
        Dict mapping each review ID to a dict with the keys
        'SummaryBackground', 'SummaryConclusions', 'PMID', 'Title' and
        'Abstract' taken from one row. When several rows share a review ID,
        the row that comes last in ``abs_data['ReviewID']`` overwrites the
        earlier ones.

    Raises:
        KeyError: If a column is missing or has no entry for a row id.
    """
    fields = ('SummaryBackground', 'SummaryConclusions', 'PMID', 'Title', 'Abstract')
    abs_rr_data = {}

    for abs_id, rid in abs_data['ReviewID'].items():
        abs_rr_data[rid] = {field: abs_data[field][abs_id] for field in fields}

    #### sanity check #####
    # Check the dict built above. The loop used to read the notebook global
    # `abs_rr_train`, which raises NameError on a fresh kernel and otherwise
    # inspects data left over from an earlier run.
    for record in abs_rr_data.values():
        if len(record) < len(fields):
            print(record.keys())

    return abs_rr_data

In [52]:
def make_struct_data(rr_data):
    """Pool selected study-level fields for every review.

    Args:
        rr_data: Dict mapping a review ID to a list of study dicts. Each
            study dict must provide 'sample_size', 'outcomes' and
            'punchline_text'; a value may be a list or a single item.

    Returns:
        Dict mapping each review ID to
        ``{'sample_size': [...], 'outcomes': [...], 'punchline_text': [...]}``.
        Each list holds the distinct values found across all of the review's
        studies, in first-seen order.

    Raises:
        KeyError: If a study dict lacks one of the three fields.
        TypeError: If a collected value is unhashable.
    """
    fields = ('sample_size', 'outcomes', 'punchline_text')
    struct_rr_data = {}
    for rid, studies in rr_data.items():
        pooled = {field: [] for field in fields}
        for study in studies:
            for field in fields:
                value = study[field]
                # Wrap single items BEFORE extending. The earlier one-liner
                # `acc + v if type(v) is list else [v]` parsed as
                # `(acc + v) if ... else [v]`, so every single-item value
                # threw away what had been accumulated from previous studies.
                pooled[field].extend(value if isinstance(value, list) else [value])
        # dict.fromkeys drops duplicates while keeping first-seen order, so
        # the result is the same on every run (set ordering is arbitrary).
        struct_rr_data[rid] = {
            field: list(dict.fromkeys(values)) for field, values in pooled.items()
        }

    #### sanity check #####
    # Check the dict built above rather than the notebook global
    # `struct_rr_train`, which may be undefined or stale.
    for pooled in struct_rr_data.values():
        if len(pooled) < len(fields):
            print(pooled.keys())

    return struct_rr_data



In [46]:
def make_tabular_data(struct_rr, abs_rr):
    """Build parallel source/target lists for reviews present in both inputs.

    Args:
        struct_rr: Dict mapping a review ID to a dict of string lists.
        abs_rr: Dict mapping a review ID to a dict that contains
            'SummaryConclusions'.

    Returns:
        Tuple ``(sources, targets)``. For each review ID of ``struct_rr``
        that also appears in ``abs_rr``, ``sources`` gets the review's lists
        joined with ',' inside a list and '|' between lists, and ``targets``
        gets the matching 'SummaryConclusions' value.
    """
    sources = []
    targets = []
    for review_id, field_lists in struct_rr.items():
        # Reviews without a summary entry contribute nothing.
        if review_id not in abs_rr:
            continue
        summary = abs_rr[review_id]['SummaryConclusions']

        joined_fields = []
        for values in field_lists.values():
            joined_fields.append(','.join(values))

        sources.append('|'.join(joined_fields))
        targets.append(summary)
    return sources, targets

In [39]:
def write_data(src_data, tgt_data, src_file, tgt_file):
    """Write source and target strings to two line-aligned text files.

    Each list is joined with newlines and written without a trailing
    newline. The source file is written first, then the target file.

    Args:
        src_data: List of source strings, one per example.
        tgt_data: List of target strings, one per example.
        src_file: Path of the file that receives ``src_data``.
        tgt_file: Path of the file that receives ``tgt_data``.

    Raises:
        OSError: If either file cannot be opened for writing.

    NOTE(review): an item that itself contains a newline would break the
    one-line-per-example alignment between the two files; confirm the inputs
    are newline-free.
    """
    for path, lines in ((src_file, src_data), (tgt_file, tgt_data)):
        # Pin UTF-8 so the files do not depend on the platform's locale
        # default when the text contains non-ASCII characters.
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(lines))
         

In [61]:
# Reload the raw training files so that `rr_train` / `abs_train` hold the
# on-disk structures expected by make_struct_data / make_abs_summ_data.
# NOTE(review): absolute, user-specific paths; they resolve only on a machine
# with the same directory layout.
rr_train = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/RR-train.json')
abs_train = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/abstracts-summarization-train.json')

In [62]:
# Re-key the abstracts by review ID and pool the study fields per review.
# Both names are overwritten with the transformed data, so this cell is not
# idempotent: make_abs_summ_data reads abs_data['ReviewID'], which the
# re-keyed dict does not provide, so the raw files must be reloaded before
# this cell is run again.
abs_train = make_abs_summ_data(abs_train)
rr_train = make_struct_data(rr_train)

In [73]:
# Pair each review's pooled fields with its 'SummaryConclusions' text and
# write the two lists to src_rr_train.txt / tgt_rr_train.txt. The output
# paths are relative, so the files land in the current working directory.
train_data_src, train_data_tgt = make_tabular_data(rr_train, abs_train)
write_data(train_data_src, train_data_tgt, 'src_rr_train.txt', 'tgt_rr_train.txt')

In [74]:
# Test split: load the raw files, re-key the abstracts by review ID, pool the
# study fields per review, then write src_rr_test.txt / tgt_rr_test.txt to
# the current working directory (same steps as for the training split).
# NOTE(review): absolute, user-specific input paths.
rr_test = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/RR-test.json')
abs_test = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/abstracts-summarization-test.json')
abs_test = make_abs_summ_data(abs_test)
rr_test = make_struct_data(rr_test)
test_data_src, test_data_tgt = make_tabular_data(rr_test, abs_test)
write_data(test_data_src, test_data_tgt, 'src_rr_test.txt', 'tgt_rr_test.txt')

In [75]:
# Dev split: load the raw files, re-key the abstracts by review ID, pool the
# study fields per review, then write src_rr_dev.txt / tgt_rr_dev.txt to the
# current working directory (same steps as for the training split).
# NOTE(review): absolute, user-specific input paths.
rr_dev = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/RR-dev.json')
abs_dev = read_rr_file('/Users/sanjana/destruct/destruct/data/roboreviewer/abstracts-summarization-dev.json')
abs_dev = make_abs_summ_data(abs_dev)
rr_dev = make_struct_data(rr_dev)
dev_data_src, dev_data_tgt = make_tabular_data(rr_dev, abs_dev)
write_data(dev_data_src, dev_data_tgt, 'src_rr_dev.txt', 'tgt_rr_dev.txt')