In [1]:
import pandas as pd

In [2]:
text_train_inputs = pd.read_csv('train-inputs.csv')
text_train_targets = pd.read_csv('train-targets.csv')

text_dev_inputs = pd.read_csv('dev-inputs.csv')
text_dev_targets = pd.read_csv('dev-targets.csv')

text_test_inputs = pd.read_csv('test-inputs.csv')
text_test_targets = pd.read_csv('test-targets.csv')

In [142]:
list(text_dev_targets[text_dev_targets['ReviewID'] == 'CD007016']['Target'])

['At present, there is insufficient RCT evidence to enable evaluation of the effect of TPD in patients with AKI. Well-designed and larger RCTs are required to better understand the risks and benefits of TPD for AKI.']

In [3]:
import json

def read_json(filename):
    with open(filename, 'r') as fp:
        all_lines = json.load(fp)
    return all_lines

rr_train_input = read_json('RR-train.json')
rr_train_output = read_json('abstracts-summarization-train.json')
rr_dev_input = read_json('RR-dev.json')
rr_dev_output = read_json('abstracts-summarization-dev.json')
rr_test_input = read_json('RR-test.json')
rr_test_output = read_json('abstracts-summarization-test.json')

In [4]:
def _make_review_pmid_map(input_data):
    review_pmid_map = {}
    print(input_data.keys())
    for k , rid in input_data['ReviewID'].items():
        pmids = input_data['PMID'][k]
        review_pmid_map[rid] = pmids
    return review_pmid_map

In [5]:
rr_train_review_pmid_map = _make_review_pmid_map(rr_train_output)
rr_dev_review_pmid_map = _make_review_pmid_map(rr_dev_output)
rr_test_review_pmid_map = _make_review_pmid_map(rr_test_output)

dict_keys(['ReviewID', 'SummaryBackground', 'SummaryConclusions', 'PMID', 'Title', 'Abstract'])
dict_keys(['ReviewID', 'SummaryBackground', 'SummaryConclusions', 'PMID', 'Title', 'Abstract'])
dict_keys(['ReviewID', 'SummaryBackground', 'SummaryConclusions', 'PMID', 'Title', 'Abstract'])


In [166]:
import re, string

def _find_pmid(abstracts, ptext):
    for i,abst in enumerate(abstracts):
            if ptext in abst:
                return i
    return None
  
def preprocess_str(text):
    import re
    puncts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~↑↓―'
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(' ', text)
    text = [w.lower().strip(puncts) for w in text.split()if w.strip()]
    
    return ' '.join(text)

def preprocess(row_vals):
    new_row_vals = []
    for each_val in row_vals:
        each_val = each_val if type(each_val) is list else [str(each_val)]
        #print(each_val)
        each_val = [preprocess_str(each) for each in each_val]
        each_val = ' <sep> '.join(each_val)
        new_row_vals.append(each_val)
    return new_row_vals

def make_review_data(review_pmid_map, text_inputs, rr_input, text_target):
    review_data = {'PMID' : [], 'Abstract' : [], 'ReviewID': [], 'SummaryConclusions': []}
    for rid, pmids in review_pmid_map.items():
        pmids = [int(each) for each in pmids]
        df_sample = text_inputs[text_inputs['PMID'].isin(pmids)]
        df_sample = df_sample[df_sample['Abstract'].notna()]
        df_pmids = list(df_sample['PMID'])
        df_abstract = list(df_sample['Abstract'])
        target = text_target[text_target['ReviewID'] == rid]
        target = list(target['Target'])[0]
        target = preprocess_str(target)
        pmid_data = []
        abstract_data = []
        pico_data = {}
        for pico_ele in rr_input[rid]:
            punchline_text = pico_ele['punchline_text']
            pmid_ind = _find_pmid(df_abstract, punchline_text)
            if pmid_ind:
                pmid = df_pmids[pmid_ind]
                abstract = df_abstract[pmid_ind]
                pmid_data.append(pmid)
                abstract_data.append(abstract)
                
                for k , v in pico_ele.items():
                    if k not in pico_data:
                        pico_data[k] = []
                    pico_data[k].append(v)
        if pmid_data:
            if not df_abstract:
                print(rid)
            pmid_data = preprocess(pmid_data)
            abstract_data = preprocess(abstract_data)
            
            #review_data['PMID'].append(pmid_data)
            #review_data['Abstract'].append(abstract_data)
            for k , v in pico_data.items():
                if 'mesh' not in k:
                    v = preprocess(v)
                    pico_data[k] = v
                    if k not in review_data:
                        review_data[k] = []
                    #v = ["<%s> "%k + each + " </%s>"%k for each in v]
                    #v = " ".join(v)
                    review_data[k].append(v)
            review_data['PMID'].append(pmid_data)
            review_data['Abstract'].append(abstract_data)
            review_data['ReviewID'].append(rid)
            review_data['SummaryConclusions'].append(target)
                
    return review_data

        
        

In [167]:
review_data_dev = make_review_data(rr_dev_review_pmid_map, text_dev_inputs, rr_dev_input, text_dev_targets)

In [160]:
review_data_dev['ReviewID']

['CD007119',
 'CD006525',
 'CD005571',
 'CD003390',
 'CD002830',
 'CD003656',
 'CD004450',
 'CD006919',
 'CD005653',
 'CD002170',
 'CD008686',
 'CD006182',
 'CD007921',
 'CD004439',
 'CD007775',
 'CD005515',
 'CD003317',
 'CD008407',
 'CD005454',
 'CD001951',
 'CD002744',
 'CD004374',
 'CD004954',
 'CD005550',
 'CD006001',
 'CD005640',
 'CD006188',
 'CD000328',
 'CD008896',
 'CD003046',
 'CD003078',
 'CD005170',
 'CD001281',
 'CD001545',
 'CD005291',
 'CD006921',
 'CD008914',
 'CD006063',
 'CD001531',
 'CD009107',
 'CD003669',
 'CD003150',
 'CD000165',
 'CD007176',
 'CD007990',
 'CD000212',
 'CD003895',
 'CD010609',
 'CD002759',
 'CD004147',
 'CD004402',
 'CD007004',
 'CD000088',
 'CD000019',
 'CD009411',
 'CD003092',
 'CD007440',
 'CD007302',
 'CD005533',
 'CD004621',
 'CD003487',
 'CD009046',
 'CD003076',
 'CD001708',
 'CD003822',
 'CD003388',
 'CD003265',
 'CD003283',
 'CD007749',
 'CD006225',
 'CD007196',
 'CD008444',
 'CD004441',
 'CD006971',
 'CD004316',
 'CD000424',
 'CD008392',

In [168]:
dev_df = pd.DataFrame(review_data_dev)

In [162]:
def _sanity_check(df):
    for ind, row in df.iterrows():
        pmids = list(row['PMID'])
        pop = list(row['population'])
        Abstract = list(row['Abstract'])
        
        assert( len(pmids) == len(pop) ==len(Abstract))

In [163]:
_sanity_check(dev_df)

In [169]:
dev_df

Unnamed: 0,PMID,Abstract,ReviewID,SummaryConclusions,sample_size,population,interventions,outcomes,punchline_text,punchline_effect,random_sequence_generation,allocation_concealment,blinding_participants_personnel
0,"[15178717, 11487675, 7878629, 15351854, 9215844]",[recombinant urokinase r uk is a high molecula...,CD007119,there is inadequate evidence to draw strong co...,"[108, 149, not found, 180, 42]",[adult and pediatric patients from 1 year of a...,[recombinant urokinase urokinase alfa <sep> re...,[restoring total cvad function <sep> safety an...,[all three concentrations of r uk were signifi...,"[ sig increase, no diff, sig increase, no d...","[judgement low, judgement low, judgement high ...","[judgement low, judgement low, judgement high ...","[judgement low, judgement low, judgement high ..."
1,"[18802161, 10634337, 8857869, 17504586, 163306...",[to determine the effectiveness of the allevia...,CD006525,collaborative care is associated with signific...,"[472, not found, 153, 105, 1500, 288, 61, 208,...",[or 18 years with major depression 49 dysthymi...,[euc <sep> collaborative care management <sep>...,[emotional <sep> quality of life and lower pai...,[improvement was also found for 5 point decrea...,"[ sig increase, no diff, sig decrease, sig ...","[judgement low, judgement high unclear, judgem...","[judgement low, judgement high unclear, judgem...","[judgement low, judgement high unclear, judgem..."
2,"[12869081, 17099376, 10768245, 16167973, 75559...",[peristomal infection can sometimes complicate...,CD005571,administration of systemic prophylactic antibi...,"[141, 96, 216, 99, 100, 633, 115, not found, 3...",[forty patients who for various reasons were a...,[percutaneous endoscopic gastrostomy peg place...,[peristomal wound infection <sep> erythema and...,[peristomal wound infection was significantly ...,"[ sig decrease, no diff, no diff, no diff, ...","[judgement low, judgement high unclear, judgem...","[judgement low, judgement high unclear, judgem...","[judgement low, judgement high unclear, judgem..."
3,"[1974941, 8257478]",[41 33 of 123 patients with acute psychiatric ...,CD003390,the limited available evidence suggests folate...,"[123, 96]",[123 patients with acute psychiatric disorders...,"[placebo, oral 5 methyltetrahydrofolic acid <s...","[clinical and social recovery, rey s verbal me...",[among both depressed and schizophrenic patien...,"[ sig increase, sig increase]","[judgement high unclear, judgement high unclear]","[judgement high unclear, judgement high unclear]","[judgement high unclear, judgement high unclear]"
4,[6376480],[in a double blind clinical study 27 acutely a...,CD002830,this is an important and surprisingly under re...,[27],[27 acutely agitated patients <sep> acutely ag...,[haloperidol <sep> intramuscular injection of ...,[],[at 30 minutes following treatment 81 of the p...,[ sig increase],[judgement high unclear],[judgement high unclear],[judgement high unclear]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,"[20575920, 1853618]",[ambiguity remains about the effectiveness of ...,CD002929,from the limited results it is unclear whether...,"[827, 3088]",[eight hundred twenty seven participants under...,[mask group all non scrubbed staff wore a mask...,[pre operative stay <sep> surgical site infect...,[overall 83 10 2 surgical site infections were...,"[ no diff, no diff]","[judgement low, judgement high unclear]","[judgement low, judgement high unclear]","[judgement low, judgement high unclear]"
408,"[20409497, 16908789, 21041839, 21991893, 21635...",[smoking remains the primary preventable cause...,CD006103,cytisine increases the chances of quitting alt...,"[1202, 130, 602, 1542, 198, 128, 618, 32, 714,...",[smokers using varenicline <sep> 2010 american...,[behavioral smoking cessation program <sep> va...,[percentage of abstinence <sep> abstinence out...,[the ptc group had a significantly higher perc...,"[ sig increase, sig increase, sig decrease, ...","[judgement high unclear, judgement high unclea...","[judgement high unclear, judgement high unclea...","[judgement high unclear, judgement high unclea..."
409,"[10874062, 12621132, 12621133, 7698575, 119324...",[patients with familial adenomatous polyposis ...,CD004079,there was evidence from three pooled rcts that...,"[77, 635, 1121, 22, 41, 10, 272]",[patients with familial adenomatous polyposis ...,"[placebo <sep> celecoxib, placebo <sep> aspiri...",[mean number of colorectal polyps <sep> number...,[the reductions in the group receiving 100 mg ...,"[ sig decrease, sig increase, sig increase, ...","[judgement low, judgement low, judgement low, ...","[judgement high unclear, judgement low, judgem...","[judgement low, judgement low, judgement low, ..."
410,"[9598622, 19135641, 7269016, 17552953, 6895918...",[to compare in a randomized prospective study ...,CD006576,antibiotic prophylaxis is effective in prevent...,"[83, not found, 40, 497, 63, 192, 400, 117, 11...",[patients with symptomatic urinary tract infec...,[oral co amoxiclav <sep> oral co amoxiclav gro...,[rate of positive msus <sep> positive msus <se...,[there was no statistically significant differ...,"[ no diff, no diff, sig increase, sig incre...","[judgement high unclear, judgement high unclea...","[judgement high unclear, judgement high unclea...","[judgement high unclear, judgement high unclea..."


In [112]:
list(dev_df['population'].items())[0]

(0,
 '<population> adult and pediatric patients from 1 year of age <sep> one hundred eight patients with cvad withdrawal or total occlusion were enrolled and randomized to treatment 104 patients received at least one instillation of study drug and 101 patients completed treatment <sep> occluded central venous catheters in oncology patients </population> <population> one hundred forty nine patients were randomized 74 received <sep> patients were eligible for inclusion if blood could not be withdrawn from their catheter after a period of normal function of at least 48 hours </population> <population> thrombosed central venous catheters <sep> fifty dysfunctional central venous catheters proven radiographically to be occluded by thrombus </population> <population> 180 patients were enrolled at 43 sites in the united states and canada <sep> adult and pediatric patients with occluded non hemodialysis cvads of any duration or type <sep> most patients were adults although 20 were 18 years of a

In [83]:
import re
puncts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~↑↓―'
a = "he0uw!-t"
a = re.sub(r"[!\"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~↑↓-]+", ' ', a)
a

'This is  fortunately  A 23 Test  string'