In [1]:
import pandas as pd
import io
import re

In [3]:
def get_lines(filename):
    with open(filename,"r") as f:
        return f.readlines()

In [4]:
data_dir = 'medical/200k_abstracts/'
train_data_lines=get_lines(data_dir+"train.txt")

In [5]:
def preprocess_data(filename):
    input_lines=get_lines(filename)
    #This will be used to separte the abstracts from  one another using String mets
    abstract_lines=""
    # Empty list of abstracts
    abstract_samples=[]
    for line in input_lines:
        # Check for a new abstract
        if line.startswith("###"):
            abstract_id=line
            # And since we are in a new abstract we will Reset the abstract_lines
            abstract_lines=""
        # Check for a new line \n escape seq
        elif line.isspace():
            # Split the Lines of the abstract and will return a list of one abstract
            abstract_line_split=abstract_lines.splitlines()
            # Now we have to iterate through this singular abstract
            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
                #  Enumerate() method adds a counter to an iterable and returns it in a form of enumerating object.
                # Create a empty Dict per line
                line_data={}
                # Split on the tab \t esc seq
                target_text_split=abstract_line.split("\t")
                # Get the Label of the sentence as the Label
                line_data["target"]=target_text_split[0]
                # Get the Text of the Lien as the Text Key
                line_data["text"]=target_text_split[1].lower()
                # Also adding the Line Nnumber as it will also aid the model
                line_data["line_number"]=abstract_line_number
                # Number of Lines in that particular abstract
                line_data["total_lines"]=len(abstract_line_split)-1
                # Now we have to append them to the absract_samples list
                abstract_samples.append(line_data)
        # So if both the cases are not there then the line is a labelled sentence
        else:
            abstract_lines+=line
    return abstract_samples

In [8]:
train_data_lines=get_lines(data_dir+"train.txt")
val_data_lines=get_lines(data_dir+"dev.txt")
test_data_lines=get_lines(data_dir+"test.txt")

In [15]:
target_list = ['OBJECTIVE','METHODS', 'BACKGROUND', 'CONCLUSIONS', 'RESULTS']
data_dir = 'medical/200k_abstracts/'
train_data_lines=get_lines(data_dir+"train.txt")

def extract_text_by_target(train_data_lines, target_list):
    reduced_list = []
    tmp = {}
    for i in range(len(train_data_lines)):
        line = train_data_lines[i]
        if line.startswith("###"):
            reduced_list.append(tmp)
            tmp = {}
            target = 'None'
            tmp['id'] = line.split('\n')[0]
        for t in target_list:
            if t in line:
                if t == target:
                    line = line.replace(target, ' ')
                    line = line.replace('\t', ' ')
                    tmp[target] += (' '.join(line.split('\n'))).strip()
                else:
                    target = t
                    line = line.replace(target, ' ')
                    line = line.replace('\t', ' ')
                    tmp[target] = (' '.join(line.split('\n'))).strip()
    reduced_list = reduced_list[1:]
    return reduced_list

def combine_text(reduced_list):
    for i in range(len(reduced_list)):
        combined_text = ''
        info = reduced_list[i]
        for key in info:
            if key != 'id':
                combined_text += info[key]
        reduced_list[i]['combined'] = combined_text
    return reduced_list

def clean_combine_text(reduced_list):
    for i in range(len(reduced_list)):
        combo_txt = reduced_list[i]['combined']
        txt = re.sub("[\(\[].*?[\)\]]", "", combo_txt)
        txt = re.sub("number\s+(NCT)\d+","", txt)
        txt = re.sub("(NCT)\d+", "", txt)
        txt = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', txt)
        txt = re.sub("clinicaltrials\.gov","", txt, flags=re.IGNORECASE)
        reduced_list[i]['combined_cleaned'] = txt.strip()
    return reduced_list

def extract_clean_text(train_data_path, target_list):
    train_data_lines=get_lines(train_data_path)
    reduced_list = extract_text_by_target(train_data_lines, target_list)
    reduced_list = combine_text(reduced_list)
    reduced_list = clean_combine_text(reduced_list)
    return reduced_list

In [13]:
#df = pd.DataFrame(reduced_list)

In [16]:
train_reduced = extract_clean_text(data_dir+"train.txt", target_list)
dev_reduced = extract_clean_text(data_dir+"dev.txt", target_list)
test_reduced = extract_clean_text(data_dir+"test.txt", target_list)

In [18]:
df = pd.DataFrame(train_reduced+dev_reduced+test_reduced)

In [20]:
df['len'] = df['combined_cleaned'].str.split().str.len()
df = df[df['len'] > 30]

In [21]:
df.len.describe()

count    195651.000000
mean        242.857598
std          62.447768
min          33.000000
25%         205.000000
50%         238.000000
75%         269.000000
max        1114.000000
Name: len, dtype: float64

In [22]:
keyword_list = ['legal', 'law']

def check_keywords(txt):
    kw = 0
    for word in keyword_list:
        if word in txt.lower():
            kw=1
            break
    return kw

In [23]:
df['contains_kw'] = df.combined.apply(check_keywords)

In [24]:
df.contains_kw.value_counts()

0    191939
1      3712
Name: contains_kw, dtype: int64

In [25]:
kw_df = df[df['contains_kw'] == 1]
kw_df.len.describe()

count    3712.000000
mean      236.740032
std        65.256375
min        57.000000
25%       195.000000
50%       231.000000
75%       267.000000
max       692.000000
Name: len, dtype: float64

In [26]:
kw_df

Unnamed: 0,id,BACKGROUND,METHODS,CONCLUSIONS,combined,combined_cleaned,RESULTS,OBJECTIVE,len,contains_kw
8,###8532270,,Forty-six pregnant women with first episodes o...,Suppressive acyclovir therapy reduced the need...,To determine if suppressive acyclovir therapy ...,To determine if suppressive acyclovir therapy ...,None of the 21 patients treated with acyclovir...,To determine if suppressive acyclovir therapy ...,237,1
14,###12551795,,A randomized trial of three techniques of coni...,There is no major difference in obstetrical ou...,To evaluate the long-term recurrence rates and...,To evaluate the long-term recurrence rates and...,Eighty-six patients were followed-up for more ...,To evaluate the long-term recurrence rates and...,176,1
25,###18366490,Misoprostol is the drug of choice for medical ...,A randomised comparative trial where 300 women...,Both sublingual and vaginal administrations of...,Misoprostol is the drug of choice for medical ...,Misoprostol is the drug of choice for medical ...,No statistically significant differences in th...,To compare the outcome of sublingual with vagi...,238,1
92,###20692774,,A prospective randomized trial conducted from ...,Our study allowed to confirm the benefits of a...,To assess the effects of ambulation during the...,To assess the effects of ambulation during the...,Upright position reduces significantly ( for a...,To assess the effects of ambulation during the...,187,1
122,###1735488,,"A randomized , prospective trial .Thirty-one c...","In hMG/hCG cycles , two IUIs timed as describe...",To compare a single periovulatory intrauterine...,To compare a single periovulatory intrauterine...,Clinical pregnancies developed in 2 of 23 cycl...,To compare a single periovulatory intrauterine...,148,1
...,...,...,...,...,...,...,...,...,...,...
195286,###24798072,,This is a prospective randomised clinical tria...,For women aged under 38 years with good progno...,To analyze the impact of the eSET followed by ...,To analyze the impact of the eSET followed by ...,"In the intention-to-treat analysis , the cumul...",To analyze the impact of the eSET followed by ...,201,1
195371,###23273888,,In a prospective study carried out at 13 sites...,Rifaximin at 25mg/5 days showed better therape...,To compare efficacy and tolerability between d...,To compare efficacy and tolerability between d...,"Among 114 women recruited , 103 were evaluable...",To compare efficacy and tolerability between d...,200,1
195556,###25411294,Prenatal calcium and iron supplements are reco...,This was a randomized crossover trial in pregn...,A pH-sensitive enteric coating substantially r...,Prenatal calcium and iron supplements are reco...,Prenatal calcium and iron supplements are reco...,Forty-nine participants with FCA for both EC a...,We aimed to establish in vivo evidence that en...,298,1
195582,###24499812,Netherlands Organisation for Health Research a...,"In this open-label , multicentre , internation...",In women with a tubal pregnancy and a healthy ...,Netherlands Organisation for Health Research a...,Netherlands Organisation for Health Research a...,446 women were randomly assigned between Sept ...,,351,1


In [63]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [66]:
%%time
print(summarizer(df.combined_cleaned[14998], max_length=130, min_length=50, do_sample=False))

[{'summary_text': 'Among patients with quiescent ulcerative colitis, lower fecal concentrations of calprotectin are associated with lower rates of relapse. We performed an open-label , randomized controlled trial to investigate whether increasing doses of mesalamine reduce concentrations of Calprotectin.'}]
CPU times: user 11.6 s, sys: 118 ms, total: 11.7 s
Wall time: 11.4 s


In [67]:
%%time
print(summarizer(df.combined_cleaned[14998], max_length=130, min_length=50, do_sample=True))

[{'summary_text': 'Among patients with quiescent ulcerative colitis, lower fecal concentrations of calprotectin are associated with lower rates of relapse. We performed an open-label , randomized controlled trial to investigate whether increasing doses of mesalamine reduce concentrations of fecal cal Protectin. The primary outcome was continued remission with FC < 50 g/g.'}]
CPU times: user 14.7 s, sys: 279 ms, total: 14.9 s
Wall time: 14.5 s


In [106]:
kw_df.combined_cleaned.loc[134]

'To test the application in practice of computerized fetal heart rate  analysis in pregnancy .Randomized distribution of subjects with computerized analysis automatically revealed or concealed .A district general hospital and a teaching hospital outside London .2869 pregnant women studied within a year .Quality and duration of the cardiotocogram ; quantitative measurement of FHR variation ; number of stillbirths .With interactive advice to the operator , records were of improved quality  with potentially much reduced recording time .The short-term FHR variation measured in the last records before intervention is reported for the first time .The benefits of using the computers include improvement in record quality and saving of time .In addition , where interpretation depended on estimation of FHR variation there was prima facie evidence of observer misinterpretation ; visual analysis was unreliable .A larger trial is now required with more rigorous constraints on intervention .'

In [109]:
%%time
print(summarizer(kw_df.combined_cleaned.loc[134], max_length=130, min_length=50, do_sample=False))

[{'summary_text': 'Computerized fetal heart rate analysis in pregnancy.Randomized distribution of subjects with computerized analysis automatically revealed or concealed.2869 pregnant women studied within a year .Quality and duration of the cardiotocogram. quantitative measurement of FHR variation. number of stillbirths.'}]
CPU times: user 8.78 s, sys: 80.3 ms, total: 8.86 s
Wall time: 8.73 s
