In [1]:
import re
from typing import List
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
eudract_no = '2013-000856-16'

In [3]:
base_url = f'https://www.clinicaltrialsregister.eu/ctr-search/trial/{eudract_no}/results'

In [4]:
raw_html = requests.get(base_url,verify = False)



In [5]:
soup = BeautifulSoup(raw_html.content,'lxml')

In [6]:
table_ids = ['trialInformationSection', 'subjectDispositionSection', 'baselineCharacteristicsSection',
             'endPointsSection', re.compile(r"^endPoint[0-9]+Section$"), 'adverseEventsSection', 'moreInformationSection']

### Functions and Attributes

In [7]:
subject_dispo_dict = {
    'Allocation method':'blinding_type','Blinding Used':'blinding','Arm Title':'arms_number',
    'Arm Description':'arm_description','Arm Type':'arm_type','Investigational medicinal product name':
    'drug_name','Pharmaceutical forms':'dosage_form','Routes of administration':'route'
},

In [8]:
def get_arm_titles(table_id = table_ids[1],text = 'Arm title') :
    temp = soup.find('table',{'id':table_id})
    a = re.findall(text,temp.text)
    return len(a)
num_of_arm_titles = get_arm_titles()

In [9]:
def get_dataframe(df:pd.DataFrame = None) :
    if df is not None :
        df['arms'] = list(range(1,num_of_arm_titles+1))
    else :
        df = pd.DataFrame({'arms':list(range(1,num_of_arm_titles+1))})
    return df.set_index('arms')

In [10]:
def get_rows(table_id=None,temp=None,skip=3) -> List[BeautifulSoup]:
    if temp == None :
        temp = soup.find('table',{'id':table_id})
        
    rows = temp.find_all('tr')
    rows = rows[skip:]
    return rows

In [11]:
def get_first_two_columns_and_their_td(tr:BeautifulSoup) :
    tds = tr.find_all('td')
    if len(tds) < 2 :
        a = tds[0].text.strip()
        return a,None,tds[0],None,len(tds)
    a,b = tds[0],tds[1]
    a,b = a.text.strip(),b.text.strip()
    return a,b,tds[0],tds[1],len(tds)

In [12]:
def get_embedded_table(table_id=None,many=False,temp=None) :
    
    if temp == None :
        temp = soup.find('table',{'id':table_id})
    
    li = temp.find_all('td',{'class':'embeddedTableContainer'})
    ans = []
    #print(len(li))
    for i in range(len(li)) :
        df = pd.read_html(li[i].prettify(),flavor='bs4')[0]
        df = df.set_index(0).T

        if len(df.index)>num_of_arm_titles :
            df = df.drop(index=num_of_arm_titles+1)
        df = get_dataframe(df)
        ans.append(df)
    if many :
        return ans
    return ans[0]

In [13]:
def get_main_table(rows,break_text='') :
    drug=1
    flag = False
    df = get_dataframe()
    for i in rows :
        a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(i)
        if l_td>2 :
            break
        if a == '' :
            continue
        if b == '':
            continue

        if a == break_text :
            if flag :
                drug+=1
            else :
                flag = True
                
        if a not in df :
            df[a]= np.NaN
        df.loc[drug,a] = b
    return df

## 0 - Trial Information Table

In [14]:
trial_info_dict = {'EudraCT number':'EudraCT number','Sponsor organisation name':'Sponser',
'Main objective of the trial':'objective','Date of interim/final analysis':'Study_start_date'
,'Global end of trial date':'Study_end_date','Worldwide total number of subjects':'Total Subject'}

In [15]:
indx = 0
trial_info_rows = get_rows(table_ids[indx])
final_trial_info = get_dataframe()

In [16]:
for i in range(len(trial_info_rows)) :
    a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(trial_info_rows[i])
    if a == '':
        continue
    if a == 'Additional study identifiers' :
        li = []
        i+=1
        while(i<len(trial_info_rows)) :
            
            a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(trial_info_rows[i])
            if 'header' in a_td.attrs['class'] :
                break
            li.append(b)
            i+=1
        final_trial_info['Additional study identifiers'] = ', '.join(li)
    if a == 'Population of trial subjects' :
        li,s = [],0
        i+=1
        while(i<len(trial_info_rows)) :
            
            a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(trial_info_rows[i])
            if 'header' in a_td.attrs['class'] or 'Worldwide' in a:
                break
            contr_pop = b.split(':')
            if len(contr_pop) == 2 :
                li.append(contr_pop[0].strip())
                s+= int(contr_pop[1].strip())
            i+=1
        final_trial_info['countries'] =  ', '.join(li)
        final_trial_info['Sites'] = s
        
    if a == 'Number of subjects enrolled per age group' :
        s,s_more_than65 = 0,0
        flag = False
        i+=1
        while(i<len(trial_info_rows)) :
            a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(trial_info_rows[i])
            s+= int(b)
            if a == 'From 65 to 84 years' :
                flag = True
            if flag :
                s_more_than65+=int(b)
            i+=1
            
        final_trial_info['Age'] = s
        final_trial_info['Age_more_than_65'] = s_more_than65
    if a in trial_info_dict :
        final_trial_info[trial_info_dict[a]] = b

In [17]:
final_trial_info
#final_trial_info.to_excel(f'{eudract_no}final_trial_info.xlsx')

Unnamed: 0_level_0,Additional study identifiers,Sponser,Study_start_date,Study_end_date,objective,countries,Sites,Total Subject,Age,Age_more_than_65
arms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"-, NCT01854047, U1111-1138-3962",Sanofi aventis recherche & développement,04 May 2015,08 Apr 2015,To evaluate the efficacy of different doses an...,"South Africa, Spain, Turkey, Ukraine, United S...",776,776,776,74
2,"-, NCT01854047, U1111-1138-3962",Sanofi aventis recherche & développement,04 May 2015,08 Apr 2015,To evaluate the efficacy of different doses an...,"South Africa, Spain, Turkey, Ukraine, United S...",776,776,776,74
3,"-, NCT01854047, U1111-1138-3962",Sanofi aventis recherche & développement,04 May 2015,08 Apr 2015,To evaluate the efficacy of different doses an...,"South Africa, Spain, Turkey, Ukraine, United S...",776,776,776,74
4,"-, NCT01854047, U1111-1138-3962",Sanofi aventis recherche & développement,04 May 2015,08 Apr 2015,To evaluate the efficacy of different doses an...,"South Africa, Spain, Turkey, Ukraine, United S...",776,776,776,74
5,"-, NCT01854047, U1111-1138-3962",Sanofi aventis recherche & développement,04 May 2015,08 Apr 2015,To evaluate the efficacy of different doses an...,"South Africa, Spain, Turkey, Ukraine, United S...",776,776,776,74


## 1-Subject Disposition Table

In [18]:
subject_dispo_dict = {
    'Allocation method':'blinding_type','Blinding used':'blinding','Arm title':'arms_number',
    'Arm description':'arm_description','Arm type':'arm_type','Investigational medicinal product name':
    'drug_name','Pharmaceutical forms':'dosage_form','Routes of administration':'route'
}

In [19]:
indx = 1
subject_dispo_rows = get_rows(table_ids[indx])
final_sub_dispo = get_dataframe()

In [20]:
drug = 1
flag = False
for i in subject_dispo_rows :
    a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(i)
    #print(a,b)
    if a in subject_dispo_dict :
        if subject_dispo_dict[a] not in final_sub_dispo:
            final_sub_dispo[subject_dispo_dict[a]] = np.NaN
        if a == 'Arm title' :
            if flag:
                drug+=1
            else :
                flag = True
        #print(a,b) 
        final_sub_dispo.loc[drug,subject_dispo_dict[a]] = b
        if subject_dispo_dict[a] == 'blinding_type' :
            if 'Randomised' in final_sub_dispo[subject_dispo_dict[a]][drug]:
                final_sub_dispo['randomization'] = 'Yes'
            else :
                final_sub_dispo['randomization'] = 'No'            

In [21]:
final_sub_dispo.fillna(axis=0,method='ffill',inplace=True)
final_sub_dispo

Unnamed: 0_level_0,blinding_type,randomization,blinding,arms_number,arm_description,arm_type,drug_name,dosage_form,route
arms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Randomised - controlled,Yes,Double blind,Placebo q2w,2 subcutaneous injections of Placebo (for Dupi...,Placebo,Placebo (for Dupilumab),Solution for injection,Subcutaneous use
2,Randomised - controlled,Yes,Double blind,Dupilumab 300 mg q2w,2 subcutaneous injections of Dupilumab 300 mg ...,Experimental,Dupilumab,Solution for injection,Subcutaneous use
3,Randomised - controlled,Yes,Double blind,Dupilumab 200 mg q2w,2 subcutaneous injections of Dupilumab 200 mg ...,Experimental,Dupilumab,Solution for injection,Subcutaneous use
4,Randomised - controlled,Yes,Double blind,Dupilumab 300 mg q4w,2 subcutaneous injections of Dupilumab 300 mg ...,Experimental,Placebo (for Dupilumab),Solution for injection,Subcutaneous use
5,Randomised - controlled,Yes,Double blind,Dupilumab 200 mg q4w,2 subcutaneous injections of Dupilumab 200 mg ...,Experimental,Placebo (for Dupilumab),Solution for injection,Subcutaneous use


In [22]:
#final_sub_dispo.to_excel(f'{eudract_no}final_sub_dispo.xlsx')

## 2- Baseline Characterstics

In [23]:
indx = 2
baseline_char_rows = get_rows(table_ids[indx])
final_baseline_char = get_main_table(baseline_char_rows,'Reporting group title')

In [24]:
em = get_embedded_table(table_ids[indx])

In [25]:
final_baseline_char = pd.concat([final_baseline_char,em],axis=1)
final_baseline_char

Unnamed: 0_level_0,Reporting group title,Reporting group description,Reporting group values,Number of subjects,Age categorical,Units: Subjects,Age continuous,Units: years,arithmetic mean (standard deviation),Gender categorical,Units: Subjects,Female,Male,Number of Subjects with Blood Eosinphil Count,Units: Subjects,<0.3 Giga/L,≥ 0.3 Giga/L
arms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Placebo q2w,2 subcutaneous injections of Placebo (for Dupi...,Placebo q2w,158,,,,,49 ± 12.7,,,104,54,,,90,68
2,Dupilumab 300 mg q2w,2 subcutaneous injections of Dupilumab 300 mg ...,Dupilumab 300 mg q2w,157,,,,,47.5 ± 12.4,,,103,54,,,93,64
3,Dupilumab 200 mg q2w,2 subcutaneous injections of Dupilumab 200 mg ...,Dupilumab 200 mg q2w,150,,,,,51 ± 13.4,,,96,54,,,85,65
4,Dupilumab 300 mg q4w,2 subcutaneous injections of Dupilumab 300 mg ...,Dupilumab 300 mg q4w,157,,,,,47.9 ± 13.1,,,100,57,,,91,66
5,Dupilumab 200 mg q4w,2 subcutaneous injections of Dupilumab 200 mg ...,Dupilumab 200 mg q4w,154,,,,,47.9 ± 13.1,,,87,67,,,92,62


In [26]:
arms_dict = final_baseline_char['Reporting group title'].to_dict()
arms_dict = {v:k for k,v in arms_dict.items()}
arms_dict

{'Placebo q2w': 1,
 'Dupilumab 300 mg q2w': 2,
 'Dupilumab 200 mg q2w': 3,
 'Dupilumab 300 mg q4w': 4,
 'Dupilumab 200 mg q4w': 5}

In [27]:
#final_baseline_char.to_excel(f'{eudract_no}final_baseline_char.xlsx')

## 5- Adverse Events

In [28]:
ae_list = ['arms','type_of_ae','e_reference','endpoint','total_n','affected_n','occurence_n','occurence_related(sae)','deaths(sae)']

In [29]:
indx = 5
final_ae = pd.DataFrame()
for i in ae_list :
    final_ae[i] = np.NaN

In [30]:
ae_li = get_embedded_table(table_ids[indx],many=True)

In [31]:
def get_final_ae(type_of_ae,temp_df,cols) :
    i = 1
    while(i<len(cols)) :
        sr:pd.Series = temp_df.iloc[i]
        if sr.isnull().all() :
            if temp_df.iloc[i+1].isnull().all() :
                i+=1
                continue
            if i != 1 :
                #print(mat)
                for j in mat :
                    final_ae.loc[len(final_ae.index)] = j
            mat = np.array([[np.NaN]*len(ae_list)]*num_of_arm_titles,dtype=np.object0)
            mat[:,1] = type_of_ae
            mat[:,3] = sr.name 
            mat[:,0] = list(range(1,num_of_arm_titles+1))
        else :
            if sr.name == 'subjects affected / exposed' :
                mat[:,5] = pd.to_numeric(sr.str.split('/',1,expand=True)[0].str.strip(),errors='coerce')
                mat[:,4] = pd.to_numeric(sr.str.split('/',1,expand=True)[1].str.split('(',1,expand=True)[0].str.strip(),errors='coerce')
            if sr.name == 'occurrences all number' :
                mat[:,6] = pd.to_numeric(sr.str.strip(),errors='coerce')
            if sr.name == 'occurrences causally related to treatment / all' :
                mat[:,7] = sr.str.strip()
            if sr.name == 'deaths causally related to treatment / all' :
                mat[:,8] = sr.str.strip()
        i+=1 

In [32]:
get_final_ae('sae',ae_li[0].T,ae_li[0].columns)
get_final_ae('nsae',ae_li[1].T,ae_li[1].columns)

In [33]:
ae_rows = get_rows(table_id=table_ids[5])
v = ''
for i in ae_rows :
    a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(i)
    if a == 'Dictionary name' :
        v = v+b
    elif a == 'Dictionary version' :
        v = v+b
        break
final_ae['e_reference'] = v

In [34]:
#final_ae.to_excel(f'{eudract_no}final_ae.xlsx')
final_ae

Unnamed: 0,arms,type_of_ae,e_reference,endpoint,total_n,affected_n,occurence_n,occurence_related(sae),deaths(sae)
0,1.0,sae,MedDRA18.0,Total subjects affected by serious adverse events,158.0,9.0,,,
1,2.0,sae,MedDRA18.0,Total subjects affected by serious adverse events,156.0,13.0,,,
2,3.0,sae,MedDRA18.0,Total subjects affected by serious adverse events,148.0,10.0,,,
3,4.0,sae,MedDRA18.0,Total subjects affected by serious adverse events,157.0,16.0,,,
4,5.0,sae,MedDRA18.0,Total subjects affected by serious adverse events,150.0,6.0,,,
...,...,...,...,...,...,...,...,...,...
295,1.0,nsae,MedDRA18.0,Sinusitis,158.0,11.0,14.0,,
296,2.0,nsae,MedDRA18.0,Sinusitis,156.0,6.0,7.0,,
297,3.0,nsae,MedDRA18.0,Sinusitis,148.0,5.0,7.0,,
298,4.0,nsae,MedDRA18.0,Sinusitis,157.0,13.0,14.0,,


## 4 -Endpoints 

In [35]:
indx = 4
endpt_html = soup.find_all('table',{'id':table_ids[indx]})
final_endpoint = get_dataframe()
main_li: List[pd.DataFrame] = []
em_li = []
endpoint_dict = {'End point title':'endpoint_title','End point description':'endpoint_description','End point type':'ednpoint_type','End point timeframe':'endpoint_time',
'Number of subjects analysed':'n_analyzed','Units':'unit','Baseline':'baseline','Week':'values','Change from baseline':'cfb','Statistical analysis description':'statistical_analysis_description',
'Comparison groups':'comparison_groups','Analysis type':'analysis_type','P-value':'p_value','Method':'pvalue_method',
'Parameter type':'comparison_type','Point estimate':'comparison','level':'ci_level','sides':'sides','lower limit':'lower_limit','upper limit':'upper_limit'
}

In [36]:
for i in endpt_html :
    flag = False
    i_rows = get_rows(temp=i)
    main_li.append(get_main_table(i_rows))
    em_li.append(get_embedded_table(temp = i))
    main_li[-1].fillna(axis=0,method='ffill',inplace=True)
    for j in i_rows :
        a,b,a_td,b_td,l_td = get_first_two_columns_and_their_td(j)
        if a == 'Statistical analysis title' :
            flag = True
            drug = arms_dict[b.split('vs')[0].strip()]
            #print(a,drug)
        else :
            if flag :
                if l_td == 1 :
                    sdf,b = a.split(' ',1)
                    a = sdf.strip()
                    b = b.strip()
                if a not in em_li[-1] :
                    em_li[-1][a] = np.NaN
                em_li[-1].loc[drug,a] = b.replace('\n', ' ').replace('\r', '')
            else :
                continue
    main_li[-1] = pd.concat([main_li[-1],em_li[-1]],axis=1)   

In [37]:
for df in main_li :
    cols = df.columns
    for i in cols :
        if 'Units' in i :
            df[i] = i.split(':',1)[1].strip()
            df.rename(columns={i:'unit'},inplace=True)
#         if 'Week' in i :
#             df.rename(columns={i:'week'},inplace=True)
#         if 'Change from baseline' in i :
#             df.rename(columns={i:'cfb'},inplace=True)
#         if 'Baseline' in i :
#             df.rename(columns={i:'baseline'},inplace=True)
    df.rename(columns=endpoint_dict,inplace=True)
#     df.reset_index(inplace=True)
    df.dropna(axis=1,how='all',inplace=True)

In [38]:
final_endpoint = pd.concat(main_li,axis=0)

In [39]:
final_endpoint.to_excel(f'{eudract_no}final_endpoint.xlsx')
final_endpoint

Unnamed: 0_level_0,endpoint_title,endpoint_description,ednpoint_type,endpoint_time,End point values,n_analyzed,unit,"Baseline (n=158,157,150,157,154)","Week 12 (n=129,146,136,134,134)","Change from baseline (n=129,146,136,134,134)",...,"HEos ITT Population at Week 12 (n=68,64,64,66,59)","HEos ITT Population at Week 24 (n=68,64,64,66,59)","ITT Population at Week 12 (n=158,156,148,157,150)","ITT Population at Week 24 (n=158,156,148,157,150)","ITT Population (n=143,148,138,148,140)","HEos ITT Population (n=62,59,58,61,56)","ITT Population (n=143,148,136,148,140)","Baseline (n=90,93,85,91,92)","Week 12 (n=71,87,79,79,81)","Change from Baseline (n=71,87,79,79,81)"
arms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Change From Baseline in Forced Expiratory Volu...,FEV1 was the volume of air exhaled in the firs...,Primary,Baseline to Week 12,Placebo q2w,158,Liter,1.82 ± 0.55,2.01 ± 0.69,0.13 ± 0.37,...,,,,,,,,,,
2,Change From Baseline in Forced Expiratory Volu...,FEV1 was the volume of air exhaled in the firs...,Primary,Baseline to Week 12,Dupilumab 300 mg q2w,157,Liter,1.85 ± 0.53,2.12 ± 0.59,0.26 ± 0.39,...,,,,,,,,,,
3,Change From Baseline in Forced Expiratory Volu...,FEV1 was the volume of air exhaled in the firs...,Primary,Baseline to Week 12,Dupilumab 200 mg q2w,150,Liter,1.79 ± 0.52,2.12 ± 0.68,0.32 ± 0.38,...,,,,,,,,,,
4,Change From Baseline in Forced Expiratory Volu...,FEV1 was the volume of air exhaled in the firs...,Primary,Baseline to Week 12,Dupilumab 300 mg q4w,157,Liter,1.86 ± 0.57,2.14 ± 0.69,0.24 ± 0.4,...,,,,,,,,,,
5,Change From Baseline in Forced Expiratory Volu...,FEV1 was the volume of air exhaled in the firs...,Primary,Baseline to Week 12,Dupilumab 200 mg q4w,154,Liter,1.88 ± 0.54,2.07 ± 0.63,0.2 ± 0.41,...,,,,,,,,,,
1,Change From Baseline in FEV1 at Week 12 - ITT ...,Analysis was performed on HEos ITT population ...,Primary,Baseline to Week 12,Placebo q2w,68,Liter,,,,...,,,,,,,,,,
2,Change From Baseline in FEV1 at Week 12 - ITT ...,Analysis was performed on HEos ITT population ...,Primary,Baseline to Week 12,Dupilumab 300 mg q2w,64,Liter,,,,...,,,,,,,,,,
3,Change From Baseline in FEV1 at Week 12 - ITT ...,Analysis was performed on HEos ITT population ...,Primary,Baseline to Week 12,Dupilumab 200 mg q2w,65,Liter,,,,...,,,,,,,,,,
4,Change From Baseline in FEV1 at Week 12 - ITT ...,Analysis was performed on HEos ITT population ...,Primary,Baseline to Week 12,Dupilumab 300 mg q4w,66,Liter,,,,...,,,,,,,,,,
5,Change From Baseline in FEV1 at Week 12 - ITT ...,Analysis was performed on HEos ITT population ...,Primary,Baseline to Week 12,Dupilumab 200 mg q4w,62,Liter,,,,...,,,,,,,,,,
