```
         Copyright Rein Halbersma 2020.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
```

# Re-implementing Marcel Wieting's [Code Review Stack Exchange post](https://codereview.stackexchange.com/questions/239521/performance-read-large-amount-of-xmls-and-load-into-single-csv)


In [1]:
import os

import pandas as pd
from tqdm import tqdm

import scrape.nct as nct

In [2]:
# https://clinicaltrials.gov/ct2/resources/download#DownloadAllData
nct_url = 'https://clinicaltrials.gov/AllPublicXML.zip'
file = nct_url.split('/')[-1]
path = file.split('.')[0]

In [3]:
nct.download_data(nct_url, file)    # 1.6 Gb on disk

100%|██████████| 1558/1558 [03:47&lt;00:00,  6.83it/s]


In [4]:
nct.unzip_data(file, path)  # 8.5 Gb on disk

In [5]:
xml_files = [
    os.path.join(dirpath, file)
    for dirpath, _, filenames in os.walk(path)
    for file in filenames
    if file.endswith('.xml')
]

In [6]:
df = pd.concat((
    nct.parse_xml(file)
    for file in tqdm(xml_files)
))

100%|██████████| 353491/353491 [16:50&lt;00:00, 349.81it/s]


In [7]:
df.info()   # 3.0 Gb in memory, 12.0 Gb on disk

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 103541358 entries, 0 to 139
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   id      int64 
 1   key     object
 2   value   object
dtypes: int64(1), object(2)
memory usage: 3.1+ GB


In [8]:
df.head()

Unnamed: 0,id,key,value
0,4271358,/clinical_study,\n
1,4271358,/clinical_study/comment()[1],This xml conforms to an XML Schema at:\n h...
2,4271358,/clinical_study/required_header,\n
3,4271358,/clinical_study/required_header/download_date,ClinicalTrials.gov processed this data on Octo...
4,4271358,/clinical_study/required_header/link_text,Link to the current ClinicalTrials.gov record.


In [9]:
df.tail()

Unnamed: 0,id,key,value
135,2808104,/clinical_study/intervention_browse/comment(),CAUTION: The following MeSH terms are assign...
136,2808104,/clinical_study/intervention_browse/mesh_term,Mazindol
137,2808104,/clinical_study/patient_data,\n
138,2808104,/clinical_study/patient_data/sharing_ipd,Undecided
139,2808104,/clinical_study/comment()[2],Results have not yet been posted for this stu...
