In [3]:
import pandas as pd 
import xml.etree.ElementTree as etree
from Bio import Entrez

In [5]:
# Set URL for XML file with all BioSamples associated with project
from urllib.request import urlopen
url = 'https://www.ncbi.nlm.nih.gov/portal/utils/file_backend.cgi?Db=biosample&HistoryId=NCID_1_125617756_130.14.18.48_5555_1587647699_834710852_0MetA0_S_HStore&QueryKey=8&Sort=&Filter=all&CompleteResultCount=683&Mode=file&View=fullxml&p$l=Email&portalSnapshot=%2Fprojects%2FBioSample%2Fbiosample%401.33&BaseUrl=&PortName=live&RootTag=BioSampleSet&FileName=&ContentType=xml'
xml_file = urlopen(url)

tree = etree.parse(xml_file)
root = tree.getroot()

In [6]:
# Append list of each all attributes for each sample within XML
all_samples = []
for child in root:
  samples = []
  for child in child:
    attribute_element = child.findall('Attribute')
    if attribute_element is not None:
      elements = []
      for element in attribute_element:
        individual = [element.items()[0][1], element.text]
        elements.append(individual)
      samples.append(elements)
  all_samples.append(samples)

In [7]:
# Replace all spaces with underscores to remove duplicates
for i in range(len(all_samples)):
  for num in range(len(all_samples[i][5])):
    all_samples[i][5][num][0] = all_samples[i][5][num][0].replace(' ', '_')

# Full list of Attributes across all sample 
features = []
for i in range(len(all_samples)):
  for x, y in all_samples[i][5]:
    features.append(x)
features = set(features)
len(features)

51

In [8]:
#Left join each sample by feature to create dataframe with all samples

df = pd.DataFrame(features, columns = ['features'])
df.set_index(['features'], inplace = True)
              
for i in range(len(all_samples)):
    temp = pd.DataFrame(all_samples[i][5], columns = ['features', str('Sample' + '_' + str(i))])
    temp.set_index(['features'], inplace = True)
    df = df.join(temp, how = 'left')
    if i % 100 == 0:
      print(str(i))

# Transpose so each row is a subject and each column is sample
df = df.T


0
100
200
300
400
500
600


In [9]:
df.head()

features,INSDC_status,fecal_date,study_center,INSDC_center_alias,experiment_title,physical_specimen_remaining,host_subject_id,year_diagnosed,samp_size,uc_extent,...,timepoint,Sample_Name,geo_loc_name,dna_extracted,sample_type,bmi,cd_location,longitude,ENA_checklist,body_product
Sample_0,public,23-Jun-09,LBNL,UCSDMI,Jansson_Lamendella_Crohns,False,1629:739,1966.0,".1,g",,...,2,ERS1464576,Sweden,True,stool,27.0,Ileal (L1),18.644,ERC000011,UBERON:feces
Sample_1,public,22-Apr-10,LBNL,UCSDMI,Jansson_Lamendella_Crohns,False,1629:1004,,".1,g",,...,2,ERS1464575,Sweden,True,stool,,,18.644,ERC000011,UBERON:feces
Sample_2,public,27-Dec-10,LBNL,UCSDMI,Jansson_Lamendella_Crohns,False,1629:908,,".1,g",,...,7,ERS1464574,Sweden,True,stool,22.0,,18.644,ERC000011,UBERON:feces
Sample_3,public,14-Jul-09,LBNL,UCSDMI,Jansson_Lamendella_Crohns,False,1629:1015,,".1,g",,...,2,ERS1464573,Sweden,True,stool,,,18.644,ERC000011,UBERON:feces
Sample_4,public,26-May-10,LBNL,UCSDMI,Jansson_Lamendella_Crohns,False,1629:1018,,".1,g",,...,2,ERS1464572,Sweden,True,stool,,,18.644,ERC000011,UBERON:feces


In [45]:
df.reset_index(inplace = True)
df.rename(columns = {'index' : 'samples'}, inplace= True)

In [4]:
# Build link to project on EBI and download sample data

base = 'https://www.ebi.ac.uk/ena/data/warehouse/filereport'
project = 'PRJEB18471'
result = 'read_run'
fields = 'study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,tax_id,scientific_name,instrument_model,library_layout,fastq_ftp'
download = 'txt'

url = base + '?accession=' + project + '&result=' + result + '&fields=' + fields + '&download=' + download

download_links = pd.read_csv(url, sep = '\t')

# Check download is correct
download_links.head()

Unnamed: 0,study_accession,sample_accession,secondary_sample_accession,experiment_accession,run_accession,tax_id,scientific_name,instrument_model,library_layout,fastq_ftp
0,PRJEB18471,SAMEA19334668,ERS1463894,ERX1815536,ERR1746339,408170,human gut metagenome,Illumina HiSeq 2000,SINGLE,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/009/ERR174...
1,PRJEB18471,SAMEA19335418,ERS1463895,ERX1815537,ERR1746340,408170,human gut metagenome,Illumina HiSeq 2000,SINGLE,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/000/ERR174...
2,PRJEB18471,SAMEA19336168,ERS1463896,ERX1815538,ERR1746341,408170,human gut metagenome,Illumina HiSeq 2000,SINGLE,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/001/ERR174...
3,PRJEB18471,SAMEA19336918,ERS1463897,ERX1815539,ERR1746342,408170,human gut metagenome,Illumina HiSeq 2000,SINGLE,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/002/ERR174...
4,PRJEB18471,SAMEA19337668,ERS1463898,ERX1815540,ERR1746343,408170,human gut metagenome,Illumina HiSeq 2000,SINGLE,ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/003/ERR174...


In [46]:
# Merge data parsed from XML with ftp links for sequencing data from EBI
df = df.merge(download_links, how = 'left', left_on = 'SRA_accession', right_on = 'secondary_sample_accession')