In [1]:
from pysradb import SRAdb, download_sradb_file
from pysradb.filter_attrs import guess_tissue_type, guess_cell_type, guess_strain_type

# Download SRAmetadb.sqlite

In [2]:
download_sradb_file()

Downloading SRAmetadb.sqlite.gz: 2.15GB [01:17, 29.6MB/s]                            


Extracting /panfs/cmb-panasas2/skchoudh/github_projects/pysradb/notebooks/SRAmetadb.sqlite.gz ...


Extracting SRAmetadb.sqlite.gz: 100%|██████████| 33.0G/33.0G [10:42<00:00, 55.2MB/s]


Done!
Metadata associated with /panfs/cmb-panasas2/skchoudh/github_projects/pysradb/notebooks/SRAmetadb.sqlite:
                 name                value
0      schema version                  1.0
1  creation timestamp  2018-12-07 00:39:29


In [2]:
db = SRAdb('SRAmetadb.sqlite')


# List tables

In [3]:
sra_tables = db.list_tables()
sra_tables

['metaInfo',
 'submission',
 'study',
 'sample',
 'experiment',
 'run',
 'sra',
 'sra_ft',
 'sra_ft_content',
 'sra_ft_segments',
 'sra_ft_segdir',
 'col_desc',
 'fastq']

# List fields in a table

In [4]:
db.list_fields('study')

['study_ID',
 'study_alias',
 'study_accession',
 'study_title',
 'study_type',
 'study_abstract',
 'broker_name',
 'center_name',
 'center_project_name',
 'study_description',
 'related_studies',
 'primary_study',
 'sra_link',
 'study_url_link',
 'xref_link',
 'study_entrez_link',
 'ddbj_link',
 'ena_link',
 'study_attribute',
 'submission_accession',
 'sradb_updated']

# Describe columns in a table

In [5]:
db.desc_table('study')

Unnamed: 0,cid,name,dtype,notnull,dflt_value,pk
0,0,study_ID,REAL,0,,0
1,1,study_alias,TEXT,0,,0
2,2,study_accession,TEXT,0,,0
3,3,study_title,TEXT,0,,0
4,4,study_type,TEXT,0,,0
5,5,study_abstract,TEXT,0,,0
6,6,broker_name,TEXT,0,,0
7,7,center_name,TEXT,0,,0
8,8,center_project_name,TEXT,0,,0
9,9,study_description,TEXT,0,,0


In [6]:
sorted(db.desc_table('sra_ft').name.tolist())

['SRR_bamFile',
 'SRX_bamFile',
 'SRX_fastqFTP',
 'adapter_spec',
 'anonymized_name',
 'base_caller',
 'bases',
 'center_project_name',
 'common_name',
 'description',
 'design_description',
 'experiment_ID',
 'experiment_accession',
 'experiment_alias',
 'experiment_attribute',
 'experiment_entrez_link',
 'experiment_name',
 'experiment_title',
 'experiment_url_link',
 'individual_name',
 'instrument_model',
 'instrument_name',
 'library_construction_protocol',
 'library_layout',
 'library_name',
 'library_selection',
 'library_source',
 'library_strategy',
 'multiplier',
 'number_of_levels',
 'platform',
 'platform_parameters',
 'primary_study',
 'qtype',
 'quality_scorer',
 'read_spec',
 'related_studies',
 'run_ID',
 'run_accession',
 'run_alias',
 'run_attribute',
 'run_center',
 'run_date',
 'run_entrez_link',
 'run_url_link',
 'sample_ID',
 'sample_accession',
 'sample_alias',
 'sample_attribute',
 'sample_entrez_link',
 'sample_name',
 'sample_url_link',
 'sequence_space',
 'sp

# Query

In [7]:
db.query('SELECT * FROM study LIMIT 3')

Unnamed: 0,broker_name,center_name,center_project_name,ddbj_link,ena_link,primary_study,related_studies,sra_link,sradb_updated,study_ID,...,study_accession,study_alias,study_attribute,study_description,study_entrez_link,study_title,study_type,study_url_link,submission_accession,xref_link
0,,KEIO,Bacillus subtilis subsp. natto BEST195,,,,,,2016-11-20 16:42:15,1.0,...,DRP000001,DRP000001,,,,Bacillus subtilis subsp. natto BEST195 genome ...,Whole Genome Sequencing,,DRA000001,pubmed: 20398357 || pubmed: 25329997
1,,KEIO,Bacillus subtilis subsp. subtilis str. 168,,,,,,2017-09-21 20:23:35,2.0,...,DRP000002,DRP000002,,Whole genome resequencing of B. subtilis subti...,,Model organism for prokaryotic cell differenti...,Whole Genome Sequencing,,DRA000002,pubmed: 20398357
2,,UT-MGS,Integrative Transcriptome Analysis,,,,,,2016-11-20 16:42:16,3.0,...,DRP000003,DRP000003,,Although recent studies have revealed that the...,,Comprehensive identification and characterizat...,Transcriptome Analysis,DBTSS: http://dbtss.hgc.jp/,DRA000003,pubmed: 20400770


# Number of rows in each table

In [8]:
db.all_row_counts()

Unnamed: 0,count
metaInfo,2
submission,1059814
study,170120
sample,4757292
experiment,5330360
run,6026441
sra,4100423
sra_ft,4100423
sra_ft_content,4100423
sra_ft_segments,2693865


# Which study types are the most prominent?

In [9]:
db.query('SELECT study_type AS StudyType, count(*) AS Number FROM "study" GROUP BY study_type order by Number DESC')

Unnamed: 0,Number,StudyType
0,53318,
1,45968,Whole Genome Sequencing
2,38518,Other
3,16149,Transcriptome Analysis
4,14415,Metagenomics
5,763,Population Genomics
6,607,Epigenetics
7,230,Exome Sequencing
8,110,Cancer Genomics
9,31,Pooled Clone Sequencing


# Which Sequencing Instruments are most common?

In [10]:
instruments = db.query('SELECT instrument_model AS "Instrument Model", count( * ) AS Experiments FROM "experiment" GROUP BY instrument_model order by Experiments DESC').sort_values(by=['Instrument Model'])
instruments

Unnamed: 0,Experiments,Instrument Model
21,8704,454 GS
32,986,454 GS 20
9,49413,454 GS FLX
50,10,454 GS FLX
4,148483,454 GS FLX Titanium
15,16702,454 GS FLX+
13,23956,454 GS Junior
53,1,AB 310 Genetic Analyzer
42,111,AB 3130 Genetic Analyzer
44,55,AB 3130xL Genetic Analyzer


# Which assays are most common?

In [11]:
assays = db.query('SELECT library_strategy AS "Library Strategy", count( * ) AS Runs FROM "experiment" GROUP BY library_strategy order by Runs DESC')
assays

Unnamed: 0,Library Strategy,Runs
0,,1984183
1,WGS,1092423
2,AMPLICON,690550
3,RNA-Seq,627669
4,OTHER,314145
5,WXS,237358
6,CLONE,89555
7,ChIP-Seq,82832
8,POOLCLONE,53738
9,Bisulfite-Seq,32335


# List SRA Run table

In [12]:
df = db.sra_metadata('SRP017942')
df

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP017942,SRX217028,GSM1063575: 293T_GFP; Homo sapiens; RNA-Seq,GEO Accession: GSM1063575,source_name: 293T cells || cell line: 293T cel...,SRR648667,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,1806641316,50184481,,36.0
1,SRP017942,SRX217029,GSM1063576: 293T_GFP_2hrs_severe_Heat_Shock; H...,GEO Accession: GSM1063576,source_name: 293T cells || cell line: 293T cel...,SRR648668,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3436984836,95471801,,36.0
2,SRP017942,SRX217030,GSM1063577: 293T_Hspa1a; Homo sapiens; RNA-Seq,GEO Accession: GSM1063577,source_name: 293T cells || cell line: 293T cel...,SRR648669,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3330909216,92525256,,36.0
3,SRP017942,SRX217031,GSM1063578: 293T_Hspa1a_2hrs_severe_Heat_Shock...,GEO Accession: GSM1063578,source_name: 293T cells || cell line: 293T cel...,SRR648670,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3622123512,100614542,,36.0
4,SRP017942,SRX217956,GSM794854: 3T3-Control-Riboseq; Mus musculus; ...,GEO Accession: GSM794854,source_name: 3T3 cells || treatment: control |...,SRR649752,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,594945396,16526261,,36.0
5,SRP017942,SRX217957,GSM794855: 3T3-8hrs-mild-HS-Riboseq; Mus muscu...,GEO Accession: GSM794855,source_name: 3T3 cells || treatment: mild heat...,SRR649753,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,730843272,20301202,,36.0
6,SRP017942,SRX217958,GSM794856: 3T3-2hrs-severe-HS-Riboseq; Mus mus...,GEO Accession: GSM794856,source_name: 3T3 cells || treatment: severe he...,SRR649754,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,1785538908,49598303,,36.0
7,SRP017942,SRX217027,GSM1063574: 3T3-Hsp70_inhibitor_3hr; Mus muscu...,GEO Accession: GSM1063574,source_name: 3T3 cells || cell line: 3T3 cells...,SRR648666,10090,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,5932552920,148313823,,40.0
8,SRP017942,SRX217959,GSM794857: 3T3-Control-Riboseq-rep2; Mus muscu...,GEO Accession: GSM794857,source_name: 3T3 cells || treatment: control |...,SRR649755,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3713863640,92846591,,40.0
9,SRP017942,SRX217960,GSM794858: 3T3-8hrs-mild-HS-Riboseq-rep2; Mus ...,GEO Accession: GSM794858,source_name: 3T3 cells || treatment: mild heat...,SRR649756,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3967599960,99189999,,40.0


In [13]:
df = db.sra_metadata('SRP002605')
df

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP002605,SRX021966,GSM546920: Footprint_mock_32hr_runs1-2,GEO Accession: GSM546920,source_name: HeLa || transfection: mock || tim...,SRR057511,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546920: Footprint_mock_32hr_runs1-2,173069316,4807481,,36.0
1,SRP002605,SRX021966,GSM546920: Footprint_mock_32hr_runs1-2,GEO Accession: GSM546920,source_name: HeLa || transfection: mock || tim...,SRR057512,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546920: Footprint_mock_32hr_runs1-2,670274352,18618732,,36.0
2,SRP002605,SRX021967,GSM546921: mRNASeq_mock_32hr_runs1-3,GEO Accession: GSM546921,source_name: HeLa || transfection: mock || tim...,SRR057513,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546921: mRNASeq_mock_32hr_runs1-3,159276276,4424341,,36.0
3,SRP002605,SRX021967,GSM546921: mRNASeq_mock_32hr_runs1-3,GEO Accession: GSM546921,source_name: HeLa || transfection: mock || tim...,SRR057515,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546921: mRNASeq_mock_32hr_runs1-3,92929104,2581364,,36.0
4,SRP002605,SRX021968,GSM546922: Footprint_miR-155_32hr_runs1-2,GEO Accession: GSM546922,source_name: HeLa || transfection: miR-155 dup...,SRR057516,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546922: Footprint_miR-155_32hr_runs1-2,212446440,5901290,,36.0
5,SRP002605,SRX021968,GSM546922: Footprint_miR-155_32hr_runs1-2,GEO Accession: GSM546922,source_name: HeLa || transfection: miR-155 dup...,SRR057517,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546922: Footprint_miR-155_32hr_runs1-2,522725688,14520158,,36.0
6,SRP002605,SRX021969,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GEO Accession: GSM546923,source_name: HeLa || transfection: miR-155 dup...,SRR057518,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546923: mRNASeq_miR-155_32hr_runs1-3,216374004,6010389,,36.0
7,SRP002605,SRX021969,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GEO Accession: GSM546923,source_name: HeLa || transfection: miR-155 dup...,SRR057519,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546923: mRNASeq_miR-155_32hr_runs1-3,207243972,5756777,,36.0
8,SRP002605,SRX021969,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GEO Accession: GSM546923,source_name: HeLa || transfection: miR-155 dup...,SRR057520,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546923: mRNASeq_miR-155_32hr_runs1-3,94523436,2625651,,36.0
9,SRP002605,SRX021970,GSM546924: Footprint_miR-1_32hr_runs1-2,GEO Accession: GSM546924,source_name: HeLa || transfection: miR-1 duple...,SRR057521,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546924: Footprint_miR-1_32hr_runs1-2,212789088,5910808,,36.0


In [14]:
df = db.sra_metadata('SRP098789')
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP098789,SRX2536403,"GSM2475997: 1.5 Ã‚ÂµM PF-067446846, 10 min, re...",GEO Accession: GSM2475997,source_name: Huh7_1.5 Ã‚ÂµM PF-067446846_10 mi...,SRR5227288,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2104142750,42082855,,50.0
1,SRP098789,SRX2536404,"GSM2475998: 1.5 Ã‚ÂµM PF-067446846, 10 min, re...",GEO Accession: GSM2475998,source_name: Huh7_1.5 Ã‚ÂµM PF-067446846_10 mi...,SRR5227289,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2082873050,41657461,,50.0
2,SRP098789,SRX2536405,"GSM2475999: 1.5 Ã‚ÂµM PF-067446846, 10 min, re...",GEO Accession: GSM2475999,source_name: Huh7_1.5 Ã‚ÂµM PF-067446846_10 mi...,SRR5227290,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2023148650,40462973,,50.0
3,SRP098789,SRX2536406,"GSM2476000: 0.3 Ã‚ÂµM PF-067446846, 10 min, re...",GEO Accession: GSM2476000,source_name: Huh7_0.3 Ã‚ÂµM PF-067446846_10 mi...,SRR5227291,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2057165950,41143319,,50.0
4,SRP098789,SRX2536407,"GSM2476001: 0.3 Ã‚ÂµM PF-067446846, 10 min, re...",GEO Accession: GSM2476001,source_name: Huh7_0.3 Ã‚ÂµM PF-067446846_10 mi...,SRR5227292,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,3027621850,60552437,,50.0


In [15]:
df = db.sra_metadata('SRP000941')
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP000941,SRX040606,Reference Epigenome: ChIP-Seq Input from hESC ...,EXPERIMENT_TYPE: ChIP-Seq Input || EXTRACTION_...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR097974,9606,RANDOM,SINGLE -,ChIP-Seq,GENOMIC,YL328,143248680,4774956,,30.0
1,SRP000941,SRX040606,Reference Epigenome: ChIP-Seq Input from hESC ...,EXPERIMENT_TYPE: ChIP-Seq Input || EXTRACTION_...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR097975,9606,RANDOM,SINGLE -,ChIP-Seq,GENOMIC,YL328,143602590,4786753,,30.0
2,SRP000941,SRX040606,Reference Epigenome: ChIP-Seq Input from hESC ...,EXPERIMENT_TYPE: ChIP-Seq Input || EXTRACTION_...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR097976,9606,RANDOM,SINGLE -,ChIP-Seq,GENOMIC,YL328,139682730,4656091,,30.0
3,SRP000941,SRX056710,Reference Epigenome: ChIP-Seq Analysis of H3K5...,EXPERIMENT_TYPE: Histone H3K56ac || EXTRACTION...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179694,9606,ChIP,SINGLE -,ChIP-Seq,GENOMIC,SK279,1100862480,36695416,,30.0
4,SRP000941,SRX056721,Reference Epigenome: ChIP-Seq Analysis of H3K2...,EXPERIMENT_TYPE: Histone H3K27ac || EXTRACTION...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179705,9606,ChIP,SINGLE -,ChIP-Seq,GENOMIC,SK265,639397380,21313246,,30.0


In [16]:
df_rna = df[df.library_strategy == 'RNA-Seq']
df_rna.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
672,SRP000941,SRX007166,Sequencing of small RNA from the H1 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020285,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_h1_r1,489650116,11387212,,43.0
673,SRP000941,SRX007168,Sequencing of small RNA from the IMR90 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020286,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_imr90_r1,388025507,9023849,,43.0
674,SRP000941,SRX007165,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020287,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_h1_r1,213042296,4954472,,43.0
675,SRP000941,SRX007167,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020292,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_imr90_r1,220029108,5116956,,43.0
711,SRP000941,SRX056684,Shotgun sequencing of polyA+ RNA isolated from...,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179588,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r1,2311289650,46225793,,50.0


In [17]:
db.download(df=df_rna.head(), out_dir='/staging/as/skchoudh/pysradb_downloads')

SRP000941/SRX056684/SRR179588: 100%|██████████| 5/5 [00:09<00:00,  1.89s/it]


Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length,download_url
672,SRP000941,SRX007166,Sequencing of small RNA from the H1 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020285,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_h1_r1,489650116,11387212,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
673,SRP000941,SRX007168,Sequencing of small RNA from the IMR90 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020286,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_imr90_r1,388025507,9023849,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
674,SRP000941,SRX007165,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020287,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_h1_r1,213042296,4954472,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
675,SRP000941,SRX007167,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020292,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_imr90_r1,220029108,5116956,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
711,SRP000941,SRX056684,Shotgun sequencing of polyA+ RNA isolated from...,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179588,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r1,2311289650,46225793,,50.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...


In [18]:
df = db.sra_metadata('SRP000941')
df = df[df.library_strategy == 'RNA-Seq']
df_paired = df[df.library_layout.str.contains('PAIRED')]
df_unpaired = df[df.library_layout.str.contains('SINGLE')]

In [19]:
df_paired.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
1886,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR1045522,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1106627800,5533139,,200.0
1887,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR1045523,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1897588600,9487943,,200.0
1888,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR1045524,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1901851200,9509256,,200.0
1889,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR1045525,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1891729200,9458646,,200.0
1890,SRP000941,SRX263862,polyA RNA sequencing of STL001 Gastric Cells,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR1045526,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001GA_r1a,1189486000,5947430,,200.0


In [20]:
df_unpaired.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
672,SRP000941,SRX007166,Sequencing of small RNA from the H1 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020285,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_h1_r1,489650116,11387212,,43.0
673,SRP000941,SRX007168,Sequencing of small RNA from the IMR90 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020286,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_imr90_r1,388025507,9023849,,43.0
674,SRP000941,SRX007165,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020287,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_h1_r1,213042296,4954472,,43.0
675,SRP000941,SRX007167,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020292,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_imr90_r1,220029108,5116956,,43.0
711,SRP000941,SRX056684,Shotgun sequencing of polyA+ RNA isolated from...,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179588,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r1,2311289650,46225793,,50.0


In [21]:
db.download(df=df_unpaired.head(), out_dir='/staging/as/skchoudh/SRA_datasets/')

SRP000941/SRX056684/SRR179588: 100%|██████████| 5/5 [00:09<00:00,  1.91s/it]


Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length,download_url
672,SRP000941,SRX007166,Sequencing of small RNA from the H1 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020285,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_h1_r1,489650116,11387212,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
673,SRP000941,SRX007168,Sequencing of small RNA from the IMR90 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020286,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_imr90_r1,388025507,9023849,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
674,SRP000941,SRX007165,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020287,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_h1_r1,213042296,4954472,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
675,SRP000941,SRX007167,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020292,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_imr90_r1,220029108,5116956,,43.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...
711,SRP000941,SRX056684,Shotgun sequencing of polyA+ RNA isolated from...,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179588,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r1,2311289650,46225793,,50.0,anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-in...


# Slooooow downloads through ftp

If you don't have aspera....

In [22]:
db.download(df=df_unpaired.head(), protocol='ftp', out_dir='/staging/as/skchoudh/SRA_datasets/')


                Consider using `fastp` after installing aspera-client.
SRP000941/SRX056684/SRR179588: 100%|██████████| 5/5 [00:52<00:00, 12.54s/it]


Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length,download_url
672,SRP000941,SRX007166,Sequencing of small RNA from the H1 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020285,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_h1_r1,489650116,11387212,,43.0,ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-insta...
673,SRP000941,SRX007168,Sequencing of small RNA from the IMR90 cell line,EXPERIMENT_TYPE: smRNA-Seq || EXTRACTION_PROTO...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020286,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_imr90_r1,388025507,9023849,,43.0,ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-insta...
674,SRP000941,SRX007165,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR020287,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_h1_r1,213042296,4954472,,43.0,ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-insta...
675,SRP000941,SRX007167,"Strand-specific, shotgun sequencing of mRNA fr...",EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,SEX: female || BIOMATERIAL_PROVIDER: ATCC || D...,SRR020292,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_imr90_r1,220029108,5116956,,43.0,ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-insta...
711,SRP000941,SRX056684,Shotgun sequencing of polyA+ RNA isolated from...,EXPERIMENT_TYPE: mRNA-Seq || EXTRACTION_PROTOC...,MOLECULE: genomic DNA || DISEASE: None || BIOM...,SRR179588,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r1,2311289650,46225793,,50.0,ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-insta...


In [23]:
df = db.sra_metadata('SRP043036')
df = df.sort_values(by='experiment_accession')


# Get Srains of species!

In [24]:
df['sample_attribute'].apply(lambda x: guess_strain_type(x)).tolist()

['by4741',
 'by4741',
 'by4741',
 'by4741',
 'by4741',
 'by4741',
 'by4741',
 'by4741',
 's288c',
 's288c',
 's288c',
 's288c']

In [25]:
df = db.sra_metadata('SRP016501')
df


Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP016501,SRX196389,GSM1020765: chicken_c_brain; Gallus gallus; RN...,GEO Accession: GSM1020765,source_name: chicken_brain || tissue: brain,SRR594518,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1896222720,27088896,,70.0
1,SRP016501,SRX196394,GSM1020770: chicken_c_lung; Gallus gallus; RNA...,GEO Accession: GSM1020770,source_name: chicken_lung || tissue: lung,SRR594523,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1862299600,26604280,,70.0
2,SRP016501,SRX196395,GSM1020771: chicken_c_skm; Gallus gallus; RNA-Seq,GEO Accession: GSM1020771,source_name: chicken_skm || tissue: skeletal m...,SRR594524,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1792450520,25606436,,70.0
3,SRP016501,SRX196381,GSM1020757: chicken_b_colon; Gallus gallus; RN...,GEO Accession: GSM1020757,source_name: chicken_colon || tissue: colon,SRR594510,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1703779272,23663601,,72.0
4,SRP016501,SRX196384,GSM1020760: chicken_b_liver; Gallus gallus; RN...,GEO Accession: GSM1020760,source_name: chicken_liver || tissue: liver,SRR594513,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1833743232,25468656,,72.0
5,SRP016501,SRX196387,GSM1020763: chicken_b_spleen; Gallus gallus; R...,GEO Accession: GSM1020763,source_name: chicken_spleen || tissue: spleen,SRR594516,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1752978672,24346926,,72.0
6,SRP016501,SRX196388,GSM1020764: chicken_b_testes; Gallus gallus; R...,GEO Accession: GSM1020764,source_name: chicken_testes || tissue: testes,SRR594517,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,841438656,11686648,,72.0
7,SRP016501,SRX196390,GSM1020766: chicken_c_colon; Gallus gallus; RN...,GEO Accession: GSM1020766,source_name: chicken_colon || tissue: colon,SRR594519,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1073294712,14906871,,72.0
8,SRP016501,SRX196392,GSM1020768: chicken_c_kidney; Gallus gallus; R...,GEO Accession: GSM1020768,source_name: chicken_kidney || tissue: kidney,SRR594521,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,2495525040,34660070,,72.0
9,SRP016501,SRX196393,GSM1020769: chicken_c_liver; Gallus gallus; RN...,GEO Accession: GSM1020769,source_name: chicken_liver || tissue: liver,SRR594522,9031,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,,1366420752,18978066,,72.0


# Guess cell/tissue type

In [26]:
df['sample_attribute'].apply(lambda x: guess_cell_type(x))

0       chicken_brain
1        chicken_lung
2         chicken_skm
3       chicken_colon
4       chicken_liver
5      chicken_spleen
6      chicken_testes
7       chicken_colon
8      chicken_kidney
9       chicken_liver
10     chicken_spleen
11      chicken_brain
12      chicken_heart
13     chicken_kidney
14       chicken_lung
15        chicken_skm
16      chicken_heart
17     chicken_testes
18      chicken_brain
19     chicken_kidney
20      chicken_colon
21      chicken_heart
22      chicken_liver
23       chicken_lung
24        chicken_skm
25     chicken_spleen
26     chicken_testes
27       rhesus_brain
28       rhesus_colon
29       rhesus_heart
            ...      
104         mouse_skm
105      mouse_spleen
106      mouse_testes
107         rat_liver
108         rat_heart
109           rat_skm
110         rat_brain
111         rat_colon
112         rat_heart
113        rat_kidney
114         rat_liver
115          rat_lung
116           rat_skm
117        rat_spleen
118       

In [27]:
df['sample_attribute'].apply(lambda x: guess_tissue_type(x))

0         brain
1          lung
2      skeletal
3         colon
4         liver
5        spleen
6        testes
7         colon
8        kidney
9         liver
10       spleen
11        brain
12        heart
13       kidney
14         lung
15     skeletal
16        heart
17       testes
18        brain
19       kidney
20        colon
21        heart
22        liver
23         lung
24     skeletal
25       spleen
26       testes
27        brain
28        colon
29        heart
         ...   
104    skeletal
105      spleen
106      testes
107       liver
108       heart
109    skeletal
110       brain
111       colon
112       heart
113      kidney
114       liver
115        lung
116    skeletal
117      spleen
118      testes
119       colon
120      kidney
121        lung
122      spleen
123      testes
124      kidney
125      spleen
126      testes
127       brain
128       brain
129       colon
130       heart
131       liver
132        lung
133    skeletal
Name: sample_attribute, 

# Search!

In [28]:
df = db.search_sra(search_str="breast cancer")

In [29]:
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots
0,DRP000030,DRX000030,MCF7 MeDIP sample replica #1 for paired-end read,,sample_name: DRS000030 || sample comment: MCF7...,DRR000093,9606.0,other,PAIRED - NOMINAL_SDEV: 20; NOMINAL_LENGTH: 250;,OTHER,GENOMIC,KU_MeDIP_MCF7_1,442174248.0,6141309.0
1,DRP000030,DRX000053,HMC18 Input sample,,sample_name: DRS000052 || sample comment: HMC1...,DRR000116,9606.0,other,SINGLE -,OTHER,GENOMIC,KU_MeDIP_HMC18_I,323397648.0,8983268.0
2,DRP000030,DRX000047,MCF7 MeDIP sample replica #3,,sample_name: DRS000046 || sample comment: MCF7...,DRR000110,9606.0,other,SINGLE -,OTHER,GENOMIC,KU_MeDIP_MCF7_3,194263380.0,5396205.0
3,DRP000030,DRX000037,MCF7 MeDIP sample replica #2,,sample_name: DRS000036 || sample comment: MCF7...,DRR000100,9606.0,other,SINGLE -,OTHER,GENOMIC,KU_MeDIP_MCF7_2,514207656.0,14283546.0
4,DRP000030,DRX000036,HMEC Input sample,,sample_name: DRS000035 || sample comment: HMEC...,DRR000099,9606.0,other,SINGLE -,OTHER,GENOMIC,KU_MeDIP_HMEC_I,163146024.0,4531834.0


In [30]:
df = db.search_sra(search_str='"ribosome profiling"')
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_attribute,sample_attribute,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots
0,DRP003075,DRX019536,Illumina Genome Analyzer IIx sequencing of SAM...,,sample_name: RP_Cm_WT1 || strain: BW25113 || l...,DRR021383,83333.0,other,SINGLE -,OTHER,TRANSCRIPTOMIC,GAII05_3,978776500.0,12234706.0
1,DRP003075,DRX019537,Illumina Genome Analyzer IIx sequencing of SAM...,,sample_name: RP_Cm_WT2 || strain: BW25113 || l...,DRR021384,83333.0,other,SINGLE -,OTHER,TRANSCRIPTOMIC,GAII05_4,894201700.0,11177521.0
2,DRP003075,DRX019538,Illumina Genome Analyzer IIx sequencing of SAM...,,sample_name: RP_Cm_WT3(Mnase) || strain: BW251...,DRR021385,83333.0,other,SINGLE -,OTHER,TRANSCRIPTOMIC,GAII05_5,931536700.0,11644209.0
3,DRP003075,DRX019540,Illumina Genome Analyzer IIx sequencing of SAM...,,sample_name: RP_tet_WT1 || strain: BW25113 || ...,DRR021387,83333.0,other,SINGLE -,OTHER,TRANSCRIPTOMIC,GAII07_4,2759399000.0,27593987.0
4,DRP003075,DRX019541,Illumina Genome Analyzer IIx sequencing of SAM...,,sample_name: RP_tet_WT2 || strain: BW25113 || ...,DRR021388,83333.0,other,SINGLE -,OTHER,TRANSCRIPTOMIC,GAII07_5,2386196000.0,23861965.0


# Search by experiment ID (SRX/GSM)

In [32]:
df = db.search_by_expt_id('SRX1254413')
df

Unnamed: 0,experiment_ID,bamFile,fastqFTP,experiment_alias,experiment_accession,broker_name,center_name,title,study_name,study_accession,...,qtype,sra_link,experiment_url_link,xref_link,experiment_entrez_link,ddbj_link,ena_link,experiment_attribute,submission_accession,sradb_updated
0,1953400.0,,,GSM1887643,SRX1254413,,,GSM1887643: ribosome profiling; Homo sapiens; ...,GSE73136,SRP063852,...,,,,gds: 301887643,,,,GEO Accession: GSM1887643,SRA299240,2016-11-19 15:06:50
