In [1]:
# Populate the interactive namespace from numpy and matplotlib and draw plots inline.
# NOTE(review): %pylab behaves like a star-import; explicit imports would make it
# clearer where names come from — confirm nothing relies on the injected names first.
%pylab inline
# Load the autoreload extension; mode 2 picks up edits to imported modules
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import fnmatch
import errno    
import glob
from riboraptor.sradb import SRAdb

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)


In [2]:
# Open the SRAmetadb SQLite file. The SRAMETADB_PATH environment variable
# overrides the default location, so the notebook can be re-run on machines
# where the database is not under the original staging directory.
db = SRAdb(os.environ.get('SRAMETADB_PATH', '/staging/as/skchoudh/SRAmetadb.sqlite'))


# List tables

In [3]:
# Names of the tables available in the database; kept in `sra_tables` and displayed.
sra_tables = db.list_tables()
sra_tables

['metaInfo',
 'submission',
 'study',
 'sample',
 'experiment',
 'run',
 'sra',
 'sra_ft',
 'sra_ft_content',
 'sra_ft_segments',
 'sra_ft_segdir',
 'col_desc',
 'fastq']

# List fields in a table

In [4]:
# Column names of the `study` table.
db.list_fields('study')

['study_ID',
 'study_alias',
 'study_accession',
 'study_title',
 'study_type',
 'study_abstract',
 'broker_name',
 'center_name',
 'center_project_name',
 'study_description',
 'related_studies',
 'primary_study',
 'sra_link',
 'study_url_link',
 'xref_link',
 'study_entrez_link',
 'ddbj_link',
 'ena_link',
 'study_attribute',
 'submission_accession',
 'sradb_updated']

# Describe columns in a table

In [5]:
# Per-column description of the `study` table; the output lists cid, name, dtype,
# notnull, dflt_value and pk for each column.
db.desc_table('study')

Unnamed: 0,cid,name,dtype,notnull,dflt_value,pk
0,0,study_ID,REAL,0,,0
1,1,study_alias,TEXT,0,,0
2,2,study_accession,TEXT,0,,0
3,3,study_title,TEXT,0,,0
4,4,study_type,TEXT,0,,0
5,5,study_abstract,TEXT,0,,0
6,6,broker_name,TEXT,0,,0
7,7,center_name,TEXT,0,,0
8,8,center_project_name,TEXT,0,,0
9,9,study_description,TEXT,0,,0


In [8]:
# Alphabetically sorted column names of the `sra_ft` table.
sorted(db.desc_table('sra_ft')['name'])

['SRR_bamFile',
 'SRX_bamFile',
 'SRX_fastqFTP',
 'adapter_spec',
 'anonymized_name',
 'base_caller',
 'bases',
 'center_project_name',
 'common_name',
 'description',
 'design_description',
 'experiment_ID',
 'experiment_accession',
 'experiment_alias',
 'experiment_attribute',
 'experiment_entrez_link',
 'experiment_name',
 'experiment_title',
 'experiment_url_link',
 'individual_name',
 'instrument_model',
 'instrument_name',
 'library_construction_protocol',
 'library_layout',
 'library_name',
 'library_selection',
 'library_source',
 'library_strategy',
 'multiplier',
 'number_of_levels',
 'platform',
 'platform_parameters',
 'primary_study',
 'qtype',
 'quality_scorer',
 'read_spec',
 'related_studies',
 'run_ID',
 'run_accession',
 'run_alias',
 'run_attribute',
 'run_center',
 'run_date',
 'run_entrez_link',
 'run_url_link',
 'sample_ID',
 'sample_accession',
 'sample_alias',
 'sample_attribute',
 'sample_entrez_link',
 'sample_name',
 'sample_url_link',
 'sequence_space',
 'sp

# Query

In [6]:
# Run a raw SQL statement: here, the first three rows of the `study` table.
db.get_query('SELECT * FROM study LIMIT 3')

Unnamed: 0,broker_name,center_name,center_project_name,ddbj_link,ena_link,primary_study,related_studies,sra_link,sradb_updated,study_ID,...,study_accession,study_alias,study_attribute,study_description,study_entrez_link,study_title,study_type,study_url_link,submission_accession,xref_link
0,,KEIO,Bacillus subtilis subsp. natto BEST195,,,,,,2016-11-20 16:42:15,1.0,...,DRP000001,DRP000001,,,,Bacillus subtilis subsp. natto BEST195 genome ...,Whole Genome Sequencing,,DRA000001,pubmed: 20398357 || pubmed: 25329997
1,,KEIO,Bacillus subtilis subsp. subtilis str. 168,,,,,,2017-09-21 20:23:35,2.0,...,DRP000002,DRP000002,,Whole genome resequencing of B. subtilis subti...,,Model organism for prokaryotic cell differenti...,Whole Genome Sequencing,,DRA000002,pubmed: 20398357
2,,UT-MGS,Integrative Transcriptome Analysis,,,,,,2016-11-20 16:42:16,3.0,...,DRP000003,DRP000003,,Although recent studies have revealed that the...,,Comprehensive identification and characterizat...,Transcriptome Analysis,DBTSS: http://dbtss.hgc.jp/,DRA000003,pubmed: 20400770


# Number of rows in each table

In [7]:
# Row count per table (the output has one row per table and a `count` column).
db.get_table_counts()

Unnamed: 0,count
submission,961295
sra_ft,5242098
sra_ft_segments,2964125
sample,4199180
experiment,4708115
sra_ft_segdir,984
fastq,5252155
sra_ft_content,5242098
study,155648
metaInfo,2


# Which study types are the most prominent?

In [8]:
# Number of studies per study_type, largest group first.
db.get_query(
    'SELECT study_type AS StudyType, count(*) AS Number '
    'FROM "study" '
    'GROUP BY study_type '
    'order by Number DESC'
)

Unnamed: 0,Number,StudyType
0,45967,Whole Genome Sequencing
1,38853,
2,38512,Other
3,16149,Transcriptome Analysis
4,14415,Metagenomics
5,763,Population Genomics
6,607,Epigenetics
7,230,Exome Sequencing
8,110,Cancer Genomics
9,31,Pooled Clone Sequencing


# Which Sequencing Instruments are most common?

In [9]:
# Experiments per instrument model. The query orders rows by count; the frame
# is then re-sorted by model name, which is why the row labels in the output
# are not consecutive. Later cells look rows up by those labels.
instruments = db.get_query(
    'SELECT instrument_model AS "Instrument Model", count( * ) AS Experiments '
    'FROM "experiment" '
    'GROUP BY instrument_model '
    'order by Experiments DESC'
).sort_values(by='Instrument Model')
instruments

Unnamed: 0,Experiments,Instrument Model
21,8704,454 GS
32,986,454 GS 20
9,49413,454 GS FLX
50,10,454 GS FLX
4,148483,454 GS FLX Titanium
15,16702,454 GS FLX+
13,23956,454 GS Junior
53,1,AB 310 Genetic Analyzer
42,111,AB 3130 Genetic Analyzer
44,55,AB 3130xL Genetic Analyzer


In [10]:
# Model name stored at row label 9 (label-based lookup, not position).
instruments.loc[9, 'Instrument Model']

'454 GS FLX'

In [11]:
# Total number of instrument-model rows.
instruments['Instrument Model'].size

55

In [12]:
# Number of distinct instrument-model strings (a missing value counts as one).
instruments['Instrument Model'].nunique(dropna=False)

55

In [13]:
# Row label 9 again, for comparison with row label 50 in the next cell.
instruments.loc[9, 'Instrument Model']

'454 GS FLX'

In [14]:
# Row label 50: the output shows the same model name but with a trailing space,
# which is why it is counted as a separate instrument model.
instruments.loc[50, 'Instrument Model']

'454 GS FLX '

# Which assays are most common?

In [15]:
# Row count per library strategy, largest group first.
# NOTE(review): the count is aliased as "Runs" but the rows come from the
# "experiment" table, so these look like experiment counts — confirm the label.
assays = db.get_query(
    'SELECT library_strategy AS "Library Strategy", count( * ) AS Runs '
    'FROM "experiment" '
    'GROUP BY library_strategy '
    'order by Runs DESC'
)
assays

Unnamed: 0,Library Strategy,Runs
0,,1362293
1,WGS,1092229
2,AMPLICON,690441
3,RNA-Seq,627617
4,OTHER,314145
5,WXS,237358
6,CLONE,89555
7,ChIP-Seq,82832
8,POOLCLONE,53738
9,Bisulfite-Seq,32335


In [16]:
# Total number of library-strategy rows.
assays['Library Strategy'].size

36

In [17]:
# Number of distinct library-strategy values (a missing value counts as one).
assays['Library Strategy'].nunique(dropna=False)

36

In [None]:
# Full list of library-strategy values.
assays.loc[:, 'Library Strategy']

# List SRA Run table

In [12]:
# Metadata for study accession SRP017942; the output has one row per run_accession.
# NOTE: `df` is reassigned by the following cells, so it only holds this study
# until the next cell runs.
df = db.sra_convert('SRP017942')
df

Unnamed: 0,study_accession,experiment_accession,experiment_name,experiment_title,sample_name,sample_alias,submission_date,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP017942,SRX217028,GSM1063575_1,GSM1063575: 293T_GFP; Homo sapiens; RNA-Seq,GSM1063575,GSM1063575,2013-01-16,SRR648667,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,1806641316,50184481,,36.0
1,SRP017942,SRX217029,GSM1063576_1,GSM1063576: 293T_GFP_2hrs_severe_Heat_Shock; H...,GSM1063576,GSM1063576,2013-01-16,SRR648668,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3436984836,95471801,,36.0
2,SRP017942,SRX217030,GSM1063577_1,GSM1063577: 293T_Hspa1a; Homo sapiens; RNA-Seq,GSM1063577,GSM1063577,2013-01-16,SRR648669,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3330909216,92525256,,36.0
3,SRP017942,SRX217031,GSM1063578_1,GSM1063578: 293T_Hspa1a_2hrs_severe_Heat_Shock...,GSM1063578,GSM1063578,2013-01-16,SRR648670,9606,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3622123512,100614542,,36.0
4,SRP017942,SRX217956,GSM794854_1,GSM794854: 3T3-Control-Riboseq; Mus musculus; ...,GSM794854,GSM794854,2013-01-16,SRR649752,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,594945396,16526261,,36.0
5,SRP017942,SRX217957,GSM794855_1,GSM794855: 3T3-8hrs-mild-HS-Riboseq; Mus muscu...,GSM794855,GSM794855,2013-01-16,SRR649753,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,730843272,20301202,,36.0
6,SRP017942,SRX217958,GSM794856_1,GSM794856: 3T3-2hrs-severe-HS-Riboseq; Mus mus...,GSM794856,GSM794856,2013-01-16,SRR649754,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,1785538908,49598303,,36.0
7,SRP017942,SRX217027,GSM1063574_1,GSM1063574: 3T3-Hsp70_inhibitor_3hr; Mus muscu...,GSM1063574,GSM1063574,2013-01-16,SRR648666,10090,other,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,5932552920,148313823,,40.0
8,SRP017942,SRX217959,GSM794857_1,GSM794857: 3T3-Control-Riboseq-rep2; Mus muscu...,GSM794857,GSM794857,2013-01-16,SRR649755,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3713863640,92846591,,40.0
9,SRP017942,SRX217960,GSM794858_1,GSM794858: 3T3-8hrs-mild-HS-Riboseq-rep2; Mus ...,GSM794858,GSM794858,2013-01-16,SRR649756,10090,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,3967599960,99189999,,40.0


In [13]:
# Metadata for study accession SRP002605; in the output several runs can share
# one experiment_accession. Overwrites the previous `df`.
df = db.sra_convert('SRP002605')
df

Unnamed: 0,study_accession,experiment_accession,experiment_name,experiment_title,sample_name,sample_alias,submission_date,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP002605,SRX021966,GSM546920: Footprint_mock_32hr_runs1-2,GSM546920: Footprint_mock_32hr_runs1-2,GSM546920: Footprint_mock_32hr_runs1-2,GSM546920,2010-06-11,SRR057511,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546920: Footprint_mock_32hr_runs1-2,173069316,4807481,,36.0
1,SRP002605,SRX021966,GSM546920: Footprint_mock_32hr_runs1-2,GSM546920: Footprint_mock_32hr_runs1-2,GSM546920: Footprint_mock_32hr_runs1-2,GSM546920,2010-06-11,SRR057512,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546920: Footprint_mock_32hr_runs1-2,670274352,18618732,,36.0
2,SRP002605,SRX021967,GSM546921: mRNASeq_mock_32hr_runs1-3,GSM546921: mRNASeq_mock_32hr_runs1-3,GSM546921: mRNASeq_mock_32hr_runs1-3,GSM546921,2010-06-11,SRR057513,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546921: mRNASeq_mock_32hr_runs1-3,159276276,4424341,,36.0
3,SRP002605,SRX021967,GSM546921: mRNASeq_mock_32hr_runs1-3,GSM546921: mRNASeq_mock_32hr_runs1-3,GSM546921: mRNASeq_mock_32hr_runs1-3,GSM546921,2010-06-11,SRR057515,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546921: mRNASeq_mock_32hr_runs1-3,92929104,2581364,,36.0
4,SRP002605,SRX021968,GSM546922: Footprint_miR-155_32hr_runs1-2,GSM546922: Footprint_miR-155_32hr_runs1-2,GSM546922: Footprint_miR-155_32hr_runs1-2,GSM546922,2010-06-11,SRR057516,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546922: Footprint_miR-155_32hr_runs1-2,212446440,5901290,,36.0
5,SRP002605,SRX021968,GSM546922: Footprint_miR-155_32hr_runs1-2,GSM546922: Footprint_miR-155_32hr_runs1-2,GSM546922: Footprint_miR-155_32hr_runs1-2,GSM546922,2010-06-11,SRR057517,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546922: Footprint_miR-155_32hr_runs1-2,522725688,14520158,,36.0
6,SRP002605,SRX021969,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923,2010-06-11,SRR057518,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546923: mRNASeq_miR-155_32hr_runs1-3,216374004,6010389,,36.0
7,SRP002605,SRX021969,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923,2010-06-11,SRR057519,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546923: mRNASeq_miR-155_32hr_runs1-3,207243972,5756777,,36.0
8,SRP002605,SRX021969,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923: mRNASeq_miR-155_32hr_runs1-3,GSM546923,2010-06-11,SRR057520,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546923: mRNASeq_miR-155_32hr_runs1-3,94523436,2625651,,36.0
9,SRP002605,SRX021970,GSM546924: Footprint_miR-1_32hr_runs1-2,GSM546924: Footprint_miR-1_32hr_runs1-2,GSM546924: Footprint_miR-1_32hr_runs1-2,GSM546924,2010-06-11,SRR057521,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM546924: Footprint_miR-1_32hr_runs1-2,212789088,5910808,,36.0


In [11]:
# Metadata for study accession SRP098789. Overwrites the previous `df`.
# NOTE(review): experiment_title in the output contains mis-encoded text where a
# micro sign is expected — presumably present in the source database; confirm.
df = db.sra_convert('SRP098789')
df

Unnamed: 0,study_accession,experiment_accession,experiment_name,experiment_title,sample_name,sample_alias,submission_date,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP098789,SRX2536403,GSM2475997,"GSM2475997: 1.5 Ã‚ÂµM PF-067446846, 10 min, re...",,GSM2475997,,SRR5227288,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2104142750,42082855,,50.0
1,SRP098789,SRX2536404,GSM2475998,"GSM2475998: 1.5 Ã‚ÂµM PF-067446846, 10 min, re...",,GSM2475998,,SRR5227289,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2082873050,41657461,,50.0
2,SRP098789,SRX2536405,GSM2475999,"GSM2475999: 1.5 Ã‚ÂµM PF-067446846, 10 min, re...",,GSM2475999,,SRR5227290,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2023148650,40462973,,50.0
3,SRP098789,SRX2536406,GSM2476000,"GSM2476000: 0.3 Ã‚ÂµM PF-067446846, 10 min, re...",,GSM2476000,,SRR5227291,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2057165950,41143319,,50.0
4,SRP098789,SRX2536407,GSM2476001,"GSM2476001: 0.3 Ã‚ÂµM PF-067446846, 10 min, re...",,GSM2476001,,SRR5227292,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,3027621850,60552437,,50.0
5,SRP098789,SRX2536408,GSM2476002,"GSM2476002: 0.3 Ã‚ÂµM PF-067446846, 10 min, re...",,GSM2476002,,SRR5227293,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2135456900,42709138,,50.0
6,SRP098789,SRX2536409,GSM2476003,"GSM2476003: vehicle, 10 min rep 1; Homo sapien...",,GSM2476003,,SRR5227294,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,3800106100,76002122,,50.0
7,SRP098789,SRX2536410,GSM2476004,"GSM2476004: vehicle, 10 min rep 2; Homo sapien...",,GSM2476004,,SRR5227295,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2306857400,46137148,,50.0
8,SRP098789,SRX2536411,GSM2476005,"GSM2476005: vehicle, 10 min rep 3; Homo sapien...",,GSM2476005,,SRR5227296,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,2636889200,52737784,,50.0
9,SRP098789,SRX2536412,GSM2476006,"GSM2476006: 1.5 Ã‚ÂµM PF-067446846, 60 min, re...",,GSM2476006,,SRR5227297,9606,other,SINGLE -,OTHER,TRANSCRIPTOMIC,,3019068250,60381365,,50.0
