In [1]:
%pylab inline

import glob
import os
import pandas as pd
from collections import defaultdict
from riboraptor.utils import summary_starlogs_over_runs
from riboraptor.sradb import SRAdb
from riboraptor.helpers import path_leaf, parse_star_logs, millify
from riboraptor.cutadapt_to_json import cutadapt_to_json

re_ribo_analysis_dir = '/staging/as/skchoudh/re-ribo-analysis/'

species = os.listdir(re_ribo_analysis_dir)
xl = pd.ExcelFile('../data/datasets/re-ribo-datasets.xlsx')
xl.sheet_names  # see all sheet names

Populating the interactive namespace from numpy and matplotlib


  return f(*args, **kwds)
  return f(*args, **kwds)


['hg38', 'mm10', 'sacCerR64', 'MG1655', 'BDGP6']

In [2]:
sradb = SRAdb('/staging/as/skchoudh/SRAmetadb.sqlite')
geodb = SRAdb('/staging/as/skchoudh/GEOmetadb.sqlite')


In [3]:
datasets_processed = defaultdict(list)
datasets_processed_assemblywise = defaultdict(list)
for directory in glob.glob('{}/*/*'.format(re_ribo_analysis_dir)):
    srp = path_leaf(directory)
    assembly = path_leaf(os.path.dirname(directory))
    datasets_processed[srp].append(assembly)
    datasets_processed_assemblywise[assembly].append(srp)

In [4]:
datasets = dict([(key, pd.read_excel('../data/datasets/re-ribo-datasets.xlsx', sheet_name=key)) for key in xl.sheet_names])


In [5]:
datasets['hg38']

Unnamed: 0,Publication,Year,SRP,SRP.1,Status,PMID,Tissue,Adapter provided?
0,Guo et al. (2010),2010,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP002605,Done,https://www.ncbi.nlm.nih.gov/pubmed/20703300,HeLa,
1,Guo et al. (2010),2010,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP003554,,https://www.ncbi.nlm.nih.gov/pubmed/20703300,HeLa,
2,Reid and Nicchitta (2012),2012,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP007963,,https://www.ncbi.nlm.nih.gov/pubmed/22199352,HEK293,
3,Stadler & Fire (2011),2011,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP010374,,https://www.ncbi.nlm.nih.gov/pubmed/22045228,HeLa,
4,Hsieh et al. (2012),2012,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP010679,Done,https://www.ncbi.nlm.nih.gov/pubmed/22367541,PC3,
5,Fritsch et al. (2012),2012,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP014542,Done,https://www.ncbi.nlm.nih.gov/pubmed/22879431,THP-1,
6,Lee et al. (2012),2012,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP014629,,https://www.ncbi.nlm.nih.gov/pubmed/22927429,HEK293,
7,Stern-Ginossar et al. (2012),2012,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP016143,,https://www.ncbi.nlm.nih.gov/pubmed/23180859,human foreskin fibroblasts,
8,Liu et al. (2012),2012,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP017263,,https://www.ncbi.nlm.nih.gov/pubmed/23290916,HEK293,
9,Loayza-Puch et al. (2013),2013,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,SRP020544,,https://www.ncbi.nlm.nih.gov/pubmed/23594524,BJ fibroblast,


# How does the mapping rate vary across projects

In [11]:
assembly = 'hg38'
srp_dfs_modified  = {}

for srp in datasets_processed_assemblywise[assembly]:
    if os.path.exists(os.path.join(re_ribo_analysis_dir, assembly, srp)):        
        srp_df = sradb.sra_convert(srp.split('_')[0])
        srp_df.library_layout = srp_df.library_layout.fillna('SINGLE')
        srp_df = srp_df[srp_df.library_layout.str.contains('SINGLE')]
     
        srp_df['pass1_reads_with_adapters'] = None
        srp_df['total_reads_raw'] = None
        srp_df['pass1_adapter'] = None
        srp_df['pass2_adapter'] = None
        srp_df['pass2_reads_with_adapters'] = None
        srp_df['mapping_total_reads_input'] = None
        srp_df['uniquely_mapped'] = None
        
        
        srpdir = os.path.join(re_ribo_analysis_dir, assembly, srp)
        starlogsdir = os.path.join(srpdir, 'starlogs')
        srp_srx_grouped = srp_df.groupby('experiment_accession')
        preprocess_step1_dir = os.path.join(srpdir, 'preprocessed_step1')
        preprocess_step2_dir = os.path.join(srpdir, 'preprocessed')
        
        for srx, srx_group in srp_srx_grouped:
            srrs = srx_group['run_accession'].tolist()                             
            #starlogs_df = summary_starlogs_over_runs(starlogsdir, srrs)
            
            for srr in srrs:
                starlogs_df = None
                if os.path.isfile(os.path.join(starlogsdir, srr+'Log.final.out')):
                    starlogs_df = parse_star_logs(os.path.join(starlogsdir, srr+'Log.final.out'))
                # Preprocessed_step1 adapter info
                step1_txt = os.path.join(preprocess_step1_dir, srr + '.fastq.gz_trimming_report.txt')
                step2_txt = os.path.join(preprocess_step2_dir, srr + '_trimmed_trimmed.fq.gz_trimming_report.txt')
                step1_cutadapt_json = None
                step2_cutadapt_json = None
                
                if os.path.isfile(step1_txt):
                    step1_cutadapt_json = cutadapt_to_json(step1_txt)
                
                if os.path.isfile(step2_txt):
                    step2_cutadapt_json = cutadapt_to_json(step2_txt)
                
                if step1_cutadapt_json:
                    adapters = step1_cutadapt_json['adapters']
                    if len(step1_cutadapt_json['adapters']) == 0:
                        srp_df.loc[srp_df.run_accession==srr, 'pass1_adapter'] = 'Empty?'                        
                    elif isinstance(adapters, str):
                        srp_df.loc[srp_df.run_accession==srr, 'pass1_adapter'] = step1_cutadapt_json['adapters']
                    else:
                        srp_df.loc[srp_df.run_accession==srr, 'pass1_adapter'] = step1_cutadapt_json['adapters']['{} - {}'.format(srr, 'Adapter 1')]
                        trim_info1 = step1_cutadapt_json['trim_info'][srr]
                        srp_df.loc[srp_df.run_accession==srr, 'total_reads_raw'] = trim_info1['r_processed']
                        srp_df.loc[srp_df.run_accession==srr, 'pass1_reads_with_adapters'] = trim_info1['r_with_adapters']

                if step2_cutadapt_json:                    
                    adapters = step2_cutadapt_json['adapters']
                    if len(step2_cutadapt_json['adapters']) == 0:
                        srp_df.loc[srp_df.run_accession==srr, 'pass2_adapter'] = 'Empty?'   
                    elif isinstance(adapters, str):
                        srp_df.loc[srp_df.run_accession==srr, 'pass2_adapter'] = step2_cutadapt_json['adapters']
                    else:
                        srp_df.loc[srp_df.run_accession==srr, 'pass2_adapter'] = step2_cutadapt_json['adapters']['{} - {}'.format(srr + '_trimmed', 'Adapter 1')]                                        
                        trim_info2 = step2_cutadapt_json['trim_info'][srr]
                        srp_df.loc[srp_df.run_accession==srr, 'pass2_reads_with_adapters'] = trim_info2['r_with_adapters']               
                        srp_df.loc[srp_df.run_accession==srr, 'pass2_total_reads_processed'] = trim_info2['r_processed']
                    
                if starlogs_df: 
                    srp_df.loc[srp_df.run_accession==srr, 'mapping_total_reads_input'] = starlogs_df['total_reads']
                    srp_df.loc[srp_df.run_accession==srr, 'uniquely_mapped'] = starlogs_df['uniquely_mapped']
                    srp_df.loc[srp_df.run_accession==srr, 'uniquely_mapped_percent'] = starlogs_df['uniquely_mapped_percent']
                
                
        cols = ['pass1_reads_with_adapters', 'total_reads_raw', 'uniquely_mapped', 'mapping_total_reads_input']
        for col in cols:
            srp_df[col] = srp_df[col].apply(lambda z: millify(z))
        srp_dfs_modified[srp] = srp_df 

In [12]:
millify(None)

'NaN'

In [14]:
srp_dfs_modified['SRP010679_fixed_adapters']#.pass1_adapter

Unnamed: 0,study_accession,experiment_accession,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,...,adapter_spec,avg_read_length,pass1_reads_with_adapters,total_reads_raw,pass1_adapter,pass2_adapter,pass2_reads_with_adapters,mapping_total_reads_input,uniquely_mapped,uniquely_mapped_percent
0,SRP010679,SRX118285,SRR403882,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869036: mRNA-seq for vehicle treated PC3 ce...,886121960,...,,40.0,21.6 M,22.2 M,CTGTAGGCAC,User provided,,21.5 M,12.7 M,58.91
1,SRP010679,SRX118286,SRR403883,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869037: Footprint for vehicle treated PC3 c...,830504160,...,,40.0,20.4 M,20.8 M,CTGTAGGCAC,User provided,,20.5 M,3.2 M,15.71
2,SRP010679,SRX118287,SRR403884,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869038: mRNA-seq for rapamcyin treated PC3 ...,875152240,...,,40.0,20.9 M,21.9 M,CTGTAGGCAC,User provided,,21.1 M,12.6 M,59.63
3,SRP010679,SRX118288,SRR403885,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869039: Footprint for rapamcyin treated PC3...,789711800,...,,40.0,19.3 M,19.7 M,CTGTAGGCAC,User provided,,18.3 M,3.5 M,19.17
4,SRP010679,SRX118289,SRR403886,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869040: mRNA-seq for PP242 treated PC3 cell...,786234960,...,,40.0,19.0 M,19.7 M,CTGTAGGCAC,User provided,,19.2 M,11.2 M,58.57
5,SRP010679,SRX118290,SRR403887,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869041: Footprint for PP242 treated PC3 cel...,788273800,...,,40.0,19.3 M,19.7 M,CTGTAGGCAC,User provided,,18.3 M,3.6 M,19.48
6,SRP010679,SRX118291,SRR403888,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869042: mRNA-seq for vehicle treated PC3 ce...,979031600,...,,40.0,23.7 M,24.5 M,CTGTAGGCAC,User provided,,24.3 M,13.6 M,56.04
7,SRP010679,SRX118292,SRR403889,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869043: Footprint for vehicle treated PC3 c...,975917920,...,,40.0,23.6 M,24.4 M,CTGTAGGCAC,User provided,,24.1 M,7.9 M,32.98
8,SRP010679,SRX118293,SRR403890,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869044: mRNA-seq for rapamcyin treated PC3 ...,764698520,...,,40.0,18.3 M,19.1 M,CTGTAGGCAC,User provided,,19.0 M,10.5 M,55.27
9,SRP010679,SRX118294,SRR403891,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM869045: Footprint for rapamcyin treated PC3...,1017802720,...,,40.0,24.8 M,25.4 M,CTGTAGGCAC,User provided,,24.1 M,5.3 M,21.96


In [9]:
x.columns

Index(['study_accession', 'experiment_accession', 'run_accession', 'taxon_id',
       'library_selection', 'library_layout', 'library_strategy',
       'library_source', 'library_name', 'bases', 'spots', 'adapter_spec',
       'avg_read_length', 'pass1_adapter', 'total_reads_raw',
       'pass1_reads_with_adapters', 'pass2_adapter',
       'mapping_total_reads_input', 'uniquely_mapped',
       'uniquely_mapped_percent'],
      dtype='object')

In [38]:
x['total_reads_raw'].apply(lambda z: millify(z))

0     22.2 M
1     20.8 M
2     21.9 M
3     19.7 M
4     19.7 M
5     19.7 M
6     24.5 M
7     24.4 M
8     19.1 M
9     25.4 M
10    25.8 M
11    25.3 M
Name: total_reads_raw, dtype: object

In [35]:
srp_dfs_modified.keys()

dict_keys(['SRP038695', 'SRP098789', 'SRP017942', 'SRP042937', 'SRP049168', 'SRP055009', 'SRP109126', 'SRP092068', 'SRP010679_fixed_adapters', 'SRP045214'])

In [16]:
step2_cutadapt_json

{'adapters': 'User provided',
 'length_counts': {},
 'length_exp': {},
 'length_obsexp': {},
 'trim_info': {}}

In [17]:
starlog_df

{'multi_mapped': 32008381,
 'multi_mapped_percent': 78.38,
 'total_reads': 40839670,
 'uniquely_mapped': 6889788,
 'uniquely_mapped_percent': 16.87,
 'unmapped_percent': 4.75}

In [18]:
starlogsdir

'/staging/as/skchoudh/re-ribo-analysis/hg38/SRP010679_fixed_adapters/starlogs'

In [19]:
step1_cutadapt_json

{'adapters': {'SRR403893 - Adapter 1': 'CTGTAGGCAC'},
 'length_counts': {'SRR403893 - Adapter 1': {1: 372801,
   2: 191462,
   3: 253301,
   4: 401023,
   5: 814034,
   6: 1548132,
   7: 2984751,
   8: 3972228,
   9: 4716147,
   10: 4178020,
   11: 2376079,
   12: 1130101,
   13: 427355,
   14: 217874,
   15: 135002,
   16: 87970,
   17: 77490,
   18: 72771,
   19: 34984,
   20: 19913,
   21: 12647,
   22: 15391,
   23: 45581,
   24: 7629,
   25: 11707,
   26: 7654,
   27: 7280,
   28: 12423,
   29: 14146,
   30: 13855,
   31: 13116,
   32: 4182,
   33: 1605,
   34: 1156,
   35: 1575,
   36: 1380,
   37: 1685,
   38: 24914,
   39: 31596,
   40: 288682}},
 'length_exp': {'SRR403893 - Adapter 1': {1: 6322918.8,
   2: 1580729.7,
   3: 395182.4,
   4: 98795.6,
   5: 24698.9,
   6: 6174.7,
   7: 1543.7,
   8: 385.9,
   9: 96.5,
   10: 24.1,
   11: 24.1,
   12: 24.1,
   13: 24.1,
   14: 24.1,
   15: 24.1,
   16: 24.1,
   17: 24.1,
   18: 24.1,
   19: 24.1,
   20: 24.1,
   21: 24.1,
   22: 24

In [None]:
srp_df['total_reads'] =  None
        srp_df['total_reads_post_trimming'] = Nonesrp_df

In [None]:
files_not_found

# How has the adapter evolved over the years?