In [11]:
%pylab inline
import pandas as pd
import os
from tqdm import tqdm
import fnmatch
import errno    
import glob
from textwrap import dedent
from riboraptor.helpers import mkdir_p
from riboraptor.sradb import SRAdb
from riboraptor.utils import copy_sra_data, create_config_file

def mkdir_p(path):
    """Create a directory together with any missing parents (``mkdir -p``).

    Parameters
    ----------

    path : str
           Directory to create. A falsy value (empty string, None) is ignored.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        # The only tolerated failure is "already exists, and is a directory".
        already_a_directory = (err.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise


def symlink_force(source, destination):
    """Create a symlink, replacing an entry that already exists at the link path.

    Parameters
    ----------
    source: string
            Path the link points to
    destination: string
                 Path at which the link is created

    """
    try:
        os.symlink(source, destination)
    except OSError as err:
        # Anything other than "destination already exists" is passed on.
        if err.errno != errno.EEXIST:
            raise err
        # Drop the existing entry (os.remove) and link again.
        os.remove(destination)
        os.symlink(source, destination)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [12]:
# Absolute cluster paths used by the cells below.
# Same value as the default `sra_source_dir` of copy_sra_data in this cell.
re_ribo_root_dir = '/staging/as/skchoudh/SRA_datasets/'
# Same value as the default `sra_dest_dir` of copy_sra_data in this cell.
samples_to_process_dir = '/staging/as/skchoudh/re-ribo-datasets/'
# create_config_file writes its '<species>_<srp>.py' files into this directory.
re_ribo_config_dir = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs'
# write_config joins <species>/<srp> onto this for OUT_DIR.
# NOTE: this name is assigned again, to '/staging/as/skchoudh/rna-seq-output',
# in a later cell of this notebook.
re_ribo_analysis_dir = '/staging/as/skchoudh/re-ribo-analysis/'
# Not referenced by any code visible in this notebook.
riboraptor_annotation_dir = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/'
def mkdir_p(path):
    """Create a directory together with any missing parents (``mkdir -p``).

    Parameters
    ----------

    path : str
           Directory to create. A falsy value (empty string, None) is ignored.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        # The only tolerated failure is "already exists, and is a directory".
        already_a_directory = (err.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise


def symlink_force(source, destination):
    """Create a symlink, replacing an entry that already exists at the link path.

    Parameters
    ----------
    source: string
            Path the link points to
    destination: string
                 Path at which the link is created

    """
    try:
        os.symlink(source, destination)
    except OSError as err:
        # Anything other than "destination already exists" is passed on.
        if err.errno != errno.EEXIST:
            raise err
        # Drop the existing entry (os.remove) and link again.
        os.remove(destination)
        os.symlink(source, destination)
# Genome build key -> annotation release label. Several labels also appear in
# the GTF file names of gtf_map (e.g. 'v25' / gencode.v25.annotation.gtf).
# Note the two E. coli spellings: 'mg1655' maps to an empty string while
# 'MG1655' carries a release; the other maps in this cell only use 'MG1655'.
genome_annotation_map = {
    'hg38': 'v25',
    'mm10': 'vM11',
    'mg1655': '',
    'sacCerR64': 'v91',
    'MG1655': 'ASM584v2.38',
    'BDGP6': 'v91',
    'GRCz10': 'v91',
    'panTro3': 'v94',
    'Mmul8': 'v94'
}


# Genome build key -> absolute path of the genome FASTA file.
genome_fasta_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.fa',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/fasta/mm10.fa',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.fa',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.fa',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.fa',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.fa'
}

# Genome build key -> absolute path of the chromosome sizes file; every entry
# sits in the same fasta/ directory as the matching genome_fasta_map entry.
chrom_sizes_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.chrom.sizes',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/fasta/mm10.chrom.sizes',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.sizes',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.sizes',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.sizes',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.sizes',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'
}

# Genome build key -> absolute path of a 'star_annotated' directory
# (presumably a STAR aligner index built with annotation -- TODO confirm).
star_index_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/star_annotated',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/star_annotated',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/star_annotated',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/star_annotated',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/star_annotated',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/star_annotated',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/star_annotated',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/star_annotated'
}

# Genome build key -> absolute path of the GTF annotation file.
gtf_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/annotation/gencode.vM11.annotation.gtf',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/annotation/Saccharomyces_cerevisiae.R64-1-1.91.gtf',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/annotation/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.38.gtf',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gtf',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gtf',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.94.gtf'
}


def filter_single_end_samples(df):
    """Keep only the single-end rows of an SRA metadata table.

    Parameters
    ----------
    df: DataFrame
        Dataframe as obtained from SRAdb.sra_convert(); must contain a
        'library_layout' column

    Returns
    -------
    df: DataFrame
        DataFrame with only single end samples. Rows whose layout is
        missing are kept.
    """
    # The layout ('SINGLE -' / 'PAIRED -') is stored in 'library_layout'.
    # 'library_strategy' holds values such as 'RNA-Seq', so matching 'PAIRED'
    # against it never removed a row.
    is_paired = df['library_layout'].str.contains('PAIRED', na=False)
    return df[~is_paired]


def copy_sra_data(df,
                  sra_source_dir='/staging/as/skchoudh/SRA_datasets/',
                  sra_dest_dir='/staging/as/skchoudh/re-ribo-datasets/'):
    """Symlink the single-end ``.sra`` runs of one study into per-species folders.

    Despite the name nothing is copied: each run found at
    ``<sra_source_dir>/<SRP>/<SRX>/<SRR>.sra`` is linked to
    ``<sra_dest_dir>/<species>/<SRP>/<SRX>/<SRR>.sra``. Runs whose source
    file does not exist are skipped; their destination directory is still
    created.

    Parameters
    ----------
    df: DataFrame
        Metadata for a single study with the columns 'study_accession',
        'experiment_accession', 'run_accession' and 'taxon_id', plus
        whatever filter_single_end_samples needs
    sra_source_dir: string
                    Root directory holding the per-study source folders
    sra_dest_dir: string
                  Root directory under which the links are created

    Raises
    ------
    AssertionError
        If the filtered frame spans more than one study.
    KeyError
        If a taxon id is missing from the module-level ``taxon_id_map``.
    """
    df = filter_single_end_samples(df)
    if df.empty:
        # Nothing to link; previously this tripped the 'Multiple SRPs' assert.
        return
    assert len(df.study_accession.unique()) == 1, 'Multiple SRPs found'
    srp = df.study_accession.unique()[0]
    srp_source_dir = os.path.join(sra_source_dir, srp)

    # Group by the bare column name: with a one-element list pandas >= 2.0
    # yields 1-tuples as keys and the taxon_id_map lookup raises KeyError.
    for taxon_id, df_group in df.groupby('taxon_id'):
        species = taxon_id_map[taxon_id]
        srp_dest_dir = os.path.join(sra_dest_dir, species, srp)
        mkdir_p(srp_dest_dir)
        # '<SRX>/<SRR>.sra', shared by the source and destination trees.
        relative_paths = df_group['experiment_accession'].str.cat(
            df_group['run_accession'] + '.sra', sep=os.path.sep)
        with tqdm(total=len(relative_paths)) as pbar:
            for relative_path in relative_paths:
                source = os.path.join(srp_source_dir, relative_path)
                dest = os.path.join(srp_dest_dir, relative_path)
                mkdir_p(os.path.dirname(dest))
                if os.path.isfile(source):
                    symlink_force(source, dest)
                pbar.update()





In [13]:
# SRAdb handle on the SRAmetadb SQLite file; `db` is queried by the cells below.
db = SRAdb('/staging/as/skchoudh/SRAmetadb.sqlite')


In [14]:
# Fetch the metadata table for study SRP012040 and display it.
srp = 'SRP012040'
df = db.sra_convert(srp)
df

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453077,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,2461347400,16193075,,152.0
1,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453078,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1818608560,11964530,,152.0
2,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453079,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,774200144,5093422,,152.0
3,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453080,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1693931016,11144283,,152.0
4,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453081,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1687751000,11103625,,152.0
5,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453082,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1571660848,10339874,,152.0
6,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453083,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1629662528,10721464,,152.0
7,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453084,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1151556104,7576027,,152.0
8,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453085,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1606060424,10566187,,152.0
9,SRP012040,SRX135150,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,SRR453086,10090,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,GSM900183: CSHL_RnaSeq_Ovary_adult-8wks,1668134640,10974570,,152.0


In [4]:
# Fetch the metadata table for study SRP016501; the linking step is disabled.
srp = 'SRP016501'
df = db.sra_convert(srp)
#copy_sra_data(df)

In [66]:
# Fetch the metadata table for study SRP007412 and preview the first rows.
srp = 'SRP007412'
df = db.sra_convert(srp)
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP007412,SRX081869,GSM752557: gga br F 1,SRR306710,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752557: gga br F 1,1480245008,19476908,,76.0
1,SRP007412,SRX081870,GSM752558: gga br M 1,SRR306711,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752558: gga br M 1,1334334888,17557038,,76.0
2,SRP007412,SRX081871,GSM752559: gga cb F 1,SRR306712,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752559: gga cb F 1,1795697524,23627599,,76.0
3,SRP007412,SRX081872,GSM752560: gga cb M 1,SRR306713,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752560: gga cb M 1,1670739920,21983420,,76.0
4,SRP007412,SRX081873,GSM752561: gga ht F 1,SRR306714,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752561: gga ht F 1,1748333260,23004385,,76.0


In [36]:
# Distinct taxon ids in the current `df`; each one needs a taxon_id_map entry
# before copy_sra_data / create_config_file can be run on this frame.
#df[df.taxon_id==9031]
df.taxon_id.unique()

array([ 9031,  9258,  9544,  9593,  9597,  9598,  9600,  9606, 10090,
       13616])

In [27]:
# Taxon id -> name used for the per-species directories and output file names.
# Most values are species names; 4932 and 511145 use genome build keys instead.
# An identical dict is assigned again in the next cell of this notebook.
taxon_id_map = {
    10090: 'mus_musculus',
    9606: 'homo_sapiens',
    4932: 'sacCerR64',
    511145: 'MG1655',
    7227: 'drosophila_melanogaster',
    7955: 'danio_rerio',
    9598: 'pan_troglodytes',
    9544: 'macaca_mulatta',
    9031: 'gallus_gallus',
    9913: 'bos_taurus',
    10116: 'rattus_norvegicus',
    9258: 'ornithorhynchus_anatinus',
    9593: 'gorilla_gorilla',
    9597: 'pan_paniscus',
    9600: 'pongo_abelii', #'pongo_pygmaeus',
    13616: 'monodelphis_domestica'
    
}

# Root under which the cdna_map loop looks for '<name>/cdna' directories.
genomes_dir = '/home/cmb-panasas2/skchoudh/genomes'
# Replaces the '/staging/as/skchoudh/re-ribo-analysis/' value assigned earlier;
# write_config builds OUT_DIR from this name.
re_ribo_analysis_dir = '/staging/as/skchoudh/rna-seq-output'






In [15]:
# Taxon id -> name used for the per-species directories and output file names.
# Most values are species names; 4932 and 511145 use genome build keys instead.
# The same dict is also assigned in the previous cell of this notebook.
taxon_id_map = {
    10090: 'mus_musculus',
    9606: 'homo_sapiens',
    4932: 'sacCerR64',
    511145: 'MG1655',
    7227: 'drosophila_melanogaster',
    7955: 'danio_rerio',
    9598: 'pan_troglodytes',
    9544: 'macaca_mulatta',
    9031: 'gallus_gallus',
    9913: 'bos_taurus',
    10116: 'rattus_norvegicus',
    9258: 'ornithorhynchus_anatinus',
    9593: 'gorilla_gorilla',
    9597: 'pan_paniscus',
    9600: 'pongo_abelii', #'pongo_pygmaeus',
    13616: 'monodelphis_domestica'
    
}

# Root under which the loop below looks for '<name>/cdna' directories.
genomes_dir = '/home/cmb-panasas2/skchoudh/genomes'

# Species name -> path of its cDNA FASTA, for every species that has one
# under <genomes_dir>/<species>/cdna. Species without a file are left out.
cdna_map = {}
for value in taxon_id_map.values():
    genome_dir = os.path.join(genomes_dir, value)
    cdna_dir = os.path.join(genome_dir, 'cdna')
    # Prefer the gzipped FASTA: write_config derives the kallisto index path
    # in this same directory, so a plain '*' glob could pick the index.
    # sorted() because glob returns entries in arbitrary order; an empty or
    # missing directory simply yields no candidates instead of IndexError.
    cdna_candidates = (sorted(glob.glob(os.path.join(cdna_dir, '*.fa.gz')))
                       or sorted(glob.glob(os.path.join(cdna_dir, '*'))))
    if cdna_candidates:
        cdna_map[value] = cdna_candidates[0]
def create_config_file(df):
    """Write one config module per species present in ``df``.

    For every taxon id in ``df`` a file ``<species>_<srp>.py`` is written to
    the module-level ``re_ribo_config_dir`` with the text returned by
    ``write_config``.

    Parameters
    ----------
    df: DataFrame
        Metadata with 'taxon_id' and 'study_accession' columns; each taxon
        must belong to exactly one study

    Raises
    ------
    AssertionError
        If one taxon id spans several studies.
    KeyError
        If a taxon id is missing from ``taxon_id_map``.
    """
    # Group by the bare column name: with a one-element list pandas >= 2.0
    # yields 1-tuples as keys and the taxon_id_map lookup raises KeyError.
    for taxon_id, df_group in df.groupby('taxon_id'):
        study_accessions = df_group['study_accession'].unique()
        assert len(study_accessions) == 1, 'Multiple SRPs found'
        species = taxon_id_map[taxon_id]
        srp = study_accessions[0]
        config_path = os.path.join(re_ribo_config_dir,
                                   '{}_{}.py'.format(species, srp))

        # write_config as defined in this notebook takes the raw-data root as
        # a third argument; the module-level samples_to_process_dir is used.
        # NOTE(review): confirm this is the intended root for these configs.
        # Rendered before opening so a failure does not leave an empty file.
        config = write_config(species, srp, samples_to_process_dir)
        with open(config_path, 'w') as fh:
            fh.write(config)
        print('Wrote {}'.format(config_path))

In [16]:
def copy_sra_data(df,
                  sra_source_dir='/staging/as/skchoudh/SRA_datasets/',
                  sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/'):
    """Symlink the ``.sra`` runs of one study into per-species folders.

    Despite the name nothing is copied: each run found at
    ``<sra_source_dir>/<SRP>/<SRX>/<SRR>.sra`` is linked to
    ``<sra_dest_dir>/<species>/<SRP>/<SRX>/<SRR>.sra``. Runs whose source
    file does not exist are skipped; their destination directory is still
    created. Every row of ``df`` is processed; any single-end / paired-end
    selection has to be done by the caller.

    Parameters
    ----------
    df: DataFrame
        Metadata for a single study with the columns 'study_accession',
        'experiment_accession', 'run_accession' and 'taxon_id'
    sra_source_dir: string
                    Root directory holding the per-study source folders
    sra_dest_dir: string
                  Root directory under which the links are created

    Raises
    ------
    AssertionError
        If ``df`` spans more than one study.
    KeyError
        If a taxon id is missing from the module-level ``taxon_id_map``.
    """
    if df.empty:
        # Nothing to link; previously this tripped the 'Multiple SRPs' assert.
        return
    assert len(df.study_accession.unique()) == 1, 'Multiple SRPs found'
    srp = df.study_accession.unique()[0]
    srp_source_dir = os.path.join(sra_source_dir, srp)

    # Group by the bare column name: with a one-element list pandas >= 2.0
    # yields 1-tuples as keys and the taxon_id_map lookup raises KeyError.
    for taxon_id, df_group in df.groupby('taxon_id'):
        species = taxon_id_map[taxon_id]
        srp_dest_dir = os.path.join(sra_dest_dir, species, srp)
        mkdir_p(srp_dest_dir)
        # '<SRX>/<SRR>.sra', shared by the source and destination trees.
        relative_paths = df_group['experiment_accession'].str.cat(
            df_group['run_accession'] + '.sra', sep=os.path.sep)
        with tqdm(total=len(relative_paths)) as pbar:
            for relative_path in relative_paths:
                source = os.path.join(srp_source_dir, relative_path)
                dest = os.path.join(srp_dest_dir, relative_path)
                mkdir_p(os.path.dirname(dest))
                if os.path.isfile(source):
                    symlink_force(source, dest)
                pbar.update()

In [17]:
# Alphabetical list of the 'name' entries reported by db.desc_table('sra_ft').
sorted(db.desc_table('sra_ft')['name'].tolist())

['SRR_bamFile',
 'SRX_bamFile',
 'SRX_fastqFTP',
 'adapter_spec',
 'anonymized_name',
 'base_caller',
 'bases',
 'center_project_name',
 'common_name',
 'description',
 'design_description',
 'experiment_ID',
 'experiment_accession',
 'experiment_alias',
 'experiment_attribute',
 'experiment_entrez_link',
 'experiment_name',
 'experiment_title',
 'experiment_url_link',
 'individual_name',
 'instrument_model',
 'instrument_name',
 'library_construction_protocol',
 'library_layout',
 'library_name',
 'library_selection',
 'library_source',
 'library_strategy',
 'multiplier',
 'number_of_levels',
 'platform',
 'platform_parameters',
 'primary_study',
 'qtype',
 'quality_scorer',
 'read_spec',
 'related_studies',
 'run_ID',
 'run_accession',
 'run_alias',
 'run_attribute',
 'run_center',
 'run_date',
 'run_entrez_link',
 'run_url_link',
 'sample_ID',
 'sample_accession',
 'sample_alias',
 'sample_attribute',
 'sample_entrez_link',
 'sample_name',
 'sample_url_link',
 'sequence_space',
 'sp

In [61]:
# Link the runs of study SRP016501 using copy_sra_data's default directories.
srp = 'SRP016501'
df = db.sra_convert(srp)
copy_sra_data(df)

100%|██████████| 27/27 [00:00<00:00, 325.50it/s]
100%|██████████| 27/27 [00:00<00:00, 369.01it/s]
100%|██████████| 27/27 [00:00<00:00, 372.44it/s]
100%|██████████| 26/26 [00:00<00:00, 393.26it/s]
100%|██████████| 27/27 [00:00<00:00, 407.28it/s]


In [None]:
# Same statements as the preceding cell; this copy carries no execution count.
srp = 'SRP016501'
df = db.sra_convert(srp)
copy_sra_data(df)

In [5]:
# Study under analysis and the path prefix for its per-species output tables
# ('<prefix>-<species>.tsv' in the cells below).
srp = 'SRP016501'
prefix = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/{}'.format(srp)
# Columns requested from db.sra_convert via out_type in the cells below.
cols = ['study_accession',
 'experiment_accession',
 'experiment_title',
 'run_accession',
 'taxon_id',
 'library_selection',
 'library_layout',
 'library_strategy',
 'library_source',
 'library_name',
 'bases',
 'spots',
 'adapter_spec', 'description']

In [151]:
# Build one TPM table per species for the study in `srp`: rows are kallisto
# target ids, one column per tissue, written to '<prefix>-<species>.tsv'.
df = db.sra_convert(srp, out_type=cols)
# Tissue label: second ': '-separated piece of the title, up to the first '; '.
df['tissue'] = df['experiment_title'].str.split(': ').str.get(1).str.split('; ').str.get(0)
df = df.sort_values(by=['tissue', 'experiment_accession'])
experiments = df[['study_accession', 'experiment_accession', 'taxon_id', 'tissue']].drop_duplicates()
# Group by the bare column name: with a one-element list pandas >= 2.0 yields
# 1-tuples as keys and the taxon_id_map lookup raises KeyError.
for taxon_id, group in experiments.groupby('taxon_id'):
    species = taxon_id_map[taxon_id]
    filepath = '{}-{}.tsv'.format(prefix, species)
    group = group.sort_values(by=['tissue', 'experiment_accession'])
    abundances = pd.DataFrame()
    for index, row in group.iterrows():
        srp = row['study_accession']
        # Drop the first '_'-separated token of the label.
        tissue = ('_').join(row['tissue'].split('_')[1:])
        srx = row['experiment_accession']

        # abundance.tsv columns: target_id length eff_length est_counts tpm
        abundance_path = '/staging/as/skchoudh/rna-seq-output/{}/{}/counts/{}/abundance.tsv'.format(species, srp, srx)
        abundance = (pd.read_table(abundance_path)[['target_id', 'tpm']]
                     .rename(columns={'tpm': tissue})
                     .set_index('target_id'))

        # Outer join keeps targets that are missing from some experiments.
        abundances = abundances.join(abundance, how='outer')
    abundances = abundances.reset_index()
    abundances.to_csv(filepath, header=True,  index=False, sep='\t')
    print('Wrote: {}'.format(filepath))

Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-gallus_gallus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-macaca_mulatta.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-bos_taurus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-mus_musculus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-rattus_norvegicus.tsv


In [10]:
# Write a sample sheet (experiment accession, species, tissue) for SRP016501.
srp = 'SRP016501'

df = db.sra_convert(srp, out_type=cols)
# Tissue label: second ': '-separated piece of the title, up to the first '; '.
df['tissue'] = df['experiment_title'].str.split(': ').str.get(1).str.split('; ').str.get(0)
df = df.sort_values(by=['tissue', 'experiment_accession'])
experiments = df[['study_accession', 'experiment_accession', 'taxon_id', 'tissue']].drop_duplicates()
with open('../cross-species-data/SRP016501_sample_info.tsv', 'w') as fh:
    fh.write('sample\tspecies\ttissue\n')
    # Group by the bare column name: with a one-element list pandas >= 2.0
    # yields 1-tuples as keys and the taxon_id_map lookup raises KeyError.
    for taxon_id, group in experiments.groupby('taxon_id'):
        species = taxon_id_map[taxon_id]
        group = group.sort_values(by=['tissue', 'experiment_accession'])
        for index, row in group.iterrows():
            srp = row['study_accession']
            # Drop the first '_'-separated token of the label.
            tissue = ('_').join(row['tissue'].split('_')[1:])
            srx = row['experiment_accession']
            fh.write('{}\t{}\t{}\n'.format(srx, species, tissue))


In [62]:
# Write the per-species config files for the frame currently bound to `df`.
create_config_file(df)

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gallus_gallus_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/macaca_mulatta_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/bos_taurus_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/mus_musculus_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/rattus_norvegicus_SRP016501.py


In [149]:
# Build one TPM table per species for SRP007412: rows are kallisto target ids,
# one column per tissue label, written to '<prefix>-<species>.tsv'.
srp = 'SRP007412'
df = db.sra_convert(srp)
prefix = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/{}'.format(srp)

#copy_sra_data(df)
df = db.sra_convert(srp, out_type=cols+['experiment_name'])
# Label: second ': '-separated piece of the title with its first three
# characters dropped, leading spaces stripped and spaces turned into '_'.
df['tissue'] = df['experiment_title'].str.split(': ').str.get(1).str[3:].str.lstrip(' ').str.replace(' ', '_')
df = df.sort_values(by=['tissue', 'experiment_accession'])

experiments = df[['study_accession', 'experiment_accession', 'taxon_id', 'tissue']].drop_duplicates()
# Group by the bare column name: with a one-element list pandas >= 2.0 yields
# 1-tuples as keys and the taxon_id_map lookup raises KeyError.
for taxon_id, group in experiments.groupby('taxon_id'):
    species = taxon_id_map[taxon_id]
    filepath = '{}-{}.tsv'.format(prefix, species)
    group = group.sort_values(by=['tissue', 'experiment_accession'])
    abundances = pd.DataFrame()
    for index, row in group.iterrows():
        srp = row['study_accession']
        tissue = row['tissue']
        srx = row['experiment_accession']

        # abundance.tsv columns: target_id length eff_length est_counts tpm
        abundance_path = '/staging/as/skchoudh/rna-seq-output/{}/{}/counts/{}/abundance.tsv'.format(species, srp, srx)
        abundance = (pd.read_table(abundance_path)[['target_id', 'tpm']]
                     .rename(columns={'tpm': tissue})
                     .set_index('target_id'))

        # Outer join keeps targets that are missing from some experiments.
        abundances = abundances.join(abundance, how='outer')
    abundances = abundances.reset_index()
    abundances.to_csv(filepath, header=True,  index=False, sep='\t')
    print('Wrote: {}'.format(filepath))
#create_config_file(df)

Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-gallus_gallus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-ornithorhynchus_anatinus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-macaca_mulatta.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-gorilla_gorilla.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-pan_paniscus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-pan_troglodytes.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-pongo_abelii.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-homo_sapiens.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-dat

In [8]:
# Write a sample sheet (experiment accession, species, tissue) for SRP007412.
srp = 'SRP007412'
df = db.sra_convert(srp)
prefix = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/{}'.format(srp)

#copy_sra_data(df)
df = db.sra_convert(srp, out_type=cols+['experiment_name'])
# Label: second ': '-separated piece of the title with its first three
# characters dropped, leading spaces stripped and spaces turned into '_'.
df['tissue'] = df['experiment_title'].str.split(': ').str.get(1).str[3:].str.lstrip(' ').str.replace(' ', '_')
df = df.sort_values(by=['tissue', 'experiment_accession'])
experiments = df[['study_accession', 'experiment_accession', 'taxon_id', 'tissue']].drop_duplicates()
with open('../cross-species-data/SRP007412_sample_info.tsv', 'w') as fh:
    fh.write('sample\tspecies\ttissue\n')
    # Group by the bare column name: with a one-element list pandas >= 2.0
    # yields 1-tuples as keys and the taxon_id_map lookup raises KeyError.
    for taxon_id, group in experiments.groupby('taxon_id'):
        species = taxon_id_map[taxon_id]
        group = group.sort_values(by=['tissue', 'experiment_accession'])
        for index, row in group.iterrows():
            srp = row['study_accession']
            tissue = row['tissue']
            srx = row['experiment_accession']
            fh.write('{}\t{}\t{}\n'.format(srx, species, tissue))
        

In [None]:
# Preview the first rows of the current `df`.
df.head()

In [109]:
# Returns no rows: in the tables shown in this notebook 'library_strategy'
# holds 'RNA-Seq', while 'SINGLE -' / 'PAIRED -' appear under 'library_layout'.
df[df.library_strategy.str.contains('PAIRED')]

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,description,avg_read_length,tissue


In [117]:
# Split the runs into two frames. The masks come from different columns
# (read length for "paired", layout text for "single"), so nothing here
# guarantees that they are disjoint or that together they cover every row.
# NOTE(review): a later cell selects paired runs via library_layout instead.
df_paired = df[df.avg_read_length>76]
df_unpaired = df[df.library_layout.str.contains('SINGLE')]

In [120]:
# Link every run of the current `df` using copy_sra_data's default directories.
copy_sra_data(df)

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP007412,SRX081869,GSM752557: gga br F 1,SRR306710,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752557: gga br F 1,1480245008,19476908,,76.0
1,SRP007412,SRX081870,GSM752558: gga br M 1,SRR306711,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752558: gga br M 1,1334334888,17557038,,76.0
2,SRP007412,SRX081871,GSM752559: gga cb F 1,SRR306712,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752559: gga cb F 1,1795697524,23627599,,76.0
3,SRP007412,SRX081872,GSM752560: gga cb M 1,SRR306713,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752560: gga cb M 1,1670739920,21983420,,76.0
4,SRP007412,SRX081873,GSM752561: gga ht F 1,SRR306714,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752561: gga ht F 1,1748333260,23004385,,76.0
5,SRP007412,SRX081874,GSM752562: gga ht M 1,SRR306715,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752562: gga ht M 1,1604929848,21117498,,76.0
6,SRP007412,SRX081875,GSM752563: gga kd F 1,SRR306716,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752563: gga kd F 1,1749607628,23021153,,76.0
7,SRP007412,SRX081876,GSM752564: gga kd M 1,SRR306717,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752564: gga kd M 1,1732548288,22796688,,76.0
8,SRP007412,SRX081877,GSM752565: gga lv F 1,SRR306718,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752565: gga lv F 1,2298690376,30245926,,76.0
9,SRP007412,SRX081878,GSM752566: gga lv M 1,SRR306719,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752566: gga lv M 1,653253896,8595446,,76.0


In [121]:
# Link the two subsets into separate 'paired' and 'single' destination trees.
copy_sra_data(df_paired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/paired')
copy_sra_data(df_unpaired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/single')

100%|██████████| 1/1 [00:00<00:00, 122.69it/s]
100%|██████████| 1/1 [00:00<00:00, 227.98it/s]
100%|██████████| 1/1 [00:00<00:00, 234.15it/s]
100%|██████████| 5/5 [00:00<00:00, 178.17it/s]
100%|██████████| 1/1 [00:00<00:00, 233.61it/s]
100%|██████████| 2/2 [00:00<00:00, 250.55it/s]
100%|██████████| 14/14 [00:00<00:00, 219.38it/s]
100%|██████████| 18/18 [00:00<00:00, 249.19it/s]
100%|██████████| 13/13 [00:00<00:00, 222.07it/s]
100%|██████████| 10/10 [00:00<00:00, 199.25it/s]
100%|██████████| 11/11 [00:00<00:00, 213.72it/s]
100%|██████████| 10/10 [00:00<00:00, 187.28it/s]
100%|██████████| 8/8 [00:00<00:00, 215.97it/s]
100%|██████████| 19/19 [00:00<00:00, 218.56it/s]
100%|██████████| 20/20 [00:00<00:00, 250.00it/s]
100%|██████████| 15/15 [00:00<00:00, 254.78it/s]


In [19]:
def create_config_file(df, samples_to_process_dir, strategy=None):
    """Write one config module per species present in ``df``.

    For every taxon id a file is written to the module-level
    ``re_ribo_config_dir`` with the text returned by ``write_config``. The
    file is named ``<species>_<srp>_<strategy>.py`` when ``strategy`` is
    truthy and ``<species>_<srp>.py`` otherwise.

    Parameters
    ----------
    df: DataFrame
        Metadata with 'taxon_id' and 'study_accession' columns; each taxon
        must belong to exactly one study
    samples_to_process_dir: string
                            Raw-data root handed on to write_config
    strategy: string or None
              Optional suffix for the config file name (the notebook passes
              'paired' / 'single')

    Raises
    ------
    AssertionError
        If one taxon id spans several studies.
    KeyError
        If a taxon id is missing from ``taxon_id_map``.
    """
    # Group by the bare column name: with a one-element list pandas >= 2.0
    # yields 1-tuples as keys and the taxon_id_map lookup raises KeyError.
    for taxon_id, df_group in df.groupby('taxon_id'):
        study_accessions = df_group['study_accession'].unique()
        assert len(study_accessions) == 1, 'Multiple SRPs found'
        species = taxon_id_map[taxon_id]
        srp = study_accessions[0]
        if strategy:
            filename = '{}_{}_{}.py'.format(species, srp, strategy)
        else:
            filename = '{}_{}.py'.format(species, srp)
        filepath = os.path.join(re_ribo_config_dir, filename)

        # Rendered before opening so that a failure in write_config (e.g. a
        # species without a cdna_map entry) does not leave an empty file.
        config = write_config(species, srp, samples_to_process_dir)
        with open(filepath, 'w') as fh:
            fh.write(config)
        print('Wrote {}'.format(filepath))

In [20]:
def write_config(species, srp, samples_to_process_dir):
    """Render the text of the config module for one species and study.

    Parameters
    ----------
    species: string
             Key into the module-level ``cdna_map``; also a path component
    srp: string
         Study accession, used as a path component
    samples_to_process_dir: string
                            Root joined with species/srp for RAWDATA_DIR

    Returns
    -------
    config: string
            Four assignments (RAWDATA_DIR, OUT_DIR, CDNA_FA_GZ, CDNA_IDX)
            with the common indentation removed

    Raises
    ------
    KeyError
        If ``species`` has no entry in ``cdna_map``.
    """
    template = """
    RAWDATA_DIR = '{}'
    OUT_DIR = '{}'
    CDNA_FA_GZ = '{}'    
    CDNA_IDX = '{}'    
    """
    fields = [
        os.path.join(samples_to_process_dir, species, srp),
        # OUT_DIR hangs off the module-level re_ribo_analysis_dir.
        os.path.join(re_ribo_analysis_dir, species, srp),
    ]
    cdna_path = cdna_map[species]
    # The index path is the FASTA path with '.fa.gz' swapped for the suffix.
    fields.extend([cdna_path,
                   cdna_path.replace('.fa.gz', '.kallisto.index')])
    return dedent(template.format(*fields))

In [None]:
# Scratch values left in an unexecuted cell (not valid Python): 1666049 2312368 3462856 4651068 5738989 6743138 7648609 8646572 9174364 0054930

In [26]:
# Link the runs of SRP012040 and write configs, separately for the "paired"
# and "single" subsets; each step is skipped when its subset has no rows.
srp = 'SRP012040'
df = db.sra_convert(srp)
#copy_sra_data(df)
#create_config_file(df)
# "Paired" is chosen by read length, "single" by layout text; the two masks
# use different columns. NOTE(review): confirm the 76 threshold for this study.
df_paired = df[df.avg_read_length>76]
df_unpaired = df[df.library_layout.str.contains('SINGLE')]
if len(df_paired.index):
    copy_sra_data(df_paired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/paired')
    create_config_file(df_paired,  '/staging/as/skchoudh/rna-seq-datasets/paired', 'paired')
if len(df_unpaired.index):
    copy_sra_data(df_unpaired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/single')
    create_config_file(df_unpaired,'/staging/as/skchoudh/rna-seq-datasets/single', 'single')

100%|██████████| 125/125 [00:00<00:00, 736.37it/s]

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/mus_musculus_SRP012040_paired.py





In [25]:
# Display the "single" subset; the table below has a header but no rows.
df_unpaired

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length


In [23]:
# Same workflow as the guarded SRP012040 cell, but without the emptiness
# checks: both subsets are passed on even when one of them has no rows.
srp = 'SRP012040'
df = db.sra_convert(srp)
#copy_sra_data(df)
#create_config_file(df)
# "Paired" is chosen by read length, "single" by layout text.
df_paired = df[df.avg_read_length>76]
df_unpaired = df[df.library_layout.str.contains('SINGLE')]
copy_sra_data(df_paired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/paired')
copy_sra_data(df_unpaired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/single')
create_config_file(df_paired,  '/staging/as/skchoudh/rna-seq-datasets/paired', 'paired')
create_config_file(df_unpaired,'/staging/as/skchoudh/rna-seq-datasets/single', 'single')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/mus_musculus_SRP012040_paired.py


In [52]:
# Stage the RNA-Seq runs of SRP000941, split by the declared library layout.
srp = 'SRP000941'
df = db.sra_convert(srp)
# Keep only RNA-Seq rows; df is deliberately rebound to the filtered table.
df = df[df.library_strategy == 'RNA-Seq']
df_paired = df[df.library_layout.str.contains('PAIRED')]
df_unpaired = df[df.library_layout.str.contains('SINGLE')]
# A filter that matches no rows yields an empty frame; skip that layout
# instead of handing an empty frame to the copy/config helpers.
if len(df_paired.index):
    copy_sra_data(df_paired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/paired')
    create_config_file(df_paired,  '/staging/as/skchoudh/rna-seq-datasets/paired', 'paired')
if len(df_unpaired.index):
    copy_sra_data(df_unpaired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/single')
    create_config_file(df_unpaired,'/staging/as/skchoudh/rna-seq-datasets/single', 'single')

100%|██████████| 324/324 [00:00<00:00, 478.22it/s]
100%|██████████| 29/29 [00:00<00:00, 559.09it/s]


Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP000941_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP000941_single.py


In [47]:
# Bare expression: displays whatever df_paired currently holds in the kernel.
df_paired

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
1886,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,SRR1045522,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1106627800,5533139,,200.0
1887,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,SRR1045523,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1897588600,9487943,,200.0
1888,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,SRR1045524,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1901851200,9509256,,200.0
1889,SRP000941,SRX263860,polyA RNA sequencing of STL001 Fat Cells,SRR1045525,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001FT_r1a,1891729200,9458646,,200.0
1890,SRP000941,SRX263862,polyA RNA sequencing of STL001 Gastric Cells,SRR1045526,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001GA_r1a,1189486000,5947430,,200.0
1891,SRP000941,SRX263862,polyA RNA sequencing of STL001 Gastric Cells,SRR1045527,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001GA_r1a,2058524000,10292620,,200.0
1892,SRP000941,SRX263862,polyA RNA sequencing of STL001 Gastric Cells,SRR1045528,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001GA_r1a,2074740800,10373704,,200.0
1893,SRP000941,SRX263862,polyA RNA sequencing of STL001 Gastric Cells,SRR1045529,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001GA_r1a,2065242800,10326214,,200.0
1894,SRP000941,SRX263864,polyA RNA sequencing of STL001 Lung Cells,SRR1045530,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001LG_r1a,2142985400,10714927,,200.0
1895,SRP000941,SRX263864,polyA RNA sequencing of STL001 Lung Cells,SRR1045531,9606,cDNA,PAIRED -,RNA-Seq,TRANSCRIPTOMIC,polyA-RNA-seq_STL001LG_r1a,3656058800,18280294,,200.0


In [49]:
# Bare expression: displays whatever df_unpaired currently holds in the kernel.
df_unpaired

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
672,SRP000941,SRX007166,Sequencing of small RNA from the H1 cell line,SRR020285,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_h1_r1,489650116,11387212,,43.0
673,SRP000941,SRX007168,Sequencing of small RNA from the IMR90 cell line,SRR020286,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,smRNA-seq_imr90_r1,388025507,9023849,,43.0
674,SRP000941,SRX007165,"Strand-specific, shotgun sequencing of mRNA fr...",SRR020287,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_h1_r1,213042296,4954472,,43.0
675,SRP000941,SRX007167,"Strand-specific, shotgun sequencing of mRNA fr...",SRR020292,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-seq_imr90_r1,220029108,5116956,,43.0
711,SRP000941,SRX056684,Shotgun sequencing of polyA+ RNA isolated from...,SRR179588,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r1,2311289650,46225793,,50.0
712,SRP000941,SRX056685,Shotgun sequencing of polyA+ RNA isolated from...,SRR179589,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_19_11_r3,2346790300,46935806,,50.0
713,SRP000941,SRX056686,Shotgun sequencing of polyA+ RNA isolated from...,SRR179590,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_ff_ips_6_9_r1,2657279100,53145582,,50.0
714,SRP000941,SRX056680,Shotgun sequencing of polyA+ RNA isolated from...,SRR179591,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_h1+bmp4_r1,2775254750,55505095,,50.0
715,SRP000941,SRX056681,Shotgun sequencing of polyA+ RNA isolated from...,SRR179592,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_h1+bmp4_r2,2353221450,47064429,,50.0
716,SRP000941,SRX056682,Shotgun sequencing of polyA+ RNA isolated from...,SRR179593,9606,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,mRNA-Seq_h1-npc_r1,2571974800,51439496,,50.0


In [50]:
# Print "<experiment>/<run>.sra" for every row of df_paired whose .sra file
# is not present under the SRP000941 directory.
for index, row in df_paired.iterrows():
    exp_acc, run_acc = row['experiment_accession'], row['run_accession']
    sra_file = '/staging/as/skchoudh/SRA_datasets/SRP000941/{}/{}.sra'.format(exp_acc, run_acc)
    if os.path.exists(sra_file):
        continue
    print('{}/{}.sra'.format(exp_acc, run_acc))

In [51]:
# Print the experiment accession for every row of df_unpaired whose .sra
# file is not present under the SRP000941 directory.
# NOTE(review): only the experiment is printed, not the run, even though the
# existence check is per run -- confirm that is intended.
for index, row in df_unpaired.iterrows():
    exp_acc = row['experiment_accession']
    sra_file = '/staging/as/skchoudh/SRA_datasets/SRP000941/{}/{}.sra'.format(exp_acc, row['run_accession'])
    if os.path.exists(sra_file):
        continue
    print('{}'.format(exp_acc))

In [36]:
# Write config file(s) for the frame currently bound to df_paired, using the
# 'paired' destination directory and label.
# NOTE(review): relies on df_paired left in the kernel by another cell.
create_config_file(df_paired,  '/staging/as/skchoudh/rna-seq-datasets/paired', 'paired')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP000941_paired.py


In [37]:
# Write config file(s) for the frame currently bound to df_unpaired, using the
# 'single' destination directory and label.
# NOTE(review): relies on df_unpaired left in the kernel by another cell.
create_config_file(df_unpaired,'/staging/as/skchoudh/rna-seq-datasets/single', 'single')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP000941_single.py


In [131]:
# Write config file(s) for the frame currently bound to df_paired, using the
# 'paired' destination directory and label.
# NOTE(review): the recorded output names SRP007412, so df_paired must have
# been built from that project in another cell -- confirm before re-running.
create_config_file(df_paired,  '/staging/as/skchoudh/rna-seq-datasets/paired', 'paired')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/macaca_mulatta_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gorilla_gorilla_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_paniscus_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_troglodytes_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pongo_abelii_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP007412_paired.py


In [132]:
# Write config file(s) for the frame currently bound to df_unpaired, using the
# 'single' destination directory and label.
# NOTE(review): the recorded output names SRP007412, so df_unpaired must have
# been built from that project in another cell -- confirm before re-running.
create_config_file(df_unpaired,'/staging/as/skchoudh/rna-seq-datasets/single', 'single')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gallus_gallus_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/ornithorhynchus_anatinus_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/macaca_mulatta_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gorilla_gorilla_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_paniscus_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_troglodytes_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pongo_abelii_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_pr