In [38]:
%pylab inline
import pandas as pd
import os
from tqdm import tqdm
import fnmatch
import errno    
import glob
from textwrap import dedent
from riboraptor.helpers import mkdir_p
from riboraptor.sradb import SRAdb
from riboraptor.utils import copy_sra_data, create_config_file

def mkdir_p(path):
    """Create `path` and any missing parents, like ``mkdir -p``.

    Parameters
    ----------
    path : str
           Directory to create. A falsy value (e.g. an empty string)
           is ignored.

    Raises
    ------
    OSError
        If creation fails for any reason other than `path` already
        being an existing directory.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as makedirs_error:
        # Only "already exists, and it is a directory" is acceptable.
        already_a_directory = (makedirs_error.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise


def symlink_force(source, destination):
    """Create a symlink, replacing whatever file already sits at the target.

    Parameters
    ----------
    source: string
            Path the link should point to
    destination: string
                 Path of the link to create

    Raises
    ------
    OSError
        If linking fails for a reason other than `destination` already
        existing, or if removing/re-linking the existing entry fails.
    """
    try:
        os.symlink(source, destination)
    except OSError as link_error:
        if link_error.errno != errno.EEXIST:
            raise link_error
        # Destination is taken: drop it and link again.
        os.remove(destination)
        os.symlink(source, destination)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [43]:
# Absolute, machine-specific locations used throughout this notebook.
# Same path as the default `sra_source_dir` of copy_sra_data.
re_ribo_root_dir = '/staging/as/skchoudh/SRA_datasets/'
# Same path as the default `sra_dest_dir` of the copy_sra_data defined in this cell.
samples_to_process_dir = '/staging/as/skchoudh/re-ribo-datasets/'
# Directory create_config_file writes the generated '<species>_<srp>.py' files into.
re_ribo_config_dir = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs'
# Root for OUT_DIR in write_config; NOTE: a later cell reassigns this name.
re_ribo_analysis_dir = '/staging/as/skchoudh/re-ribo-analysis/'
riboraptor_annotation_dir = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/'
# NOTE(review): repeats the mkdir_p helper already defined in the first cell.
def mkdir_p(path):
    """Create `path` and any missing parents, like ``mkdir -p``.

    Parameters
    ----------
    path : str
           Directory to create. A falsy value (e.g. an empty string)
           is ignored.

    Raises
    ------
    OSError
        If creation fails for any reason other than `path` already
        being an existing directory.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as makedirs_error:
        # Only "already exists, and it is a directory" is acceptable.
        already_a_directory = (makedirs_error.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise


# NOTE(review): repeats the symlink_force helper already defined in the first cell.
def symlink_force(source, destination):
    """Create a symlink, replacing whatever file already sits at the target.

    Parameters
    ----------
    source: string
            Path the link should point to
    destination: string
                 Path of the link to create

    Raises
    ------
    OSError
        If linking fails for a reason other than `destination` already
        existing, or if removing/re-linking the existing entry fails.
    """
    try:
        os.symlink(source, destination)
    except OSError as link_error:
        if link_error.errno != errno.EEXIST:
            raise link_error
        # Destination is taken: drop it and link again.
        os.remove(destination)
        os.symlink(source, destination)
# Annotation release label for each genome build key. The labels line up with
# the version fragments in the gtf_map file names (e.g. 'v25' <-> gencode.v25).
# NOTE(review): both 'mg1655' (empty label) and 'MG1655' are present; only the
# upper-case key appears in the other per-genome maps of this cell.
genome_annotation_map = {
    'hg38': 'v25',
    'mm10': 'vM11',
    'mg1655': '',
    'sacCerR64': 'v91',
    'MG1655': 'ASM584v2.38',
    'BDGP6': 'v91',
    'GRCz10': 'v91',
    'panTro3': 'v94',
    'Mmul8': 'v94'
}


# Absolute path of the genome FASTA file for each genome build key.
genome_fasta_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.fa',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/fasta/mm10.fa',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.fa',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.fa',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.fa',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.fa'
}

# Absolute path of the '.sizes' file stored next to each genome FASTA
# (presumably chromosome-name/length tables — confirm against the files).
chrom_sizes_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.chrom.sizes',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/fasta/mm10.chrom.sizes',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/fasta/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.sizes',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/fasta/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.sizes',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/fasta/Drosophila_melanogaster.BDGP6.dna.toplevel.sizes',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/fasta/Danio_rerio.GRCz10.dna.toplevel.sizes',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'
}

# Absolute path of the 'star_annotated' directory for each genome build key
# (presumably a STAR aligner index built with annotation — confirm).
star_index_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/star_annotated',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/star_annotated',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/star_annotated',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/star_annotated',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/star_annotated',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/star_annotated',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/star_annotated',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/star_annotated'
}

# Absolute path of the GTF annotation file for each genome build key.
gtf_map = {
    'hg38':
    '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf',
    'mm10':
    '/home/cmb-panasas2/skchoudh/genomes/mm10/annotation/gencode.vM11.annotation.gtf',
    'sacCerR64':
    '/home/cmb-panasas2/skchoudh/genomes/sacCerR64/annotation/Saccharomyces_cerevisiae.R64-1-1.91.gtf',
    'MG1655':
    '/home/cmb-panasas2/skchoudh/genomes/escherichia_coli_str_k_12_substr_mg1655/annotation/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.38.gtf',
    'BDGP6':
    '/home/cmb-panasas2/skchoudh/genomes/drosophila_melanogaster_BDGP6/annotation/Drosophila_melanogaster.BDGP6.91.gtf',
    'GRCz10':
    '/home/cmb-panasas2/skchoudh/genomes/GRCz10/annotation/Danio_rerio.GRCz10.91.gtf',
    'Mmul8':
    '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf',
    'panTro3':
    '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.94.gtf'
}


def filter_single_end_samples(df):
    """Drop paired-end runs from an SRA metadata table.

    Parameters
    ----------
    df: DataFrame
        Dataframe as obtained from SRAdb.sra_convert(); must contain a
        `library_layout` column.

    Returns
    -------
    df: DataFrame
        Rows of `df` whose `library_layout` does not mention 'PAIRED'.
        Rows with a missing layout are kept.
    """
    # The layout text ('SINGLE - ', 'PAIRED - ...') lives in `library_layout`.
    # `library_strategy` holds the assay type (e.g. 'RNA-Seq'), so matching
    # 'PAIRED' there filtered nothing.
    # na=False keeps the mask boolean when a layout value is missing.
    is_paired = df['library_layout'].str.contains('PAIRED', na=False)
    return df[~is_paired]


def copy_sra_data(df,
                  sra_source_dir='/staging/as/skchoudh/SRA_datasets/',
                  sra_dest_dir='/staging/as/skchoudh/re-ribo-datasets/'):
    """Symlink the .sra files of one study into a per-species directory tree.

    `df` is first passed through filter_single_end_samples. Despite the
    name, nothing is copied: each
    ``<sra_source_dir>/<srp>/<experiment>/<run>.sra`` that exists is
    symlinked to ``<sra_dest_dir>/<species>/<srp>/<experiment>/<run>.sra``.
    Runs whose source file is missing are skipped silently.

    Parameters
    ----------
    df: DataFrame
        Metadata with `study_accession`, `taxon_id`,
        `experiment_accession` and `run_accession` columns.
    sra_source_dir: string
                    Root directory holding the downloaded studies
    sra_dest_dir: string
                  Root directory to create the links under

    Raises
    ------
    AssertionError
        If `df` does not contain exactly one study accession.
    KeyError
        If a taxon_id has no entry in taxon_id_map.
    """
    df = filter_single_end_samples(df)
    study_accessions = df.study_accession.unique()
    assert len(study_accessions) == 1, 'Multiple SRPs found'
    srp = study_accessions[0]
    srp_source_dir = os.path.join(sra_source_dir, srp)

    # Group by a scalar key: with a one-element list pandas >= 2.0 yields
    # 1-tuples as group keys, which are not keys of taxon_id_map.
    for taxon_id, df_group in df.groupby('taxon_id'):
        species = taxon_id_map[taxon_id]
        srp_dest_dir = os.path.join(sra_dest_dir, species, srp)
        mkdir_p(srp_dest_dir)
        # '<experiment>/<run>.sra' is shared by the source and destination trees.
        relative_paths = df_group['experiment_accession'].str.cat(
            df_group['run_accession'] + '.sra', sep=os.path.sep)
        source_loc = srp_source_dir + os.path.sep + relative_paths
        dest_loc = srp_dest_dir + os.path.sep + relative_paths
        with tqdm(total=len(source_loc)) as pbar:
            for source, dest in zip(source_loc, dest_loc):
                mkdir_p(os.path.dirname(dest))
                if os.path.isfile(source):
                    symlink_force(source, dest)
                pbar.update()





In [33]:
# Handle on the SRAmetadb SQLite file; later cells query it through `db`.
db = SRAdb('/staging/as/skchoudh/SRAmetadb.sqlite')


In [4]:
# Fetch the metadata table for study SRP016501; the linking step is left disabled here.
srp = 'SRP016501'
df = db.sra_convert(srp)
#copy_sra_data(df)

In [66]:
# Fetch the metadata table for study SRP007412 and preview the first rows.
srp = 'SRP007412'
df = db.sra_convert(srp)
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP007412,SRX081869,GSM752557: gga br F 1,SRR306710,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752557: gga br F 1,1480245008,19476908,,76.0
1,SRP007412,SRX081870,GSM752558: gga br M 1,SRR306711,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752558: gga br M 1,1334334888,17557038,,76.0
2,SRP007412,SRX081871,GSM752559: gga cb F 1,SRR306712,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752559: gga cb F 1,1795697524,23627599,,76.0
3,SRP007412,SRX081872,GSM752560: gga cb M 1,SRR306713,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752560: gga cb M 1,1670739920,21983420,,76.0
4,SRP007412,SRX081873,GSM752561: gga ht F 1,SRR306714,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752561: gga ht F 1,1748333260,23004385,,76.0


In [36]:
# Distinct taxon IDs in the study; each one is looked up in taxon_id_map by later cells.
#df[df.taxon_id==9031]
df.taxon_id.unique()

array([ 9031,  9258,  9544,  9593,  9597,  9598,  9600,  9606, 10090,
       13616])

In [60]:
# Species directory name for each `taxon_id` value found in the SRA metadata.
# The names are used as path components for linked data, outputs and configs.
# Note that yeast and E. coli use genome-build style names ('sacCerR64',
# 'MG1655') while the other entries use binomial species names.
taxon_id_map = {
    10090: 'mus_musculus',
    9606: 'homo_sapiens',
    4932: 'sacCerR64',
    511145: 'MG1655',
    7227: 'drosophila_melanogaster',
    7955: 'danio_rerio',
    9598: 'pan_troglodytes',
    9544: 'macaca_mulatta',
    9031: 'gallus_gallus',
    9913: 'bos_taurus',
    10116: 'rattus_norvegicus',
    9258: 'ornithorhynchus_anatinus',
    9593: 'gorilla_gorilla',
    9597: 'pan_paniscus',
    9600: 'pongo_abelii', #'pongo_pygmaeus',
    13616: 'monodelphis_domestica'
    
}

# Root of the per-species genome directories.
genomes_dir = '/home/cmb-panasas2/skchoudh/genomes'
# NOTE: replaces the '/staging/as/skchoudh/re-ribo-analysis/' value assigned in an earlier cell.
re_ribo_analysis_dir = '/staging/as/skchoudh/rna-seq-output'






In [47]:
# Species directory name for each `taxon_id` value found in the SRA metadata.
taxon_id_map = {
    10090: 'mus_musculus',
    9606: 'homo_sapiens',
    4932: 'sacCerR64',
    511145: 'MG1655',
    7227: 'drosophila_melanogaster',
    7955: 'danio_rerio',
    9598: 'pan_troglodytes',
    9544: 'macaca_mulatta',
    9031: 'gallus_gallus',
    9913: 'bos_taurus',
    10116: 'rattus_norvegicus',
    9258: 'ornithorhynchus_anatinus',
    9593: 'gorilla_gorilla',
    9597: 'pan_paniscus',
    9600: 'pongo_abelii', #'pongo_pygmaeus',
    13616: 'monodelphis_domestica'
}

# Root of the per-species genome directories.
genomes_dir = '/home/cmb-panasas2/skchoudh/genomes'


def find_cdna_fasta(cdna_dir):
    """Choose the cDNA FASTA file inside `cdna_dir`.

    Parameters
    ----------
    cdna_dir: string
              Directory to search

    Returns
    -------
    path: string or None
          The alphabetically first '*.fa.gz' file; if there is none, the
          alphabetically first entry of any kind; None for an empty
          directory.
    """
    # write_config derives the index path by replacing '.fa.gz' in this path,
    # so an index stored alongside the FASTA must not be picked up instead.
    # glob returns entries in arbitrary order, hence the explicit sort.
    fasta_candidates = sorted(glob.glob(os.path.join(cdna_dir, '*.fa.gz')))
    if fasta_candidates:
        return fasta_candidates[0]
    any_candidates = sorted(glob.glob(os.path.join(cdna_dir, '*')))
    return any_candidates[0] if any_candidates else None


# species name -> cDNA file, for every species that has a 'cdna' directory.
cdna_map = {}
for species_name in taxon_id_map.values():
    cdna_dir = os.path.join(genomes_dir, species_name, 'cdna')
    if not os.path.exists(cdna_dir):
        continue
    cdna_path = find_cdna_fasta(cdna_dir)
    # An empty 'cdna' directory is skipped rather than raising IndexError.
    if cdna_path is not None:
        cdna_map[species_name] = cdna_path
def create_config_file(df):
    """Write one '<species>_<srp>.py' config file per species present in `df`.

    Files are written to `re_ribo_config_dir`; their text comes from
    write_config, and the path of every written file is printed.

    Parameters
    ----------
    df: DataFrame
        Metadata with `taxon_id` and `study_accession` columns.

    Raises
    ------
    AssertionError
        If a species group spans more than one study accession.
    KeyError
        If a taxon_id has no entry in taxon_id_map.
    """
    # Group by a scalar key: with a one-element list pandas >= 2.0 yields
    # 1-tuples as group keys, which are not keys of taxon_id_map.
    for taxon_id, df_group in df.groupby('taxon_id'):
        study_accessions = df_group['study_accession'].unique()
        assert len(study_accessions) == 1, 'Multiple SRPs found'
        species = taxon_id_map[taxon_id]
        srp = study_accessions[0]
        config_path = os.path.join(re_ribo_config_dir,
                                   '{}_{}.py'.format(species, srp))
        # write_config as defined in this notebook takes the raw-data root as
        # a third argument; the notebook-level `samples_to_process_dir` is
        # used for it. Rendering happens before the file is opened so that a
        # failure does not leave an empty config behind.
        config = write_config(species, srp, samples_to_process_dir)
        with open(config_path, 'w') as fh:
            fh.write(config)
        print('Wrote {}'.format(config_path))

In [41]:
def copy_sra_data(df,
                  sra_source_dir='/staging/as/skchoudh/SRA_datasets/',
                  sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/'):
    """Symlink the .sra files of one study into a per-species directory tree.

    Despite the name, nothing is copied, and this version applies no
    single-end filtering: every row of `df` is considered. Each
    ``<sra_source_dir>/<srp>/<experiment>/<run>.sra`` that exists is
    symlinked to ``<sra_dest_dir>/<species>/<srp>/<experiment>/<run>.sra``.
    Runs whose source file is missing are skipped silently.

    Parameters
    ----------
    df: DataFrame
        Metadata with `study_accession`, `taxon_id`,
        `experiment_accession` and `run_accession` columns.
    sra_source_dir: string
                    Root directory holding the downloaded studies
    sra_dest_dir: string
                  Root directory to create the links under

    Raises
    ------
    AssertionError
        If `df` does not contain exactly one study accession.
    KeyError
        If a taxon_id has no entry in taxon_id_map.
    """
    study_accessions = df.study_accession.unique()
    assert len(study_accessions) == 1, 'Multiple SRPs found'
    srp = study_accessions[0]
    srp_source_dir = os.path.join(sra_source_dir, srp)

    # Group by a scalar key: with a one-element list pandas >= 2.0 yields
    # 1-tuples as group keys, which are not keys of taxon_id_map.
    for taxon_id, df_group in df.groupby('taxon_id'):
        species = taxon_id_map[taxon_id]
        srp_dest_dir = os.path.join(sra_dest_dir, species, srp)
        mkdir_p(srp_dest_dir)
        # '<experiment>/<run>.sra' is shared by the source and destination trees.
        relative_paths = df_group['experiment_accession'].str.cat(
            df_group['run_accession'] + '.sra', sep=os.path.sep)
        source_loc = srp_source_dir + os.path.sep + relative_paths
        dest_loc = srp_dest_dir + os.path.sep + relative_paths
        with tqdm(total=len(source_loc)) as pbar:
            for source, dest in zip(source_loc, dest_loc):
                mkdir_p(os.path.dirname(dest))
                if os.path.isfile(source):
                    symlink_force(source, dest)
                pbar.update()

In [76]:
# Alphabetical list of the 'name' entries db.desc_table reports for 'sra_ft'
# (presumably that table's column names — confirm against SRAdb).
sorted(db.desc_table('sra_ft')['name'].tolist())

['SRR_bamFile',
 'SRX_bamFile',
 'SRX_fastqFTP',
 'adapter_spec',
 'anonymized_name',
 'base_caller',
 'bases',
 'center_project_name',
 'common_name',
 'description',
 'design_description',
 'experiment_ID',
 'experiment_accession',
 'experiment_alias',
 'experiment_attribute',
 'experiment_entrez_link',
 'experiment_name',
 'experiment_title',
 'experiment_url_link',
 'individual_name',
 'instrument_model',
 'instrument_name',
 'library_construction_protocol',
 'library_layout',
 'library_name',
 'library_selection',
 'library_source',
 'library_strategy',
 'multiplier',
 'number_of_levels',
 'platform',
 'platform_parameters',
 'primary_study',
 'qtype',
 'quality_scorer',
 'read_spec',
 'related_studies',
 'run_ID',
 'run_accession',
 'run_alias',
 'run_attribute',
 'run_center',
 'run_date',
 'run_entrez_link',
 'run_url_link',
 'sample_ID',
 'sample_accession',
 'sample_alias',
 'sample_attribute',
 'sample_entrez_link',
 'sample_name',
 'sample_url_link',
 'sequence_space',
 'sp

In [61]:
# Fetch the metadata for study SRP016501 and link its .sra files (one progress bar per species group).
srp = 'SRP016501'
df = db.sra_convert(srp)
copy_sra_data(df)

100%|██████████| 27/27 [00:00<00:00, 325.50it/s]
100%|██████████| 27/27 [00:00<00:00, 369.01it/s]
100%|██████████| 27/27 [00:00<00:00, 372.44it/s]
100%|██████████| 26/26 [00:00<00:00, 393.26it/s]
100%|██████████| 27/27 [00:00<00:00, 407.28it/s]


In [151]:
# Build one TPM table per species for study SRP016501: rows are target_ids,
# columns are tissue labels, one column per experiment.
srp = 'SRP016501'
prefix = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/{}'.format(srp)
cols = ['study_accession',
 'experiment_accession',
 'experiment_title',
 'run_accession',
 'taxon_id',
 'library_selection',
 'library_layout',
 'library_strategy',
 'library_source',
 'library_name',
 'bases',
 'spots',
 'adapter_spec', 'description']
df = db.sra_convert(srp, out_type=cols)
# Tissue label = text between ': ' and the first '; ' of the experiment title.
df['tissue'] = df['experiment_title'].str.split(': ').str.get(1).str.split('; ').str.get(0)
df = df.sort_values(by=['tissue', 'experiment_accession'])
experiments = df[['study_accession', 'experiment_accession', 'taxon_id', 'tissue']].drop_duplicates()
# Group by a scalar key: with a one-element list pandas >= 2.0 yields
# 1-tuples as group keys, which are not keys of taxon_id_map.
for taxon_id, group in experiments.groupby('taxon_id'):
    species = taxon_id_map[taxon_id]
    filepath = '{}-{}.tsv'.format(prefix, species)
    group = group.sort_values(by=['tissue', 'experiment_accession'])
    abundances = pd.DataFrame()
    for index, row in group.iterrows():
        srp = row['study_accession']
        # Drop the first '_'-separated token of the tissue label.
        tissue = ('_').join(row['tissue'].split('_')[1:])
        srx = row['experiment_accession']

        # abundance.tsv columns: target_id, length, eff_length, est_counts, tpm
        abundance = pd.read_table(
            '/staging/as/skchoudh/rna-seq-output/{}/{}/counts/{}/abundance.tsv'.format(species, srp, srx)
        )[['target_id', 'tpm']].rename(columns={'tpm': tissue}).set_index('target_id')

        # NOTE(review): two experiments with the same tissue label would
        # produce overlapping column names here — confirm labels are unique.
        abundances = abundances.join(abundance, how='outer')
    abundances = abundances.reset_index()
    abundances.to_csv(filepath, header=True, index=False, sep='\t')
    print('Wrote: {}'.format(filepath))

Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-gallus_gallus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-macaca_mulatta.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-bos_taurus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-mus_musculus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP016501-rattus_norvegicus.tsv


In [62]:
# Write one config file per species found in the current `df`.
create_config_file(df)

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gallus_gallus_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/macaca_mulatta_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/bos_taurus_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/mus_musculus_SRP016501.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/rattus_norvegicus_SRP016501.py


In [149]:
# Build one TPM table per species for study SRP007412: rows are target_ids,
# columns are tissue labels, one column per experiment.
# Relies on `cols` from the SRP016501 abundance cell.
srp = 'SRP007412'
# NOTE(review): this first query result is overwritten a few lines below.
df = db.sra_convert(srp)
prefix = '/home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/{}'.format(srp)

#copy_sra_data(df)
df = db.sra_convert(srp, out_type=cols+['experiment_name'])
# Tissue label = title text after ': ' with its first three characters
# removed (e.g. 'gga br F 1' -> 'br_F_1'), spaces turned into underscores.
df['tissue'] = df['experiment_title'].str.split(': ').str.get(1).str[3:].str.lstrip(' ').str.replace(' ', '_')
df = df.sort_values(by=['tissue', 'experiment_accession'])
experiments = df[['study_accession', 'experiment_accession', 'taxon_id', 'tissue']].drop_duplicates()
# Group by a scalar key: with a one-element list pandas >= 2.0 yields
# 1-tuples as group keys, which are not keys of taxon_id_map.
for taxon_id, group in experiments.groupby('taxon_id'):
    species = taxon_id_map[taxon_id]
    filepath = '{}-{}.tsv'.format(prefix, species)
    group = group.sort_values(by=['tissue', 'experiment_accession'])
    abundances = pd.DataFrame()
    for index, row in group.iterrows():
        srp = row['study_accession']
        tissue = row['tissue']
        srx = row['experiment_accession']

        # abundance.tsv columns: target_id, length, eff_length, est_counts, tpm
        abundance = pd.read_table(
            '/staging/as/skchoudh/rna-seq-output/{}/{}/counts/{}/abundance.tsv'.format(species, srp, srx)
        )[['target_id', 'tpm']].rename(columns={'tpm': tissue}).set_index('target_id')

        # NOTE(review): two experiments with the same tissue label would
        # produce overlapping column names here — confirm labels are unique.
        abundances = abundances.join(abundance, how='outer')
    abundances = abundances.reset_index()
    abundances.to_csv(filepath, header=True, index=False, sep='\t')
    print('Wrote: {}'.format(filepath))
#create_config_file(df)

Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-gallus_gallus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-ornithorhynchus_anatinus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-macaca_mulatta.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-gorilla_gorilla.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-pan_paniscus.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-pan_troglodytes.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-pongo_abelii.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-data/SRP007412-homo_sapiens.tsv
Wrote: /home/cmb-panasas2/skchoudh/github_projects/EE-546-project/cross-species-dat

In [148]:
# Preview the metadata, now including the derived `tissue` column.
df.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,description,experiment_name,avg_read_length,tissue
0,SRP007412,SRX081869,GSM752557: gga br F 1,SRR306710,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752557: gga br F 1,1480245008,19476908,,,GSM752557: gga br F 1,76.0,br_F_1
14,SRP007412,SRX081881,GSM752569: oan br F 1,SRR306724,9258,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752569: oan br F 1,820587580,10797205,,,GSM752569: oan br F 1,76.0,br_F_1
15,SRP007412,SRX081881,GSM752569: oan br F 1,SRR306725,9258,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752569: oan br F 1,1850093840,24343340,,,GSM752569: oan br F 1,76.0,br_F_1
134,SRP007412,SRX081893,GSM752588: mdo br F 1,SRR306742,13616,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752588: mdo br F 1,860751832,11325682,,,GSM752588: mdo br F 1,76.0,br_F_1
135,SRP007412,SRX081893,GSM752588: mdo br F 1,SRR306743,13616,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752588: mdo br F 1,3615666256,47574556,,,GSM752588: mdo br F 1,76.0,br_F_1


In [109]:
# Check for paired-end rows via `library_strategy`. The result is empty:
# in the previews above that column holds 'RNA-Seq', while the
# 'SINGLE ...' layout text is in `library_layout`.
df[df.library_strategy.str.contains('PAIRED')]

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,description,avg_read_length,tissue


In [117]:
# Split the runs by layout.
# NOTE(review): the two frames use different criteria — "paired" is inferred
# from avg_read_length > 76 (the single-end rows previewed above report 76.0)
# while "unpaired" matches the library_layout text — so they are not
# guaranteed to be complementary; confirm this is intended.
df_paired = df[df.avg_read_length>76]
df_unpaired = df[df.library_layout.str.contains('SINGLE')]

In [120]:
# Link every run of the current `df` using the default source/destination roots.
copy_sra_data(df)

Unnamed: 0,study_accession,experiment_accession,experiment_title,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,bases,spots,adapter_spec,avg_read_length
0,SRP007412,SRX081869,GSM752557: gga br F 1,SRR306710,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752557: gga br F 1,1480245008,19476908,,76.0
1,SRP007412,SRX081870,GSM752558: gga br M 1,SRR306711,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752558: gga br M 1,1334334888,17557038,,76.0
2,SRP007412,SRX081871,GSM752559: gga cb F 1,SRR306712,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752559: gga cb F 1,1795697524,23627599,,76.0
3,SRP007412,SRX081872,GSM752560: gga cb M 1,SRR306713,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752560: gga cb M 1,1670739920,21983420,,76.0
4,SRP007412,SRX081873,GSM752561: gga ht F 1,SRR306714,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752561: gga ht F 1,1748333260,23004385,,76.0
5,SRP007412,SRX081874,GSM752562: gga ht M 1,SRR306715,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752562: gga ht M 1,1604929848,21117498,,76.0
6,SRP007412,SRX081875,GSM752563: gga kd F 1,SRR306716,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752563: gga kd F 1,1749607628,23021153,,76.0
7,SRP007412,SRX081876,GSM752564: gga kd M 1,SRR306717,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752564: gga kd M 1,1732548288,22796688,,76.0
8,SRP007412,SRX081877,GSM752565: gga lv F 1,SRR306718,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752565: gga lv F 1,2298690376,30245926,,76.0
9,SRP007412,SRX081878,GSM752566: gga lv M 1,SRR306719,9031,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,GSM752566: gga lv M 1,653253896,8595446,,76.0


In [121]:
# Link the two subsets into separate 'paired' and 'single' destination trees.
copy_sra_data(df_paired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/paired')
copy_sra_data(df_unpaired, sra_dest_dir='/staging/as/skchoudh/rna-seq-datasets/single')

100%|██████████| 1/1 [00:00<00:00, 122.69it/s]
100%|██████████| 1/1 [00:00<00:00, 227.98it/s]
100%|██████████| 1/1 [00:00<00:00, 234.15it/s]
100%|██████████| 5/5 [00:00<00:00, 178.17it/s]
100%|██████████| 1/1 [00:00<00:00, 233.61it/s]
100%|██████████| 2/2 [00:00<00:00, 250.55it/s]
100%|██████████| 14/14 [00:00<00:00, 219.38it/s]
100%|██████████| 18/18 [00:00<00:00, 249.19it/s]
100%|██████████| 13/13 [00:00<00:00, 222.07it/s]
100%|██████████| 10/10 [00:00<00:00, 199.25it/s]
100%|██████████| 11/11 [00:00<00:00, 213.72it/s]
100%|██████████| 10/10 [00:00<00:00, 187.28it/s]
100%|██████████| 8/8 [00:00<00:00, 215.97it/s]
100%|██████████| 19/19 [00:00<00:00, 218.56it/s]
100%|██████████| 20/20 [00:00<00:00, 250.00it/s]
100%|██████████| 15/15 [00:00<00:00, 254.78it/s]


In [127]:
def create_config_file(df, samples_to_process_dir, strategy=None):
    """Write one config file per species present in `df`.

    Files go to `re_ribo_config_dir` and are named '<species>_<srp>.py',
    or '<species>_<srp>_<strategy>.py' when `strategy` is given. Their
    text comes from write_config; each written path is printed.

    Parameters
    ----------
    df: DataFrame
        Metadata with `taxon_id` and `study_accession` columns.
    samples_to_process_dir: string
                            Raw-data root forwarded to write_config
    strategy: string, optional
              Suffix appended to the config file name (the notebook
              passes 'paired' or 'single')

    Raises
    ------
    AssertionError
        If a species group spans more than one study accession.
    KeyError
        If a taxon_id has no entry in taxon_id_map.
    """
    # Group by a scalar key: with a one-element list pandas >= 2.0 yields
    # 1-tuples as group keys, which are not keys of taxon_id_map.
    for taxon_id, df_group in df.groupby('taxon_id'):
        study_accessions = df_group['study_accession'].unique()
        assert len(study_accessions) == 1, 'Multiple SRPs found'
        species = taxon_id_map[taxon_id]
        srp = study_accessions[0]
        if strategy:
            filename = '{}_{}_{}.py'.format(species, srp, strategy)
        else:
            filename = '{}_{}.py'.format(species, srp)
        filepath = os.path.join(re_ribo_config_dir, filename)

        # Render before opening so that a failure in write_config does not
        # leave an empty config file behind.
        config = write_config(species, srp, samples_to_process_dir)
        with open(filepath, 'w') as fh:
            fh.write(config)
        print('Wrote {}'.format(filepath))

In [130]:
def write_config(species, srp, samples_to_process_dir):
    """Render the text of the config file for one species/study pair.

    Parameters
    ----------
    species: string
             Species name; must be a key of cdna_map
    srp: string
         Study accession
    samples_to_process_dir: string
                            Root under which '<species>/<srp>' raw data lives

    Returns
    -------
    config: string
            Four assignments (RAWDATA_DIR, OUT_DIR, CDNA_FA_GZ, CDNA_IDX)
            with the common indentation removed.
    """
    raw_dir = os.path.join(samples_to_process_dir, species, srp)
    analysis_dir = os.path.join(re_ribo_analysis_dir, species, srp)
    cdna_fasta = cdna_map[species]
    # The index path is the FASTA path with its '.fa.gz' suffix swapped.
    kallisto_index = cdna_fasta.replace('.fa.gz', '.kallisto.index')
    template = """
    RAWDATA_DIR = '{}'
    OUT_DIR = '{}'
    CDNA_FA_GZ = '{}'    
    CDNA_IDX = '{}'    
    """
    rendered = template.format(raw_dir, analysis_dir, cdna_fasta,
                               kallisto_index)
    return dedent(rendered)

In [131]:
# Write the '_paired' configs, pointing at the 'paired' link tree.
create_config_file(df_paired,  '/staging/as/skchoudh/rna-seq-datasets/paired', 'paired')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/macaca_mulatta_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gorilla_gorilla_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_paniscus_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_troglodytes_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pongo_abelii_SRP007412_paired.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP007412_paired.py


In [132]:
# Write the '_single' configs, pointing at the 'single' link tree.
create_config_file(df_unpaired,'/staging/as/skchoudh/rna-seq-datasets/single', 'single')

Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gallus_gallus_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/ornithorhynchus_anatinus_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/macaca_mulatta_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/gorilla_gorilla_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_paniscus_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pan_troglodytes_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/pongo_abelii_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_projects/EE-546-project//snakemake/configs/homo_sapiens_SRP007412_single.py
Wrote /home/cmb-panasas2/skchoudh/github_pr