In [50]:
from collections import defaultdict
from collections import OrderedDict

import pysradb
from pysradb import SRAdb
import os
import glob
import pandas as pd
from riboraptor.helpers import path_leaf, parse_star_logs, millify, order_dataframe
from riboraptor.cutadapt_to_json import cutadapt_to_json
from riboraptor.utils import summary_starlogs_over_runs, mkdir_p

root_dir = '/data1/re-ribo-analysis/'

builds = os.listdir(root_dir)

In [2]:
builds

['Mmul8',
 'GRCg6',
 'panTro3',
 'hg38',
 'Rnor6.0',
 'BDGP6',
 'GRCz11',
 'mm10',
 'WBcel235']

In [45]:
def check_ribotricer_output_exists(srp, srx, assembly):
    path = "/data1/re-ribo-analysis"
    path = os.path.join(path, assembly, srp, "ribotricer_results" ,"{}_translating_ORFs.tsv".format(srx))
    if os.path.exists(path):
        return path
    
def check_ribotricer_metagene_exists(srp, srx, assembly):
    path = "/data1/re-ribo-analysis"
    path_5p = os.path.join(path, assembly, srp, "ribotricer_results" ,"{}_metagene_profiles_5p.tsv".format(srx))
    path_3p = os.path.join(path, assembly, srp, "ribotricer_results" ,"{}_metagene_profiles_3p.tsv".format(srx))
    path_5p_tsv = None
    path_3p_tsv = None
    if os.path.exists(path_5p):
        path_5p_tsv = path_5p
    if os.path.exists(path_3p):
        path_3p_tsv = path_3p
        
    return path_5p_tsv, path_3p_tsv


In [48]:
def get_srp_table(srp, assembly, re_ribo_analysis_dir):
    sradb = SRAdb("/data2/SRAmetadb.sqlite")
    column_order = [
        "study_accession",
        "experiment_title",
        "experiment_accession",
        "run_accession",
        "taxon_id",
        "library_selection",
        "library_layout",
        "library_strategy",
        "library_source",
        "library_name",
        "adapter_spec",
        "bases",
        "spots",
        "avg_read_length",
        "pass1_adapter",
        "pass1_total_reads_processed",
        "pass1_reads_with_adapters",
        "pass2_adapter",
        "pass2_total_reads_processed",
        "pass2_reads_with_adapters",
        "mapping_total_reads_input",
        "uniquely_mapped",
        "uniquely_mapped_percent",
        "ribotricer_orfs"
    ]
    filepath = os.path.join(re_ribo_analysis_dir, assembly, srp)
    if os.path.exists(filepath):

        try:
            srp_df = sradb.sra_metadata(srp.split("_")[0], detailed=True, expand_sample_attributes=True)
        except:
            return pd.DataFrame()
        srp_df.library_layout = srp_df.library_layout.fillna("SINGLE")
        srp_df = srp_df[srp_df.library_layout.str.contains("SINGLE")]

        srp_df["pass1_reads_with_adapters"] = None
        srp_df["pass1_total_reads_processed"] = None
        srp_df["pass1_adapter"] = None
        srp_df["pass2_adapter"] = None
        srp_df["pass2_total_reads_processed"] = None
        srp_df["pass2_reads_with_adapters"] = None
        srp_df["mapping_total_reads_input"] = None
        srp_df["uniquely_mapped"] = None
        srp_df["uniquely_mapped_percent"] = None
        srp_df["ribotricer_orfs"] = None
        srp_df["ribotricer_metagene_5p"] = None
        srp_df["ribotricer_metagene_3p"] = None
        

        srpdir = os.path.join(re_ribo_analysis_dir, assembly, srp)
        starlogsdir = os.path.join(srpdir, "starlogs")
        srp_srx_grouped = srp_df.groupby("experiment_accession")
        preprocess_step1_dir = os.path.join(srpdir, "preprocessed_step1")
        preprocess_step2_dir = os.path.join(srpdir, "preprocessed")
        for srx, srx_group in srp_srx_grouped:
            ribotricer_output = check_ribotricer_output_exists(srp, srx, assembly)
            ribotricer_metagene_5p, ribotricer_metagene_3p = check_ribotricer_metagene_exists(srp, srx, assembly)
            srrs = srx_group["run_accession"].tolist()
            if ribotricer_output:
                srp_df.loc[srp_df.experiment_accession == srx, "ribotricer_orfs"] = ribotricer_output
            srp_df.loc[srp_df.experiment_accession == srx, "ribotricer_metagene_5p"] = ribotricer_metagene_5p
            srp_df.loc[srp_df.experiment_accession == srx, "ribotricer_metagene_3p"] = ribotricer_metagene_3p

                
            # starlogs_df = summary_starlogs_over_runs(starlogsdir, srrs)

            for srr in srrs:
                starlogs_df = None
                if os.path.isfile(os.path.join(starlogsdir, srr + "Log.final.out")):
                    starlogs_df = parse_star_logs(
                        os.path.join(starlogsdir, srr + "Log.final.out")
                    )
                # Preprocessed_step1 adapter info
                step1_txt = os.path.join(
                    preprocess_step1_dir, srr + ".fastq.gz_trimming_report.txt"
                )
                step2_txt = os.path.join(
                    preprocess_step2_dir, srr + "_trimmed.fq.gz_trimming_report.txt"
                )
                step1_cutadapt_json = None
                step2_cutadapt_json = None

                if os.path.isfile(step1_txt):
                    step1_cutadapt_json = cutadapt_to_json(step1_txt)

                if os.path.isfile(step2_txt):
                    step2_cutadapt_json = cutadapt_to_json(step2_txt)

                if step1_cutadapt_json:
                    adapters = step1_cutadapt_json["adapters"]
                    if len(step1_cutadapt_json["adapters"]) == 0:
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass1_adapter"
                        ] = "Empty?"
                    elif isinstance(adapters, str):
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass1_adapter"
                        ] = step1_cutadapt_json["adapters"]
                    else:
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass1_adapter"
                        ] = step1_cutadapt_json["adapters"][
                            "{} - {}".format(srr, "Adapter 1")
                        ]
                        trim_info1 = step1_cutadapt_json["trim_info"][srr]
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass1_total_reads_processed"
                        ] = trim_info1["r_processed"]
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass1_reads_with_adapters"
                        ] = trim_info1["r_with_adapters"]
                if step2_cutadapt_json:
                    adapters = step2_cutadapt_json["adapters"]
                    if len(step2_cutadapt_json["adapters"]) == 0:
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass2_adapter"
                        ] = "Empty?"
                    elif isinstance(adapters, str):
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass2_adapter"
                        ] = step2_cutadapt_json["adapters"]
                    else:
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass2_adapter"
                        ] = step2_cutadapt_json["adapters"][
                            "{} - {}".format(srr + "_trimmed", "Adapter 1")
                        ]
                        trim_info2 = step2_cutadapt_json["trim_info"][srr + "_trimmed"]
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass2_reads_with_adapters"
                        ] = trim_info2["r_with_adapters"]
                        srp_df.loc[
                            srp_df.run_accession == srr, "pass2_total_reads_processed"
                        ] = trim_info2["r_processed"]

                if starlogs_df:
                    srp_df.loc[
                        srp_df.run_accession == srr, "mapping_total_reads_input"
                    ] = starlogs_df["total_reads"]
                    srp_df.loc[
                        srp_df.run_accession == srr, "uniquely_mapped"
                    ] = starlogs_df["uniquely_mapped"]
                    srp_df.loc[
                        srp_df.run_accession == srr, "uniquely_mapped_percent"
                    ] = starlogs_df["uniquely_mapped_percent"]

        cols = [
            "bases",
            "spots",
            "pass1_reads_with_adapters",
            "pass2_reads_with_adapters",
            "pass2_total_reads_processed",
            "pass1_total_reads_processed",
            "uniquely_mapped",
            "mapping_total_reads_input",
        ]
        for col in cols:
            srp_df[col] = srp_df[col].apply(lambda z: millify(z))
        sradb.close()
        return order_dataframe(srp_df, column_order)

In [49]:
srp = 'SRP045777'
assembly = 'Rnor6.0'


Unnamed: 0,study_accession,experiment_title,experiment_accession,run_accession,taxon_id,library_selection,library_layout,library_strategy,library_source,library_name,...,uniquely_mapped_percent,ribotricer_orfs,sample_accession,experiment_attribute,cell_line,rna_type,source_name,treatment,ribotricer_metagene_5p,ribotricer_metagene_3p
0,SRP045777,"GSM1487097: 0 min. OGD, ribo-seq, replicate #1...",SRX686493,SRR1557705,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689312,GEO Accession: GSM1487097,pc12,ribosome protected fragment,pc12,control,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...
1,SRP045777,"GSM1487098: 0 min. OGD, mRNA-seq, replicate #1...",SRX686494,SRR1557706,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689313,GEO Accession: GSM1487098,pc12,ã¢â‚¬å“nakedã¢â‚¬â mrnaseq,pc12,control,,
2,SRP045777,"GSM1487099: 20 min. OGD, ribo-seq, replicate #...",SRX686495,SRR1557707,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689315,GEO Accession: GSM1487099,pc12,ribosome protected fragment,pc12,20 minutes of oxygen and glucose deprivation,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...
3,SRP045777,"GSM1487100: 20 min. OGD, mRNA-seq, replicate #...",SRX686496,SRR1557708,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689316,GEO Accession: GSM1487100,pc12,ã¢â‚¬å“nakedã¢â‚¬â mrnaseq,pc12,20 minutes of oxygen and glucose deprivation,,
4,SRP045777,"GSM1487101: 40 min. OGD, ribo-seq, replicate #...",SRX686497,SRR1557709,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,SRS689314,GEO Accession: GSM1487101,pc12,ribosome protected fragment,pc12,40 minutes of oxygen and glucose deprivation,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...
5,SRP045777,"GSM1487102: 40 min. OGD, mRNA-seq, replicate #...",SRX686498,SRR1557710,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689317,GEO Accession: GSM1487102,pc12,ã¢â‚¬å“nakedã¢â‚¬â mrnaseq,pc12,40 minutes of oxygen and glucose deprivation,,
6,SRP045777,"GSM1487103: 60 min. OGD, ribo-seq, replicate #...",SRX686499,SRR1557711,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,SRS689318,GEO Accession: GSM1487103,pc12,ribosome protected fragment,pc12,60 minutes of oxygen and glucose deprivation,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...
7,SRP045777,"GSM1487104: 60 min. OGD, mRNA-seq, replicate #...",SRX686500,SRR1557712,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689319,GEO Accession: GSM1487104,pc12,ã¢â‚¬å“nakedã¢â‚¬â mrnaseq,pc12,60 minutes of oxygen and glucose deprivation,,
8,SRP045777,"GSM1487105: 0 min. OGD, ribo-seq, replicate #2...",SRX686501,SRR1557713,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,SRS689320,GEO Accession: GSM1487105,pc12,ribosome protected fragment,pc12,control,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...,/data1/re-ribo-analysis/Rnor6.0/SRP045777/ribo...
9,SRP045777,"GSM1487106: 0 min. OGD, mRNA-seq, replicate #2...",SRX686502,SRR1557714,10116,cDNA,SINGLE -,RNA-Seq,TRANSCRIPTOMIC,,...,,,SRS689321,GEO Accession: GSM1487106,pc12,ã¢â‚¬å“nakedã¢â‚¬â mrnaseq,pc12,control,,


In [17]:
ROOT_DIRS = ["/data1/re-ribo-analysis"]# "/data2/re-ribo-analysis", "/data3/re-ribo-analysis", "/data4/re-ribo-analysis"]
READ_LENGTH_DIRNAME = "read_lengths"
METAGENE_COVERAGE_DIRNAME = "metagene_coverages"
METAGENE_LENWISE_COVERAGE_DIRNAME = "metagene_coverage_lengthwise"

# Top level directory of the directories inside each of the ROOT_DIRS
__ASSEMBLIES__ = [os.listdir(dirname) for dirname in ROOT_DIRS]
__SPECIES__ = [
    {"label": "H.sapiens", "value": "hg38"},
    {"label": "M.musculus", "value": "mm10"},
    {"label": "C.albicans", "value": "SC5314"}
]
__ASSEMBLIES__ = list(
    sorted(set([item for sublist in __ASSEMBLIES__ for item in sublist]))
)
__ASSEMBLY_WISE_SRP__ = defaultdict(list)
__SRP_TO_ROOT_DIR_MAP__ = defaultdict(dict)

#DATASETS = {"hg38": pd.read_csv("/data1/hg_datasets.tsv", sep="\t"),
#            "mm10": pd.read_csv("/data1/mm_datasets.tsv", sep="\t")}

for root_dir in ROOT_DIRS:
    for assembly_build in os.listdir(root_dir):
        for srp_dir in filter(
            os.path.isdir, glob.glob(os.path.join(root_dir, assembly_build, "*"))
        ):
            srp = os.path.basename(srp_dir)
            __ASSEMBLY_WISE_SRP__[assembly_build].append(srp)
            __SRP_TO_ROOT_DIR_MAP__[srp][assembly_build] = os.path.join(
                root_dir, assembly_build, srp
            )

def generate_tablex(dataframe, max_rows=26):
    return html.Table(
        # Header
        [html.Tr([html.Th(col) for col in dataframe.columns]) ] +
        # Body
        [html.Tr([
            html.Td(dataframe.iloc[i][col]) for col in dataframe.columns
        ]) for i in range(min(len(dataframe), max_rows))]
    )


In [18]:
__ASSEMBLY_WISE_SRP__ = defaultdict(list)
__SRP_TO_ROOT_DIR_MAP__ = defaultdict(dict)
for root_dir in ROOT_DIRS:
    for assembly_build in os.listdir(root_dir):
        for srp_dir in filter(
            os.path.isdir, glob.glob(os.path.join(root_dir, assembly_build, "*"))
        ):
            srp = os.path.basename(srp_dir)
            __ASSEMBLY_WISE_SRP__[assembly_build].append(srp)
            __SRP_TO_ROOT_DIR_MAP__[srp][assembly_build] = os.path.join(
                root_dir, assembly_build, srp
            )

In [51]:
__ASSEMBLY_WISE_SRP__

defaultdict(list,
            {'Mmul8': ['SRP028612', 'SRP062129'],
             'GRCg6': ['SRP096694'],
             'panTro3': ['SRP028612', 'SRP062129'],
             'hg38': ['SRP065528',
              'ERP021735',
              'SRP102021',
              'SRP065529',
              'SRP115659',
              'SRP044932',
              'SRP102616',
              'SRP103009',
              'SRP090415',
              'SRP044933',
              'SRP044935',
              'SRP075585',
              'SRP044936',
              'SRP058501',
              'SRP028612',
              'SRP102020',
              'SRP062129',
              'SRP113333',
              'SRP065530',
              'SRP083699',
              'SRP114321',
              'SRP044934',
              'SRP067300',
              'SRP044937',
              'SRP059546',
              'SRP101952',
              'SRP098789',
              'SRP059547',
              'SRP062129_rm_quicksect',
              'SRP059548',
            

In [59]:
def get_fragment_lengths(file_path):
    return pd.read_csv(file_path, sep='\t').fragment_length.tolist()


In [61]:
db = SRAdb('/data2/SRAmetadb.sqlite')
all_projects = []
re_ribo_analysis_dir = '/data1/re-ribo-analysis'
for species, sample_list in __ASSEMBLY_WISE_SRP__.items():
    mkdir_p('/data2/re-ribo-analysis-metadata/{}'.format(species))
    for srp in sample_list:
        df = get_srp_table(srp, species, re_ribo_analysis_dir)
        project_filepath = '/data1/re-ribo-analysis/{}/{}'.format(species, srp)    
        metadata_filepath = '/data2/re-ribo-analysis-metadata/{}/{}.tsv'.format(species, srp)
        df_subset = df[df.ribotricer_metagene_5p == df.ribotricer_metagene_5p].ribotricer_metagene_5p.tolist()
        fragment_lengths = []
        for f in df_subset:
            fragment_lengths += get_fragment_lengths(f)
        fragment_length = list(sorted(set(fragment_lengths)))
        all_projects.append((species, srp, project_filepath, metadata_filepath, str(fragment_lengths)))        
        df.to_csv(metadata_filepath, sep='\t', index=False, header=True)

In [62]:
summary_df = pd.DataFrame(all_projects)
summary_df.columns = ['species', 'srp', 'project_output_path', 'project_metadata_path', 'fragment_lengths']
summary_df = summary_df.sort_values(by=['species', 'srp'])
summary_df.to_csv('/data2/datasets.tsv', sep='\t', index=False, header=True)
summary_df

Unnamed: 0,species,srp,project_output_path,project_metadata_path,fragment_lengths
40,BDGP6,ERP008887,/data1/re-ribo-analysis/BDGP6/ERP008887,/data2/re-ribo-analysis-metadata/BDGP6/ERP0088...,[]
42,BDGP6,SRP028243,/data1/re-ribo-analysis/BDGP6/SRP028243,/data2/re-ribo-analysis-metadata/BDGP6/SRP0282...,"[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 3..."
43,BDGP6,SRP033366,/data1/re-ribo-analysis/BDGP6/SRP033366,/data2/re-ribo-analysis-metadata/BDGP6/SRP0333...,"[36, 37, 38, 39, 40, 35, 36, 37, 38, 39, 40, 3..."
44,BDGP6,SRP072369,/data1/re-ribo-analysis/BDGP6/SRP072369,/data2/re-ribo-analysis-metadata/BDGP6/SRP0723...,"[34, 35, 36, 37, 38]"
41,BDGP6,SRP108999,/data1/re-ribo-analysis/BDGP6/SRP108999,/data2/re-ribo-analysis-metadata/BDGP6/SRP1089...,"[64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 7..."
2,GRCg6,SRP096694,/data1/re-ribo-analysis/GRCg6/SRP096694,/data2/re-ribo-analysis-metadata/GRCg6/SRP0966...,"[18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 2..."
48,GRCz11,SRP010040,/data1/re-ribo-analysis/GRCz11/SRP010040,/data2/re-ribo-analysis-metadata/GRCz11/SRP010...,"[24, 24, 24, 24]"
45,GRCz11,SRP021915,/data1/re-ribo-analysis/GRCz11/SRP021915,/data2/re-ribo-analysis-metadata/GRCz11/SRP021...,"[43, 31, 32, 33, 34, 35, 36, 37, 43, 44, 31, 3..."
46,GRCz11,SRP023492,/data1/re-ribo-analysis/GRCz11/SRP023492,/data2/re-ribo-analysis-metadata/GRCz11/SRP023...,"[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3..."
47,GRCz11,SRP033369,/data1/re-ribo-analysis/GRCz11/SRP033369,/data2/re-ribo-analysis-metadata/GRCz11/SRP033...,"[39, 39, 20, 34, 19, 25, 33, 34, 35, 25, 33, 3..."
