In [None]:
import pandas as pd
import os
import openpyxl

# --- user configuration: fill in before running the notebook ---

# Full path to ALTER-code/5_tutorial-workflows/1_degs-off-tgts
proj_dir = ''

# Full path to the miRTarBase csv
mirtar_path = ''

# Full path to the transcript map
transc_map_path = ''

# Name of the reference condition, as written in the sample map's 'condition' column
wt_condition = '01_transfection.control'

In [None]:
def mirtarbase_by_gene(mirtar_path):
    """Load the miRTarBase csv and collapse it to one row per target gene.

    Only rows whose 'Species (miRNA)' value is 'hsa' are kept. The two species
    columns are dropped, the remaining columns are renamed to short
    snake_case names, and the rows are grouped by (gene_name, entrez_id).
    Within each group the per-interaction values are joined into a single
    comma-separated string, preserving row order.

    Parameters
    ----------
    mirtar_path : str
        Path to the downloaded miRTarBase csv.

    Returns
    -------
    pandas.DataFrame
        Columns: gene_name, entrez_id, mirtar_id, miRNA, experiments,
        support, ref_pmid. Missing values in the joined columns are
        represented as empty strings inside the comma-separated lists.
    """
    df = pd.read_csv(mirtar_path, dtype={
        'Target Gene (Entrez ID)': int,
    })

    df = df[df['Species (miRNA)'] == 'hsa'].reset_index(drop=True)
    df = df.drop(columns=['Species (miRNA)', 'Species (Target Gene)'])
    rename_dict = {
        'miRTarBase ID': 'mirtar_id',
        'Target Gene': 'gene_name',
        'Target Gene (Entrez ID)': 'entrez_id',
        'Experiments': 'experiments',
        'Support Type': 'support',
        'References (PMID)': 'ref_pmid'
    }
    df = df.rename(columns=rename_dict)

    # PMIDs may be parsed as floats when the column has gaps; render them as
    # integer strings and use '' for missing entries.
    df['ref_pmid'] = df['ref_pmid'].apply(lambda x: str(int(x)) if not pd.isna(x) else '')

    # str.join only accepts strings, so a single missing cell (read as float
    # NaN) would raise TypeError. Normalise every joined column to str first,
    # using '' for missing values exactly as is done for ref_pmid.
    joined_cols = ['mirtar_id', 'miRNA', 'experiments', 'support', 'ref_pmid']
    df[joined_cols] = df[joined_cols].fillna('').astype(str)

    # One groupby pass; the dict fixes both the aggregation and the output
    # column order, so no positional re-alignment of separate results is needed.
    grouped_df = (
        df.groupby(['gene_name', 'entrez_id'])
        .agg({col: ','.join for col in joined_cols})
        .reset_index()
    )

    return grouped_df


# Directory layout: <proj_dir>/salmon-results/deseq2
salmon_dir = os.path.join(proj_dir, 'salmon-results')
deseq_dir = os.path.join(salmon_dir, 'deseq2')

# Import the transcript map and collapse it to one row per (gene_id, gene_name),
# with that gene's transcript ids joined into a comma-separated string.
# The result is indexed by gene_name.
transc_map_df = pd.read_csv(transc_map_path, sep='\t', compression='infer')
gene_map_df = transc_map_df.groupby(['gene_id', 'gene_name'])['transcript_id'].apply(lambda x:','.join(x)).reset_index()
gene_map_df = gene_map_df.set_index('gene_name')

# If not done already, import the miRTarBase data, collapse it to gene level and
# add a gene_id column taken from the gene map. The result is cached next to the
# miRTarBase csv (plain or gzipped tsv) and re-used on later runs.
mirtar_by_gene_tsv_path = os.path.join(os.path.dirname(mirtar_path), 'miRtarBase-by-gene.tsv')
mirtar_by_gene_tsvgz_path = mirtar_by_gene_tsv_path + '.gz'
if os.path.isfile(mirtar_by_gene_tsv_path):
    mirtar_df = pd.read_csv(mirtar_by_gene_tsv_path, sep='\t')
elif os.path.isfile(mirtar_by_gene_tsvgz_path):
    mirtar_df = pd.read_csv(mirtar_by_gene_tsvgz_path, sep='\t', compression='infer')
else:
    mirtar_df = mirtarbase_by_gene(mirtar_path)

    # gene_name -> gene_id lookup with a unique index. A gene name can appear
    # under more than one gene_id in the gene map; a plain .loc lookup would
    # then return a Series instead of a scalar and corrupt the new column, so
    # only the first gene_id per name is kept.
    gene_id_by_name = gene_map_df['gene_id'][~gene_map_df.index.duplicated(keep='first')]
    mirtar_df['gene_id'] = (
        mirtar_df['gene_name']
        .str.strip()
        .map(gene_id_by_name)
        .fillna('NOT_FOUND')  # gene names absent from the gene map
    )
    mirtar_df.to_csv(mirtar_by_gene_tsvgz_path, sep='\t', index=False, compression='gzip')

display(gene_map_df)
display(mirtar_df)

In [None]:
# Sample map: its 'condition' column lists the condition of each sample.
sample_map_path = os.path.join(deseq_dir, '1_inputs', 'sample-map.tsv')
sample_map_df = pd.read_csv(sample_map_path, sep='\t')

# Every distinct condition except the reference one (wt_condition).
biocondition_list = [
    condition
    for condition in sample_map_df['condition'].unique()
    if condition != wt_condition
]

# Locations of the gene-level count tables and the DESeq2 result tables.
raw_count_path = os.path.join(deseq_dir, '2_txi-counts', 'gene-level-raw-counts.tsv')
deseq_results_path = os.path.join(deseq_dir, '3_results', '2_result-tables')
norm_count_path = os.path.join(deseq_results_path, 'norm-gene-counts.tsv')

raw_count_df = pd.read_csv(raw_count_path, sep='\t')
norm_count_df = pd.read_csv(norm_count_path, sep='\t')

display(sample_map_df)

In [None]:
# For each non-reference condition, import the DEG tables and then count the
# miRNA targets and their frequency in:
# - all genes
# - raw count genes
# - expressed genes
# - deseq assayed genes
# - degs
# - upregulated degs
# - downregulated degs
subset_labels = [
    'All Genes',
    'Raw Count Genes',
    'Expressed Genes',
    'Assayed Genes',
    'DEGs',
    'Up-regulated',
    'Down-regulated'
]

# One row per (gene_id, gene_name) present in the miRNA target table, so that a
# gene listed more than once there is still counted only once per subset.
mirtar_keys = mirtar_df[['gene_id', 'gene_name']].drop_duplicates()

# A gene counts as expressed when its raw counts do not sum to zero. This does
# not depend on the condition, so it is computed once outside the loop.
# NOTE(review): assumes the first two columns of raw_count_df are gene
# identifiers and the remaining columns are per-sample counts - confirm.
expressed_mask = raw_count_df[raw_count_df.columns[2:]].sum(axis=1) != 0

out_df_dict = {}
for biocondition in biocondition_list:
    drop_na_path = os.path.join(deseq_results_path, f'{biocondition}_de-results-drop-na.tsv')
    deg_path = os.path.join(deseq_results_path, f'{biocondition}_de-results-padj0.01-lfc1.tsv')

    drop_na_df = pd.read_csv(drop_na_path, sep='\t')
    deg_df = pd.read_csv(deg_path, sep='\t')

    upreg_mask = deg_df['log2FoldChange'] > 0
    downreg_mask = deg_df['log2FoldChange'] < 0
    # Same order as subset_labels.
    df_list = [
        gene_map_df,
        raw_count_df,
        raw_count_df[expressed_mask],
        drop_na_df,
        deg_df,
        deg_df[upreg_mask],
        deg_df[downreg_mask]
    ]

    # Collect plain records and build the frame once, so the count columns get
    # a numeric dtype instead of object.
    records = []
    for label, subset_df in zip(subset_labels, df_list):
        merge_df = pd.merge(left=subset_df, right=mirtar_keys, on=['gene_id', 'gene_name'], how='inner')
        records.append({
            'subset': label,
            'total_count': len(subset_df),
            'mirna_count': len(merge_df),
        })
    out_df = pd.DataFrame(records, columns=['subset', 'total_count', 'mirna_count'])

    # Numeric division yields NaN for an empty subset (0 / 0) instead of raising.
    out_df['pct_mirna'] = (out_df['mirna_count'] / out_df['total_count'] * 100).round(2)

    out_df_dict[biocondition] = out_df
    display(out_df)

In [None]:
# Write outputs: one gzipped TSV per condition, plus a single Excel workbook
# with one sheet per condition.
out_dir = os.path.join(deseq_dir, '4_mirna-tgt-freq')
os.makedirs(out_dir, exist_ok=True)

excel_path = os.path.join(out_dir, 'mirna-tgt-freq.xlsx')
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    for biocondition, out_df in out_df_dict.items():
        # Per-condition table as a compressed TSV.
        tsv_path = os.path.join(out_dir, f'{biocondition}_mirna-tgt-freq.tsv.gz')
        out_df.to_csv(tsv_path, sep='\t', index=False, compression='gzip')

        # Same table as a sheet named after the condition.
        out_df.to_excel(writer, index=False, sheet_name=biocondition)