<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/workflow/notebooks/expression-candidates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)

#### **Ranking genes by median and mean expression**

In [12]:
def load_candidates(analysis, name='median', func=np.nanmedian, query_annotation=None, query_fc=None):
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{analysis}.tsv", sep="\t")
    fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])

    if query_annotation is not None:
      gene_annot_df = load_annotations()
      gene_ids = gene_ids_from_annotation(gene_annot_df=gene_annot_df, annotation=query_annotation)
      fc_data = fc_data.query("GeneID in @gene_ids")
      assert not fc_data.empty, "No genes were found for the selection. It is possible these genes were removed by the ortholog finding process"
    
    fc_ranked = fc_data.apply(func, axis=1).to_frame().rename(columns={0:f'{name} log2 Fold Change'}).copy()
    fc_ranked = fc_ranked.sort_values(f'{name} log2 Fold Change', ascending=False)
    fc_ranked = fc_ranked.reset_index()
    fc_ranked.loc[:, f'{name} Fold Change'] = np.round(2**fc_ranked.loc[:, f'{name} log2 Fold Change'], 2)

    if query_fc is not None:
      fc_ranked = fc_ranked.query(f'`{name} Fold Change` > {query_fc}')

    return(fc_ranked)
    
def gene_ids_from_annotation(gene_annot_df, annotation):
    if isinstance(annotation, list):
        gene_list = np.array([])
        if annotation[0].startswith("GO"):
            for go in annotation:
                ids = gene_annot_df.query(f"GO_terms.str.contains('{go}', na=False)", engine='python')['gene_id'].to_numpy()
                gene_list = np.hstack([gene_list, ids])
            return(np.unique(gene_list))
        else:
            for dom in annotation:
                ids = gene_annot_df.query("domain == @annotation")['gene_id'].to_numpy()
                gene_list = np.hstack([gene_list, ids])
            return(np.unique(gene_list))
    else:
        if annotation.startswith("GO"): 
            return(gene_annot_df.query(f"GO_terms.str.contains('{annotation}', na=False)", engine='python')['gene_id'].to_numpy())
        else:
            return(gene_annot_df.query("domain == @annotation")['gene_id'].to_numpy())

def load_annotations():
    pfam_df = pd.read_csv("https://github.com/sanjaynagi/rna-seq-meta/blob/main/resources/Anogam_long.pep_Pfamscan.seqs.gz?raw=true", sep="\s+", header=None, compression='gzip')
    go_df = pd.read_csv("https://github.com/sanjaynagi/rna-seq-meta/blob/main/resources/Anogam_long.pep_eggnog_diamond.emapper.annotations.GO.gz?raw=true", sep="\t", header=None, compression='gzip')
    pfam_df.columns = ["transcript", "pstart", "pend", "pfamid", "domain", "domseq"]
    go_df.columns = ['transcript', 'GO_terms']

    gene_annot_df = pfam_df.merge(go_df)
    gene_annot_df.loc[:, 'gene_id'] = gene_annot_df.loc[:, 'transcript'].str.replace("Anogam_", "").str.replace("-R[A-Z]", "", regex=True)
    return(gene_annot_df)

In [13]:
fc_median = load_candidates("gamb_colu_arab", "median", func=np.nanmedian)
fc_median.head(200)

Unnamed: 0,GeneID,GeneName,GeneDescription,median log2 Fold Change,median Fold Change
0,AGAP000047,CPR130,cuticular protein RR-2 family 130 [Source:VB C...,3.09,8.51
1,AGAP001684,,Alkaline phosphatase [Source:UniProtKB/TrEMBL;...,2.77,6.82
2,AGAP002894,CYP6Z4,cytochrome P450 [Source:VB Community Annotation],2.73,6.63
3,AGAP028557,,,2.62,6.15
4,AGAP028402,,,2.59,6.02
5,AGAP008447,CPLCG4,cuticular protein CPLCG family (CPLCG4) [Sourc...,2.57,5.94
6,AGAP006417,,venom allergen [Source:VB Community Annotation],2.51,5.7
7,AGAP008448,,,2.51,5.7
8,AGAP002867,CYP6P4,cytochrome P450 [Source:VB Community Annotation],2.51,5.7
9,AGAP006149,CPLCX3,cuticular protein unclassified [Source:VB Comm...,2.46,5.5


Rank by mean fold change

In [14]:
fc_means = load_candidates(analysis="gamb_colu_arab_fun", name='mean', func=np.nanmean)
fc_means.head(200)

Unnamed: 0,GeneID,GeneName,GeneDescription,mean log2 Fold Change,mean Fold Change
0,AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],2.764643,6.8
1,AGAP028402,,,2.691923,6.46
2,AGAP002867,CYP6P4,cytochrome P450 [Source:VB Community Annotation],2.603929,6.08
3,AGAP010109,CPR150,cuticular protein 150 [Source:VB Community Ann...,2.305,4.94
4,AGAP001684,,Alkaline phosphatase [Source:UniProtKB/TrEMBL;...,2.145714,4.43
5,AGAP007445,,,2.099643,4.29
6,AGAP002743,,,2.085357,4.24
7,AGAP006417,,venom allergen [Source:VB Community Annotation],2.026786,4.07
8,AGAP008447,CPLCG4,cuticular protein CPLCG family (CPLCG4) [Sourc...,2.014286,4.04
9,AGAP008817,CPLCP3,cuticular protein (putative) CPLCP3 [Source:VB...,1.989231,3.97


**Finding genes that are consistently over-expressed**

first, any genes which show a positive fold change in all 35 comparisons. 

In [9]:
def consistent_genes(analysis, direction, n):
    
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{database}.tsv", sep="\t")
    print(f"There are {fc_data.shape[0]} genes and {fc_data.shape[1]} differential expression comparisons in {database}")
    fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])
    if direction == 'up':
        return(fc_data[fc_data.apply(lambda x: (x > 0).sum() >= n , axis=1)])
    else: 
        return(fc_data[fc_data.apply(lambda x: (x < 0).sum() >= n, axis=1)])

In [10]:
consistent_genes("gamb_colu_arab_fun", "up", 20)

NameError: ignored

Allowing for 3 out of 35 arrays to be negative 

In [None]:
consistent_genes("gamb_colu_arab_fun", "up", 24)

Maf-S is consistently upregulated.