<a href="https://colab.research.google.com/github/sanjaynagi/rna-seq-meta/blob/main/workflow/notebooks/expression-candidates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)

#### **Ranking genes by median and mean expression**

In [44]:
def load_candidates(database, name, func=np.nanmedian):
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{database}.tsv", sep="\t")
    fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])
    
    fc_ranked = fc_data.apply(np.nanmedian, axis=1).to_frame().rename(columns={0:f'{name} log2 Fold Change'})
    fc_ranked = fc_ranked.sort_values(f'{name} log2 Fold Change', ascending=False)
    fc_ranked = fc_ranked.reset_index()
    fc_ranked.loc[:, f'{name} Fold Change'] = np.round(2**fc_ranked.loc[:, f'{name} log2 Fold Change'], 2)
    return(fc_ranked)

In [45]:
fc_median = load_candidates("gamb_colu_arab", "median", func=np.nanmedian)
fc_median.head(200)

Unnamed: 0,GeneID,GeneName,GeneDescription,median log2 Fold Change,median Fold Change
0,AGAP000047,CPR130,cuticular protein RR-2 family 130 [Source:VB C...,3.09,8.51
1,AGAP001684,,Alkaline phosphatase [Source:UniProtKB/TrEMBL;...,2.77,6.82
2,AGAP002894,CYP6Z4,cytochrome P450 [Source:VB Community Annotation],2.73,6.63
3,AGAP028557,,,2.62,6.15
4,AGAP028402,,,2.59,6.02
5,AGAP008447,CPLCG4,cuticular protein CPLCG family (CPLCG4) [Sourc...,2.57,5.94
6,AGAP006417,,venom allergen [Source:VB Community Annotation],2.51,5.7
7,AGAP008448,,,2.51,5.7
8,AGAP002867,CYP6P4,cytochrome P450 [Source:VB Community Annotation],2.51,5.7
9,AGAP006149,CPLCX3,cuticular protein unclassified [Source:VB Comm...,2.46,5.5


Rank by mean fold change

In [46]:
fc_means = load_candidates(database="gamb_colu_arab_fun", name='mean', func=np.nanmean)
fc_means.head(200)

Unnamed: 0,GeneID,GeneName,GeneDescription,mean log2 Fold Change,mean Fold Change
0,AGAP028402,,,2.77,6.82
1,AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],2.73,6.63
2,AGAP008817,CPLCP3,cuticular protein (putative) CPLCP3 [Source:VB...,2.56,5.9
3,AGAP002867,CYP6P4,cytochrome P450 [Source:VB Community Annotation],2.535,5.8
4,AGAP008448,,,2.41,5.31
5,AGAP001684,,Alkaline phosphatase [Source:UniProtKB/TrEMBL;...,2.365,5.15
6,AGAP002894,CYP6Z4,cytochrome P450 [Source:VB Community Annotation],2.315,4.98
7,AGAP006149,CPLCX3,cuticular protein unclassified [Source:VB Comm...,2.25,4.76
8,AGAP002866,CYP6P5,cytochrome P450 [Source:VB Community Annotation],2.235,4.71
9,AGAP027993,,,2.195,4.58


**Finding genes that are consistently over-expressed**

first, any genes which show a positive fold change in all 35 comparisons. 

In [49]:
def consistent_genes(database, direction, n):
    
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/rna-seq-meta/main/results/fcs.{database}.tsv", sep="\t")
    print(f"There are {fc_data.shape[0]} genes and {fc_data.shape[1]} differential expression comparisons in {database}")
    fc_data = fc_data.set_index(['GeneID', 'GeneName', 'GeneDescription'])
    if direction == 'up':
        return(fc_data[fc_data.apply(lambda x: (x > 0).sum() >= n , axis=1)])
    else: 
        return(fc_data[fc_data.apply(lambda x: (x < 0).sum() >= n, axis=1)])

In [53]:
consistent_genes("gamb_colu_arab_fun", "up", 20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tiefora_v_Ngousso_log2FoldChange,Ban_v_BanS_log2FoldChange,BanRe_v_BanS_log2FoldChange,Bak_v_Kisumu_log2FoldChange,VK7_v_Kisumu_log2FoldChange,Cameroon_v_Ngousso_log2FoldChange,Chad_v_Ngousso_log2FoldChange,Niger_v_Ngousso_log2FoldChange,Nigeria_v_Ngousso_log2FoldChange,Agboville_v_Mali_log2FoldChange,...,Asendabo_v_Moz_log2FoldChange,Chewaka_v_Moz_log2FoldChange,Tolay_v_Moz_log2FoldChange,Ethiopia_v_Dongola_log2FoldChange,Gou_v_Moz_log2FoldChange,Cam_fun_v_Fang_log2FoldChange,Fumoz_v_Fang_log2FoldChange,Ghana_fun_v_Fang_log2FoldChange,Malawi_fun_v_Fang_log2FoldChange,Uganda_fun_v_Fang_log2FoldChange
GeneID,GeneName,GeneDescription,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],4.26,1.36,-0.76,2.03,3.9,1.26,1.87,0.41,-0.32,3.82,...,2.41,4.7,2.85,0.47,1.07,1.46,5.19,3.58,5.44,2.61
AGAP008218,CYP6Z2,cytochrome P450 [Source:VB Community Annotation],2.64,0.58,-2.17,2.55,4.33,3.46,1.54,2.07,1.5,1.35,...,1.27,1.98,0.7,-0.85,1.14,-0.2,-0.02,0.52,0.55,0.45
AGAP004382,GSTD3,glutathione S-transferase delta class 3 [Source:VB Community Annotation],2.05,1.44,0.39,-0.25,0.9,-0.39,0.67,0.18,1.22,2.09,...,1.72,1.31,1.71,1.75,1.05,0.83,1.24,1.92,1.79,1.38
AGAP008227,,trehalose 6-phosphate synthase/phosphatase [Source:VB Community Annotation],-1.62,-0.82,0.37,0.33,0.53,1.32,1.67,1.92,1.62,-0.96,...,0.32,0.4,0.78,1.12,1.31,0.16,-0.07,0.06,-0.08,0.13
AGAP010675,LRIM18,leucine-rich immune protein (Coil-less) [Source:VB Community Annotation],1.15,0.52,-0.6,0.33,-0.13,0.44,0.49,0.57,0.44,0.41,...,0.05,0.03,-0.07,0.17,0.19,-0.09,-0.04,0.02,-0.01,-0.14
AGAP012201,,Histone H2B [Source:UniProtKB/Swiss-Prot;Acc:Q27442],3.66,1.05,2.9,-1.71,-2.38,2.19,4.41,2.96,3.33,0.97,...,0.06,0.03,0.03,-1.82,-1.71,0.36,0.9,0.27,0.26,0.06
AGAP008217,CYP6Z3,cytochrome P450 [Source:VB Community Annotation],2.81,0.76,-1.56,1.54,2.36,2.25,0.75,1.52,1.6,1.35,...,1.27,1.98,0.7,-0.85,-0.26,-0.2,-0.02,0.52,0.55,0.45
AGAP002862,CYP6AA1,cytochrome P450 [Source:VB Community Annotation],1.02,1.14,0.33,-0.17,1.27,0.12,0.15,-0.41,-0.08,2.93,...,0.94,1.31,1.24,0.76,0.03,0.62,0.78,0.82,0.85,-0.25
AGAP004410,,,0.96,0.42,-0.85,0.66,0.28,0.15,0.32,0.45,0.59,0.18,...,0.57,0.46,0.59,0.04,0.02,0.2,0.17,0.11,0.29,0.06
AGAP011812,,Elongation of very long chain fatty acids protein (Fragment) [Source:UniProtKB/TrEMBL;Acc:Q7PZA2],1.52,0.91,0.04,0.06,-0.37,1.14,1.29,4.0,3.64,0.61,...,-0.32,0.05,-0.52,1.11,0.13,0.4,0.2,0.19,0.24,0.08


Allowing for 3 out of 35 arrays to be negative 

In [58]:
consistent_genes("gamb_colu_arab_fun", "up", 24)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tiefora_v_Ngousso_log2FoldChange,Ban_v_BanS_log2FoldChange,BanRe_v_BanS_log2FoldChange,Bak_v_Kisumu_log2FoldChange,VK7_v_Kisumu_log2FoldChange,Cameroon_v_Ngousso_log2FoldChange,Chad_v_Ngousso_log2FoldChange,Niger_v_Ngousso_log2FoldChange,Nigeria_v_Ngousso_log2FoldChange,Agboville_v_Mali_log2FoldChange,...,Asendabo_v_Moz_log2FoldChange,Chewaka_v_Moz_log2FoldChange,Tolay_v_Moz_log2FoldChange,Ethiopia_v_Dongola_log2FoldChange,Gou_v_Moz_log2FoldChange,Cam_fun_v_Fang_log2FoldChange,Fumoz_v_Fang_log2FoldChange,Ghana_fun_v_Fang_log2FoldChange,Malawi_fun_v_Fang_log2FoldChange,Uganda_fun_v_Fang_log2FoldChange
GeneID,GeneName,GeneDescription,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
AGAP002865,CYP6P3,cytochrome P450 [Source:VB Community Annotation],4.26,1.36,-0.76,2.03,3.9,1.26,1.87,0.41,-0.32,3.82,...,2.41,4.7,2.85,0.47,1.07,1.46,5.19,3.58,5.44,2.61
AGAP008218,CYP6Z2,cytochrome P450 [Source:VB Community Annotation],2.64,0.58,-2.17,2.55,4.33,3.46,1.54,2.07,1.5,1.35,...,1.27,1.98,0.7,-0.85,1.14,-0.2,-0.02,0.52,0.55,0.45
AGAP004382,GSTD3,glutathione S-transferase delta class 3 [Source:VB Community Annotation],2.05,1.44,0.39,-0.25,0.9,-0.39,0.67,0.18,1.22,2.09,...,1.72,1.31,1.71,1.75,1.05,0.83,1.24,1.92,1.79,1.38
AGAP004410,,,0.96,0.42,-0.85,0.66,0.28,0.15,0.32,0.45,0.59,0.18,...,0.57,0.46,0.59,0.04,0.02,0.2,0.17,0.11,0.29,0.06
AGAP011477,,eupolytin [Source:VB Community Annotation],2.64,1.58,0.24,1.4,0.34,1.21,1.34,0.73,0.27,3.75,...,0.71,0.62,-0.07,-0.16,-0.99,0.17,0.38,1.04,0.26,0.23
AGAP002867,CYP6P4,cytochrome P450 [Source:VB Community Annotation],2.05,2.56,0.59,1.65,4.7,2.3,0.59,-0.28,1.11,3.89,...,2.51,3.12,3.14,2.03,-0.85,1.08,5.16,3.49,5.41,2.51
AGAP000476,,Xaa-Pro aminopeptidase [Source:VB Community Annotation],1.23,0.45,-1.63,1.17,1.8,0.14,0.46,-0.55,-0.94,0.28,...,0.46,0.67,0.55,0.46,-0.23,0.68,0.55,0.38,0.33,0.29
AGAP013128,CYP6AA2,cytochrome P450 [Source:VB Community Annotation],0.85,1.18,0.18,0.33,0.14,0.6,0.35,0.02,0.6,2.38,...,0.31,0.8,0.22,-0.35,-0.68,0.62,0.78,0.82,0.85,-0.25
AGAP000818,CYP9K1,cytochrome P450 [Source:VB Community Annotation],0.64,1.08,-0.73,4.65,2.0,1.58,0.44,0.31,1.08,3.45,...,2.27,2.89,2.28,1.37,0.2,0.0,0.92,1.32,0.93,2.91
AGAP010414,CYP4C28,cytochrome P450 [Source:VB Community Annotation],2.04,1.57,0.61,1.28,0.55,1.23,0.89,1.4,1.45,1.14,...,0.26,-0.33,0.24,0.91,1.04,0.38,0.6,1.51,1.26,1.18


Maf-S is consistently upregulated.