---
title: Genequery Logic and tests.
authors:
- B. Rohan
tags:
- Fisher Exact test + code
- Fisher Enrichment Analysis test + code
- overlap 
- process .gmt file 

created_at: 2019-04-29
updated_at: 2019-05-16
tldr: Decoding the GeneQuery logic and reproducing its results 100%.
---

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import pickle

In [2]:
# To test if species input are correct

def test_species(speciesQy, speciesDb):
    """Validate the query and database species codes.

    Both codes are lower-cased and checked against the supported codes
    ``"hs"``, ``"mm"`` and ``"rt"``.

    Parameters
    ----------
    speciesQy : str
        Species code of the query genes.
    speciesDb : str
        Species code of the module database.

    Returns
    -------
    tuple of (str, str) or None
        ``(query_code, database_code)`` in lower case when both are valid;
        ``None`` (after printing the reason) otherwise.
    """
    Specie = ("hs", "mm", "rt")

    # Values that are not strings (e.g. a numeric or NaN cell read from the
    # query file) have no .isalpha(); report them as incorrect instead of
    # crashing with AttributeError.
    both_alpha = (
        isinstance(speciesQy, str)
        and isinstance(speciesDb, str)
        and speciesQy.isalpha()
        and speciesDb.isalpha()
    )
    if not both_alpha:
        print("Species entered are Incorrect !!!")
        return None

    spQ = speciesQy.lower()
    spD = speciesDb.lower()

    # The query code is checked first; a bad query code is reported without
    # looking at the database code.
    if spQ not in Specie:
        print("Query specie -> {}  is not valid".format(spQ))
        return None

    if spD not in Specie:
        print("Database specie -> {}  is not in Database".format(spD))
        return None

    return (spQ, spD)


# Despite its "test_" prefix this is an input validator, not a unit test;
# opt out so pytest does not try to collect it when the module is imported.
test_species.__test__ = False

### Reading query file and preparing list

In [3]:
# Function to process Query file ( xyz.txt )

def process_Query(file):
    """Read a query file and return its unique gene ids and the DB species.

    The file is read as a single headerless column. Row 0 is taken as the
    query species code, row 1 as the database species code, and every
    following row is converted to ``int`` as a gene id.

    Parameters
    ----------
    file : path-like
        Query file; it is formatted into a string before being handed to
        ``pd.read_csv``.

    Returns
    -------
    tuple or None
        ``(unique_ids, database_species)`` where ``unique_ids`` is the array
        returned by ``np.unique``; ``None`` when ``test_species`` rejects the
        species codes.
    """
    table = pd.read_csv("{}".format(file), header=None, names=["Signature"])

    # Validate the two header rows (query species, database species).
    species = test_species(table.iat[0, 0], table.iat[1, 0])

    # Everything after the two species rows is a gene id.
    genes = [int(entry) for entry in table[2:].Signature.tolist()]

    if not species:
        return None

    query_species, moduleSpecie = species

    print("Genes Entered : {}".format(len(genes)))

    if query_species != moduleSpecie:
        # Query and database species differ: map the genes to their
        # orthologs in the database species first.
        genes = Orthology_converter(genes, moduleSpecie)

    genes = np.unique(genes)
    print("Unique Entrez IDs : {}".format(len(genes)))

    return (genes, moduleSpecie)

### Convert user genes to Orthologous genes

In [4]:
# Function to convert genes to orthologous genes

def Orthology_converter(signature, DBspecie):
    """Map gene ids to the ids of their orthologs in another species.

    Loads the orthology table from ``Orthology.pkl`` (read from the current
    working directory), finds the orthology groups that contain any gene of
    ``signature`` and returns the members of those groups whose species is
    ``DBspecie``.

    Parameters
    ----------
    signature : list of int
        Gene ids to convert.
    DBspecie : str
        Species code to convert to.

    Returns
    -------
    list of int
        Gene ids in ``DBspecie``; may contain duplicates.
    """
    # NOTE(review): unpickling executes arbitrary code; Orthology.pkl must
    # come from a trusted source.
    table = pd.read_pickle("Orthology.pkl")

    # Orthology groups hit by at least one query gene.
    hit_groups = table.loc[table.entrez.isin(signature), "groupId"].values.tolist()

    # Every gene, of any species, belonging to one of those groups.
    members = table[table.groupId.isin(hit_groups)]

    # Blank out the genes of other species, then drop the blanks.
    in_target = members.species == DBspecie
    target_ids = members.entrez.where(in_target).dropna().tolist()

    # .where() yields floats; round them back to integers.
    return list(map(round, target_ids))

### Computing pValue using Fisher Exact Test

In [5]:
# Overestimated Gene World Size = 7000 (Genes supposed to be in a GSE)

universeSize = 7000
# logFactorials[n] holds ln(n!) for n = 0 .. universeSize, hence the
# universeSize + 1 slots.
logFactorials = np.repeat(0.0, universeSize + 1)


def make_logF(logFactorials):
    """Fill ``logFactorials`` in place so that entry ``n`` holds ``ln(n!)``.

    Entry 0 is left as passed in (0.0, i.e. ln(0!)); every later entry,
    including the last one, is computed from its predecessor.

    Parameters
    ----------
    logFactorials : numpy.ndarray
        Float array to fill.

    Returns
    -------
    numpy.ndarray
        The same array object.
    """
    # Run up to the last index: stopping one short would leave
    # ln(universeSize!) at 0.0 and corrupt every table that sums to
    # universeSize.
    for i in range(1, len(logFactorials)):
        logFactorials[i] = logFactorials[i - 1] + math.log(i)
    return logFactorials


logFactorials = make_logF(logFactorials)


# Hypergeometric test

def calculateHypergeomP(a, b, c, d):
    """Return the hypergeometric probability of the 2x2 table [[a, b], [c, d]].

    Computed in log space from the ``logFactorials`` table as
    (a+b)! (c+d)! (a+c)! (b+d)! / ((a+b+c+d)! a! b! c! d!).
    All arguments must be valid indices into ``logFactorials``.
    """
    log_p = (
        logFactorials[a + b] + logFactorials[c + d]
        + logFactorials[a + c] + logFactorials[b + d]
        - logFactorials[a + b + c + d]
        - logFactorials[a] - logFactorials[b]
        - logFactorials[c] - logFactorials[d]
    )
    return np.exp(log_p)


# Fisher Exact test to compute significance

def righttailPvalue(a, b, c, d):
    """Return the right-tail Fisher exact p-value of the table [[a, b], [c, d]].

    Sums the hypergeometric probability of the given table and of every
    table obtained by moving one count at a time from ``b`` and ``c`` into
    ``a`` and ``d``, until ``b`` or ``c`` reaches zero.

    Parameters
    ----------
    a, b, c, d : int
        Cell counts of the 2x2 table.

    Returns
    -------
    float or None
        The summed probability, or ``None`` (after printing a message) when
        ``a + b + c + d`` exceeds the largest total covered by
        ``logFactorials``.
    """
    # The largest usable total is the last index of the table, not its length.
    max_total = len(logFactorials) - 1
    if a + b + c + d > max_total:
        print(
            "Sum of the arguments must be not greater than universe: "
            "{} + {} + {} + {} > {}".format(a, b, c, d, max_total)
        )
        return None

    pSum = 0.0
    # shift = 0 is the given table; each further step is one table further
    # into the tail. The range is empty when b or c is negative.
    for shift in range(min(b, c) + 1):
        pSum += calculateHypergeomP(a + shift, b - shift, c - shift, d + shift)

    return pSum

### Testing fisherExact Test

In [6]:
# Sanity check: right-tail p-value of the 2x2 table a=16, b=31, c=11, d=6800.
righttailPvalue(16,31,11,6800)

1.6629173637850388e-29

### Function to compute Bonferroni Correction and return ranked results

In [7]:
def rank_func(QUERY, Specie_DF):
    """Rank the modules of ``Specie_DF`` by enrichment for the query genes.

    For every module with ``Number > 0`` that shares at least one gene with
    the query, a right-tail Fisher exact p-value is computed with
    ``righttailPvalue``. The p-value is Bonferroni-adjusted by multiplying
    it with the total number of rows of ``Specie_DF``; modules whose
    adjusted p-value exceeds 0.01 are dropped.

    Parameters
    ----------
    QUERY : iterable of int
        Query gene ids.
    Specie_DF : pandas.DataFrame
        Module table with the columns ``Module``, ``Number``, ``Size`` and
        ``Entrez`` (the gene ids of the module). It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``logpval``, ``logapval``, ``Module``, ``Number``,
        ``Intersection_Size`` and ``Size``, sorted by ascending ``logapval``.
    """
    min_logp = -325.0            # floor used when a p-value underflowed to 0
    bonferroniMaxPvalue = 0.01

    def log10_or_floor(x):
        return math.log10(x) if x > 0 else min_logp

    moduleCount = len(Specie_DF.index)

    # Build the query set once rather than once per module row.
    query_genes = set(QUERY)
    overlap = Specie_DF["Entrez"].apply(
        lambda genes: len(query_genes.intersection(genes))
    )

    # Work on a copy without the Entrez column so the caller's frame is
    # left untouched.
    Spec = Specie_DF[["Module", "Number", "Size"]].copy()
    Spec["Intersection_Size"] = overlap

    # Per "Module" value: summed module sizes and summed query overlaps.
    # These sums are taken over all rows, before the filter below.
    Spec["Universe"] = Spec["Module"].map(Spec.groupby("Module")["Size"].sum())
    Spec["Query_overlap_Universe"] = Spec["Module"].map(
        Spec.groupby("Module")["Intersection_Size"].sum()
    )

    # .copy() so that the column assignments below hit a real frame and not
    # a view of the filtered one.
    Spec = Spec[(Spec["Number"] > 0) & (Spec["Intersection_Size"] > 0)].copy()

    # Cells of the 2x2 table: in module but not query, in query but not
    # module, and in neither.
    Spec["MnotQ"] = Spec["Size"] - Spec["Intersection_Size"]
    Spec["QnotM"] = Spec["Query_overlap_Universe"] - Spec["Intersection_Size"]
    Spec["rest"] = Spec["Universe"] - Spec["QnotM"] - Spec["MnotQ"] - Spec["Intersection_Size"]

    # A plain loop, unlike np.vectorize, also works when no row is left.
    # A None result (universe too large) becomes NaN.
    pvals = [
        righttailPvalue(a, b, c, d)
        for a, b, c, d in zip(
            Spec["Intersection_Size"], Spec["MnotQ"], Spec["QnotM"], Spec["rest"]
        )
    ]
    Spec["pval"] = pd.Series(pvals, index=Spec.index, dtype=float)

    # Bonferroni adjustment; values above the threshold (and NaN) become NaN.
    adjusted = Spec["pval"] * moduleCount
    Spec["apval"] = adjusted.where(adjusted <= bonferroniMaxPvalue)

    # Keep every row that passed the threshold. Testing "apval > 0" here
    # would also discard p-values that underflowed to exactly 0, i.e. the
    # strongest hits, which the min_logp floor exists to represent.
    Spec = Spec[Spec["apval"].notna()].copy()

    Spec["logpval"] = Spec["pval"].apply(log10_or_floor)
    Spec["logapval"] = Spec["apval"].apply(log10_or_floor)

    Spec = Spec[Spec["logapval"] < 0]

    Spec = Spec.sort_values(['logapval'], ascending=[True])

    # Keeping only required columns
    RANK_result = Spec[["logpval", "logapval", "Module", "Number", "Intersection_Size", "Size"]].copy()

    return RANK_result

### Reading the modules and passing them to the rank function


In [23]:
# Rank the modules of the original database for the genes in QUERY.txt.
Qfile = "QUERY.txt"

Qresult = process_Query(Qfile)
if Qresult is None:
    # process_Query returns None when the species codes are rejected; stop
    # here with a clear message instead of failing on Qresult[0] below.
    raise ValueError("Could not process query file {}: invalid species".format(Qfile))

QUERY = sorted(Qresult[0])
Species = Qresult[1]

# Module table of the database species, e.g. "mm_modules.pkl".
FILE = Species+"_modules.pkl"

print(QUERY, FILE)

Specie_DF = pd.read_pickle("{}".format(FILE))

result = rank_func(QUERY, Specie_DF) # takes 40 seconds

Genes Entered : 185
Unique Entrez IDs : 185
[11520, 11535, 11541, 11639, 11674, 11676, 11717, 11910, 12032, 12043, 12111, 12177, 12209, 12226, 12306, 12368, 12389, 12452, 12457, 12575, 12576, 12577, 12767, 12831, 12870, 13004, 13008, 13179, 13198, 13358, 13527, 13615, 13636, 13638, 13649, 13806, 13807, 13808, 14066, 14121, 14219, 14281, 14284, 14385, 14387, 14433, 14447, 14538, 14635, 14733, 14734, 14735, 14815, 14828, 14936, 15116, 15211, 15275, 15277, 15368, 15417, 15476, 15529, 15531, 15931, 15937, 16006, 16007, 16009, 16193, 16322, 16476, 16572, 16770, 16795, 16828, 16833, 16948, 17035, 17133, 17319, 17423, 17684, 17859, 17872, 17886, 17988, 18030, 18451, 18452, 18484, 18534, 18591, 18641, 18654, 18655, 18682, 18750, 18770, 18772, 18787, 18793, 19017, 19252, 19285, 19309, 19664, 19883, 20198, 20341, 20439, 20527, 20778, 20855, 20893, 20970, 20971, 21366, 21753, 21809, 21810, 21817, 21929, 21983, 21985, 21991, 22022, 22339, 22346, 22359, 22403, 22695, 23849, 23871, 26401, 26757, 505

In [25]:
result.head()

Unnamed: 0,logpval,logapval,Module,Number,Intersection_Size,Size
32395,-41.077664,-36.135388,GSE3296_GPL1261,10,46,172
32665,-40.486808,-35.544532,GSE3318_GPL1261,7,46,169
30995,-38.614315,-33.672039,GSE3196_GPL1261,7,48,212
69881,-34.993245,-30.050969,GSE62128_GPL6246,4,52,324
60251,-33.676861,-28.734585,GSE54454_GPL11180,6,43,181


In [26]:
# Save the ranked modules as a tab-separated file, without the row index.
result.to_csv("mm_orig_result.csv", sep = "\t", index = False)

In [23]:
#gQ_res.to_csv("MM_module.csv", sep='\t', encoding='utf-8')


### Total number of modules in results of genequery

In [27]:
result.count()

logpval              415
logapval             415
Module               415
Number               415
Intersection_Size    415
Size                 415
dtype: int64

### Testing expanded genequery

In [18]:
# NOTE(review): `mdf` is not defined in any cell shown here; presumably it
# was loaded in an earlier session (the cell number is out of order) - confirm.
mdf.count()   # Number of modules in mouse DB

Module    150078
Entrez    150078
Number    150078
Size      150078
dtype: int64

### Reading the modules and passing them to the rank function


In [8]:
# Rank the modules of the expanded database for the genes in QUERY.txt.
Qfile = "QUERY.txt"

Qresult = process_Query(Qfile)
if Qresult is None:
    # process_Query returns None when the species codes are rejected; stop
    # here with a clear message instead of failing on Qresult[0] below.
    raise ValueError("Could not process query file {}: invalid species".format(Qfile))

QUERY = sorted(Qresult[0])
Species = Qresult[1]

# Expanded module table of the database species, e.g. "mm_all_modules.pkl".
FILE = Species+"_all_modules.pkl"

print(QUERY, FILE)

Specie_DF = pd.read_pickle("{}".format(FILE))

result = rank_func(QUERY, Specie_DF) # takes 40 seconds

Genes Entered : 185
Unique Entrez IDs : 185
[11520, 11535, 11541, 11639, 11674, 11676, 11717, 11910, 12032, 12043, 12111, 12177, 12209, 12226, 12306, 12368, 12389, 12452, 12457, 12575, 12576, 12577, 12767, 12831, 12870, 13004, 13008, 13179, 13198, 13358, 13527, 13615, 13636, 13638, 13649, 13806, 13807, 13808, 14066, 14121, 14219, 14281, 14284, 14385, 14387, 14433, 14447, 14538, 14635, 14733, 14734, 14735, 14815, 14828, 14936, 15116, 15211, 15275, 15277, 15368, 15417, 15476, 15529, 15531, 15931, 15937, 16006, 16007, 16009, 16193, 16322, 16476, 16572, 16770, 16795, 16828, 16833, 16948, 17035, 17133, 17319, 17423, 17684, 17859, 17872, 17886, 17988, 18030, 18451, 18452, 18484, 18534, 18591, 18641, 18654, 18655, 18682, 18750, 18770, 18772, 18787, 18793, 19017, 19252, 19285, 19309, 19664, 19883, 20198, 20341, 20439, 20527, 20778, 20855, 20893, 20970, 20971, 21366, 21753, 21809, 21810, 21817, 21929, 21983, 21985, 21991, 22022, 22339, 22346, 22359, 22403, 22695, 23849, 23871, 26401, 26757, 505

Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logF

Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logF

Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logF

Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logF

Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logFactorials.size - 1}
Sum of the arguments must be not greater than universe: $a + $b + $c + $d > ${logF

In [9]:
result.head()

Unnamed: 0,logpval,logapval,Module,Number,Intersection_Size,Size
32395,-41.077664,-35.901347,GSE3296_GPL1261,10,46,172
32665,-40.486808,-35.310491,GSE3318_GPL1261,7,46,169
30995,-38.614315,-33.437998,GSE3196_GPL1261,7,48,212
69881,-34.993245,-29.816927,GSE62128_GPL6246,4,52,324
60251,-33.676861,-28.500544,GSE54454_GPL11180,6,43,181


In [10]:
result.count()

logpval              556
logapval             556
Module               556
Number               556
Intersection_Size    556
Size                 556
dtype: int64

In [11]:
# Save the ranked modules as a tab-separated file, without the row index.
result.to_csv("mm_expanded_result.csv", sep = "\t", index = False)

### Google Drive links for result files

result_original_gQ -> https://drive.google.com/open?id=1dPdcGn4e1R7HaO2Dfkqr2byorBbmzrob

result_expanded_gQ -> https://drive.google.com/open?id=1BfqrSa1B1hozPcJN68wf4i6m8HmXGhYj