In [1]:
# Import packages
import pandas as pd
import numpy as np
import scipy.stats as stats
import pingouin as pg
import random
import matplotlib.pyplot as plt
import seaborn as sns

import gff2bed
import Orthoscripts

# Disable chained assignments
# Setting this option to None silences pandas' SettingWithCopyWarning for the
# whole notebook. Later cells add columns to frames obtained by filtering with
# .loc, which is what would otherwise trigger the warning.
# NOTE(review): this hides the warning rather than removing its cause; prefer
# .copy() / .assign() in new code.
pd.options.mode.chained_assignment = None 

In [2]:
# Load one gene list per species with Orthoscripts.readBED.
# NOTE(review): readBED is defined outside this notebook. Later cells read the
# 'Chromosome' and 'Name' columns of its result, so it presumably returns a
# pandas DataFrame with at least those columns — confirm.

# Asterias rubens
Astrub = Orthoscripts.readBED("Data/Genelists/Asterias.rubens.genelist.bed")

# Holothuria leucospilota
Holleu = Orthoscripts.readBED("Data/Genelists/Holothuria.leucospilota.genelist.bed")

# Paracentrotus lividus
Parliv = Orthoscripts.readBED("Data/Genelists/Paracentrotus.lividus.genelist.bed")

# Branchiostoma lanceolatum
Bralan = Orthoscripts.readBED("Data/Genelists/Branchiostoma.lanceolatum.genelist.bed")

# Branchiostoma floridae
Braflo = Orthoscripts.readBED("Data/Genelists/Branchiostoma.floridae.genelist.bed")

# Marthasterias glacialis
Margla = Orthoscripts.readBED("Data/Genelists/Marthasterias.glacialis.genelist.bed")

# Pecten maximus
# (this file uses the abbreviated species code in its name, unlike the others)
Pecmax = Orthoscripts.readBED("Data/Genelists/Pecmax.genelist.bed")

# Stichopus chloronotus
Stichl = Orthoscripts.readBED("Data/Genelists/Stichopus.chloronotus.genelist.bed")

# Amphiura filiformis
Ampfil = Orthoscripts.readBED("Data/Genelists/Amphiura.filiformis.genelist.bed")

# Ephydatia muelleri
Ephmue = Orthoscripts.readBED("Data/Genelists/Ephydatia.muelleri.genelist.bed")

# Ancestor
# NOTE(review): the "Old ancestral chromosome code" cell further down rebinds
# `Ancestor`; if that cell is run, this loaded gene list is discarded.
Ancestor = Orthoscripts.readBED("Data/Genelists/Ancestor.genelist.bed")

In [3]:
# Import orthologs
# Each table is read as a NumPy array of strings; variables are named
# <speciesA>_<speciesB> after the pair in the file name.
# NOTE(review): downstream code indexes columns 0, 1 and 2, so each file
# presumably holds [ortholog ID, species A gene, species B gene] — confirm.
# Files under "Data/Orthologs" and "Orthology pipeline/orthologs" (the
# "*_sensitive.txt" ones) are both used.
Astrub_Holleu = np.loadtxt("Data/Orthologs/Asterias.rubens+Holothuria.leucospilota.txt", dtype = "str")

Astrub_Parliv = np.loadtxt("Data/Orthologs/Asterias.rubens+Paracentrotus.lividus.txt", dtype = "str")

Holleu_Parliv = np.loadtxt("Data/Orthologs/Holothuria.leucospilota+Paracentrotus.lividus.txt", dtype = "str")

Margla_Bralan = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Branchiostoma.lanceolatum.txt", dtype = "str")

Margla_Pecmax = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Pecten.maximus.txt", dtype = "str")

Margla_Stichl = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Stichopus.chloronotus.txt", dtype = "str")

# NOTE(review): Pecmax_Bralan is bound here and bound again below from the
# "*_sensitive.txt" file, so this first load is overwritten and has no effect
# on later cells. Decide which table is intended and drop the other.
Pecmax_Bralan = np.loadtxt("Data/Orthologs/Pecten.maximus+Branchiostoma.lanceolatum.txt", dtype = "str")

Stichl_Bralan = np.loadtxt("Data/Orthologs/Stichopus.chloronotus+Branchiostoma.lanceolatum.txt", dtype = "str")

Stichl_Pecmax = np.loadtxt("Data/Orthologs/Stichopus.chloronotus+Pecten.maximus.txt", dtype = "str")

Pecmax_Holleu = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Holleu_sensitive.txt", dtype = "str")

Holleu_Bralan = np.loadtxt("Orthology pipeline/orthologs/Holleu+Bralan_sensitive.txt", dtype = "str")

# Second binding of Pecmax_Bralan: this is the table later cells actually see.
Pecmax_Bralan = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Bralan_sensitive.txt", dtype = "str")

Pecmax_Braflo = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Braflo_sensitive.txt", dtype = "str")

Holleu_Braflo = np.loadtxt("Orthology pipeline/orthologs/Holleu+Braflo_sensitive.txt", dtype = "str")

Holleu_Ampfil = np.loadtxt("Data/Orthologs/Holothuria.leucospilota+Amphiura.filiformis.txt", dtype = "str")

Braflo_Ephmue = np.loadtxt("Orthology pipeline/orthologs/Braflo+Ephmue_sensitive.txt", dtype = "str")

Holleu_Ephmue = np.loadtxt("Orthology pipeline/orthologs/Holleu+Ephmue_sensitive.txt", dtype = "str")

Pecmax_Ephmue = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Ephmue_sensitive.txt", dtype = "str")

In [4]:
# Keep only genes whose 'Chromosome' value contains the chromosome-name marker
# for that assembly (substring match, so any value containing the marker stays).
Astrub = Astrub.loc[Astrub['Chromosome'].str.contains('chr')]
Bralan = Bralan.loc[Bralan['Chromosome'].str.contains('BFL_')]
Braflo = Braflo.loc[Braflo['Chromosome'].str.contains('BFL_')]
Pecmax = Pecmax.loc[Pecmax['Chromosome'].str.contains('PYE_')]

# Ephmue genelist: remove suffix
# Splits each name on every '.t1' occurrence and keeps the first piece, i.e.
# everything before the first '.t1'.
Ephmue['Name'] = Ephmue['Name'].str.rsplit('.t1').str.get(0)

# Parliv genelist: select chromosomal scaffolds
# The same helper is applied to Ampfil and Ephmue with their own thresholds.
# NOTE(review): Orthoscripts.unscaff is defined outside this notebook;
# presumably the number is a minimum gene count per scaffold — confirm.
Parliv = Orthoscripts.unscaff(Parliv, 100)
Ampfil = Orthoscripts.unscaff(Ampfil, 100)
Ephmue = Orthoscripts.unscaff(Ephmue, 600)

# Normalise gene names in the ortholog tables so they match the gene lists.
# NOTE(review): Orthoscripts.orthFix is defined outside this notebook;
# presumably the arguments are (table, which species column 'A'/'B', the
# separator/prefix to cut at, which piece to keep) — confirm.
Astrub_Parliv = Orthoscripts.orthFix(Astrub_Parliv, 'B', 'Parliv_', 1)
Margla_Bralan = Orthoscripts.orthFix(Margla_Bralan, 'A', '.1', 0)
# Margla_Stichl is fixed twice on purpose: once per species column.
Margla_Stichl = Orthoscripts.orthFix(Margla_Stichl, 'A', '.1', 0)
Margla_Stichl = Orthoscripts.orthFix(Margla_Stichl, 'B', '.1', 0)
Margla_Pecmax = Orthoscripts.orthFix(Margla_Pecmax, 'B', '.1', 0)
Holleu_Ampfil = Orthoscripts.orthFix(Holleu_Ampfil, 'B', '.1', 0)
Holleu_Bralan = Orthoscripts.orthFix(Holleu_Bralan, 'B', '_', 0)

Old orthology function

In [None]:
def orthofy(genelistA, genelistB, orthologies):
    
    """
    Count the orthologs shared by every pair of chromosomes of two species and
    keep the pairs that are significantly enriched (hypergeometric test with
    Benjamini-Hochberg correction).

    inputs:
    genelistA: gene list for species A (DataFrame with 'Name' and 'Chromosome' columns)
    genelistB: gene list for species B (same layout)
    orthologies: orthology dataset; 2-D array whose columns are
                 [ortholog ID, species A gene name, species B gene name].
                 Ortholog IDs must contain the text 'ortholog' (any case),
                 because that is how ortholog rows are recognised below.

    returns:
    DataFrame with one row per (speciesA, speciesB) chromosome pair that passed
    the correction. Columns: 'speciesA', 'speciesB', 'Orthologs' (number of
    shared orthologs), 'p-Values' (corrected) and 'Results' (always True).
    The frame is empty (same columns) when no ortholog maps to both species.
    """
    
    # Make ortholog dictionaries (gene name -> ortholog ID)
    A_orthdict = dict(zip(orthologies[:, 1], orthologies[:, 0]))
    B_orthdict = dict(zip(orthologies[:, 2], orthologies[:, 0]))

    # Replace gene names that have an ortholog with the ortholog ID
    A_data = genelistA.replace({'Name': A_orthdict})
    B_data = genelistB.replace({'Name' : B_orthdict})
    
    # Add column for orthologs: 1 if ortholog, 0 if not
    A_data['Ortholog'] = A_data['Name'].apply(lambda x:1 if 'ortholog' in x.lower() else 0)
    B_data['Ortholog'] = B_data['Name'].apply(lambda x:1 if 'ortholog' in x.lower() else 0)
    
    # Isolate orthologies (ortholog ID -> chromosome)
    A_ortho = A_data.loc[A_data['Ortholog'] == 1]
    A_dict = dict(zip(A_ortho.Name, A_ortho.Chromosome))

    B_ortho = B_data.loc[B_data['Ortholog'] == 1]
    B_dict = dict(zip(B_ortho.Name, B_ortho.Chromosome))
    
    # Look up the chromosome of every ortholog in both species. Both columns
    # are mapped from 'Orthologs' explicitly; the previous version mapped
    # 'speciesA' from the 'speciesB' column, which only worked because that
    # column still held the ortholog IDs at that point.
    AB_data = pd.DataFrame({'Orthologs' : orthologies[:, 0]})
    AB_data['speciesA'] = AB_data['Orthologs'].map(A_dict)
    AB_data['speciesB'] = AB_data['Orthologs'].map(B_dict)
    
    # Calculate number of orthologs for each pair of chromosomes
    # (orthologs missing from either species have NaN keys and are dropped)
    AB_data = AB_data.groupby(['speciesA', 'speciesB']).count().reset_index()

    # Nothing maps to both species: return an empty result with the usual
    # columns instead of failing in the testing step.
    if AB_data.empty:
        return AB_data.assign(**{'p-Values' : pd.Series(dtype = float),
                                 'Results' : pd.Series(dtype = bool)})
    
    # Population size: names shared by both (ortholog-renamed) gene lists
    A = A_data.Name.values.tolist()
    B = B_data.Name.values.tolist()
    M = len(set(A) & set(B))
    
    # Orthologs per chromosome, broadcast back to every pair row. This replaces
    # a per-row rescan of the whole table and gives the same numbers.
    nA = AB_data.groupby('speciesA')['Orthologs'].transform('sum').to_numpy()
    nB = AB_data.groupby('speciesB')['Orthologs'].transform('sum').to_numpy()
    x = AB_data['Orthologs'].to_numpy()

    # Conduct hypergeometric testing: sf(x - 1) is P(X >= x)
    AB_data['p-Values'] = stats.hypergeom.sf(x - 1, M, nA, nB)
    
    # Apply BH testing correction
    reject, p_corrected = pg.multicomp(AB_data['p-Values'], method = 'fdr_bh')
    AB_data['p-Values'] = p_corrected
    AB_data['Results'] = reject
    
    # Remove all rows that have been rejected in BH correction
    AB_data = AB_data.loc[AB_data['Results'] == True]
    
    return AB_data

Print whole output

In [None]:
# Prints whole output
# Temporarily lifts pandas' row and column display limits so the complete
# frame is printed; the limits are restored when the `with` block exits.
# NOTE: this dumps every row of the gene list into the cell output.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    print(Astrub)

Replace df values with dict

In [None]:
# Snippet: replace each value of a column with its dictionary entry, leaving
# values that have no entry unchanged (dict.get(x, x)).
# NOTE(review): `orthdictA` is only defined in the "Old ancestral chromosome
# code" cell further down, and no visible cell adds a 'PGenes' column to
# Astrub, so this fails on a fresh kernel run from top to bottom.
Astrub['PGenes'] = Astrub['PGenes'].map(lambda x: orthdictA.get(x, x))

Rename Pecmax scaffolds to chromosomes

In [None]:
# Pecmax data: Replace scaffold names with chromosome names
# Position k (1-based) holds the HiC scaffold number that becomes PYE_k.
pecmax_scaffold_order = (2, 18, 19, 16, 4, 7, 11, 15, 17, 10,
                         1, 3, 12, 5, 9, 14, 13, 8, 6)

# Whole-value replacement across the frame, one scaffold at a time, in
# chromosome order (PYE_1 first).
for chrom_number, scaffold_number in enumerate(pecmax_scaffold_order, start = 1):
    Pecmax = Pecmax.replace('HiC_scaffold_' + str(scaffold_number),
                            'PYE_' + str(chrom_number))

Rename Ephmue scaffolds to chromosomes

In [None]:
# Strip the transcript suffix: keep everything before the first '.t1'
Ephmue['Name'] = Ephmue['Name'].str.rsplit('.t1').str.get(0)

# Map scaffold_0001 ... scaffold_0023 to EMU_1 ... EMU_23.
# The keys are generated from the four-digit pattern. The hand-written list
# had 'scaffold_00023' (five digits) for the last entry, so scaffold_0023 was
# never renamed to EMU_23.
ephmue_scaffold_to_chrom = {'scaffold_{:04d}'.format(number) : 'EMU_{}'.format(number)
                            for number in range(1, 24)}

# Whole-value replacement across the frame, all scaffolds in one pass
Ephmue = Ephmue.replace(ephmue_scaffold_to_chrom)

# Write the renamed gene list back to disk (space-separated, one gene per row)
np.savetxt(r'Data/Genelists/Ephmue.genelist.bed', Ephmue.values, fmt = '%s')

Old ancestral chromosome code

In [None]:
# First pass at building the ancestral gene list from Pecmax/Braflo orthologs.
# NOTE(review): every name bound here (PB, Amp, Sca, Anc, ChrCorr, PBgenes,
# orthdict*, Ancestor) is bound again by the near-identical code that follows
# in this same cell, so the results of this pass are discarded.
# NOTE(review): this code reads PB['A'] / PB['B'] and treats PB['Orthologs'] as
# ortholog IDs, but `orthofy` as defined in this notebook returns columns
# 'speciesA' / 'speciesB' and an 'Orthologs' column holding counts. It looks
# like it was written against an earlier orthofy — confirm before reuse.
PB = orthofy(Pecmax, Braflo, Pecmax_Braflo)
PB = PB.dropna()

# Make matrix with corresponding chromosomes
# Row i pairs Sca[i] (Pecmax chromosome) with Amp[i] (Braflo chromosome) and
# labels that pair with Anc[i]; columns of ChrCorr are (Sca, Amp, Anc).
Amp = ['BFL_11', 'BFL_10', 'BFL_16', 'BFL_8', 'BFL_3', 'BFL_1', 'BFL_18', 'BFL_14', 'BFL_15', 'BFL_5', 'BFL_7', 'BFL_3', 'BFL_17', 'BFL_3', 'BFL_19', 'BFL_12', 'BFL_1', 'BFL_13', 'BFL_2', 'BFL_2', 'BFL_6', 'BFL_9', 'BFL_4', 'BFL_4']
Sca = ['PYE_10', 'PYE_13', 'PYE_1', 'PYE_1', 'PYE_17', 'PYE_5', 'PYE_19', 'PYE_15', 'PYE_4', 'PYE_6', 'PYE_7', 'PYE_2', 'PYE_18', 'PYE_2', 'PYE_3', 'PYE_14', 'PYE_16', 'PYE_2', 'PYE_4', 'PYE_9', 'PYE_8', 'PYE_3', 'PYE_11', 'PYE_12']
Anc = ['G', 'B1', 'B2', 'M', 'C2', 'A1aA1b', 'B3', 'P', 'L', 'EaEb', 'F', 'QbQa', 'J1', 'QcQd', 'O2', 'N', 'A2', 'H', 'J2', 'C1', 'D', 'K', 'I', 'O1']
ChrCorr = np.column_stack((Sca, Amp, Anc))

# Make dataframe with corresponding chromosomes
# NOTE: the pair ('PYE_2', 'BFL_3') occurs twice (labels 'QbQa' and 'QcQd'),
# so its rows are added twice, once under each label.
PBgenes = pd.DataFrame()
for i in range (0, 24): 
    PBorthologs = PB.loc[(PB['A'] == ChrCorr[i, 0]) & (PB['B'] == ChrCorr[i, 1])]
    PBorthologs['Chr'] = ChrCorr[i, 2]

    PBgenes = pd.concat([PBgenes, PBorthologs])

# Manually add PYE_12
# Everything on PYE_12 that is not paired with BFL_4 is labelled 'R'.
PBorthologs = PB.loc[(PB['A'] == 'PYE_12') & (PB['B'] != 'BFL_4')]
PBorthologs['Chr'] = 'R'
PBgenes = pd.concat([PBgenes, PBorthologs])

# Duplicate the ortholog column so each species gets its own gene column
PBgenes['BGenes'] = PBgenes.loc[:, 'Orthologs']
PBgenes = PBgenes.rename(columns = {'Orthologs' : 'PGenes'})
PBgenes = PBgenes[['Chr', 'A', 'PGenes', 'B', 'BGenes']]

# Make reverse ortholog dictionaries (ortholog : gene name)
orthdictA = dict(zip(Pecmax_Braflo[:, 0], Pecmax_Braflo[:, 1]))
orthdictB = dict(zip(Pecmax_Braflo[:, 0], Pecmax_Braflo[:, 2]))

# Replace values
PBgenes['PGenes'] = PBgenes['PGenes'].map(lambda x: orthdictA.get(x, x))
PBgenes['BGenes'] = PBgenes['BGenes'].map(lambda x: orthdictB.get(x, x))

# Make dictionaries (H gene name : P/B gene name)
# As written, the keys are column 1 of Pecmax_Holleu and column 2 of
# Holleu_Braflo, and the values are the other species column of each table.
# Note that orthdictB is rebound here.
orthdictP = dict(zip(Pecmax_Holleu[:, 1], Pecmax_Holleu[:, 2]))
orthdictB = dict(zip(Holleu_Braflo[:, 2], Holleu_Braflo[:, 1]))

# Replace values
PBgenes['PGenes'] = PBgenes['PGenes'].map(lambda x: orthdictP.get(x, x))
PBgenes['BGenes'] = PBgenes['BGenes'].map(lambda x: orthdictB.get(x, x))

# Select all values orthologous in both columns
# (rows where both gene columns now contain 'gene-HOLleu_')
Ancestor = PBgenes.loc[(PBgenes['PGenes'].str.contains('gene-HOLleu_')) & 
                       (PBgenes['BGenes'].str.contains('gene-HOLleu_'))]

Ancestor = Ancestor.rename(columns = {'Chr' : 'Chromosome',
                                      'PGenes' : 'Name', 
                                      'A' : 'Pchr',
                                      'B' : 'Bchr'})

# Build the ancestral gene list from Pecmax/Braflo orthologs.
# NOTE(review): the steps below treat PB['Orthologs'] as ortholog IDs, while
# `orthofy` as defined in this notebook returns per-chromosome-pair counts in
# that column. This cell looks written against an earlier orthofy — confirm
# before reuse.
PB = orthofy(Pecmax, Braflo, Pecmax_Braflo)
# orthofy names its chromosome columns 'speciesA'/'speciesB'; this cell
# addresses them as 'A'/'B'. The rename is a no-op if they are already A/B.
PB = PB.rename(columns = {'speciesA' : 'A', 'speciesB' : 'B'})
PB = PB.dropna()

# Make matrix with corresponding chromosomes
# Row i pairs Sca[i] (Pecmax chromosome) with Amp[i] (Braflo chromosome) and
# labels that pair with Anc[i]; columns of ChrCorr are (Sca, Amp, Anc).
Amp = ['BFL_11', 'BFL_10', 'BFL_16', 'BFL_8', 'BFL_3', 'BFL_1', 'BFL_18', 'BFL_14', 'BFL_15', 'BFL_5', 'BFL_7', 'BFL_3', 'BFL_17', 'BFL_3', 'BFL_19', 'BFL_12', 'BFL_1', 'BFL_13', 'BFL_2', 'BFL_2', 'BFL_6', 'BFL_9', 'BFL_4', 'BFL_4']
Sca = ['PYE_10', 'PYE_13', 'PYE_1', 'PYE_1', 'PYE_17', 'PYE_5', 'PYE_19', 'PYE_15', 'PYE_4', 'PYE_6', 'PYE_7', 'PYE_2', 'PYE_18', 'PYE_2', 'PYE_3', 'PYE_14', 'PYE_16', 'PYE_2', 'PYE_4', 'PYE_9', 'PYE_8', 'PYE_3', 'PYE_11', 'PYE_12']
Anc = ['G', 'B1', 'B2', 'M', 'C2', 'A1aA1b', 'B3', 'P', 'L', 'EaEb', 'F', 'QbQa', 'J1', 'QcQd', 'O2', 'N', 'A2', 'H', 'J2', 'C1', 'D', 'K', 'I', 'O1']
ChrCorr = np.column_stack((Sca, Amp, Anc))

# Make dataframe with corresponding chromosomes
# One labelled slice per (Pecmax, Braflo) pair. .assign returns a new frame,
# so no column is written into a slice of PB, and the pieces are concatenated
# once rather than inside the loop.
# NOTE: the pair ('PYE_2', 'BFL_3') occurs twice (labels 'QbQa' and 'QcQd'),
# so its rows are added twice, once under each label.
PBparts = [PB.loc[(PB['A'] == sca) & (PB['B'] == amp)].assign(Chr = anc)
           for sca, amp, anc in ChrCorr]

# Manually add PYE_12
# Everything on PYE_12 that is not paired with BFL_4 is labelled 'R'.
PBorthologs = PB.loc[(PB['A'] == 'PYE_12') & (PB['B'] != 'BFL_4')].assign(Chr = 'R')
PBparts.append(PBorthologs)

PBgenes = pd.concat(PBparts)

# Duplicate the ortholog column so each species gets its own gene column
PBgenes['BGenes'] = PBgenes.loc[:, 'Orthologs']
PBgenes = PBgenes.rename(columns = {'Orthologs' : 'PGenes'})
# (the original line was missing the "', '" between 'B' and 'BGenes', which
# made the whole cell a SyntaxError)
PBgenes = PBgenes[['Chr', 'A', 'PGenes', 'B', 'BGenes']]

# Make reverse ortholog dictionaries (ortholog : gene name)
orthdictA = dict(zip(Pecmax_Braflo[:, 0], Pecmax_Braflo[:, 1]))
orthdictB = dict(zip(Pecmax_Braflo[:, 0], Pecmax_Braflo[:, 2]))

# Replace values
PBgenes['PGenes'] = PBgenes['PGenes'].map(lambda x: orthdictA.get(x, x))
PBgenes['BGenes'] = PBgenes['BGenes'].map(lambda x: orthdictB.get(x, x))

# Make dictionaries (H gene name : P/B gene name)
# As written, the keys are column 1 of Pecmax_Holleu and column 2 of
# Holleu_Braflo, and the values are the other species column of each table.
# Note that orthdictB is rebound here.
orthdictP = dict(zip(Pecmax_Holleu[:, 1], Pecmax_Holleu[:, 2]))
orthdictB = dict(zip(Holleu_Braflo[:, 2], Holleu_Braflo[:, 1]))

# Replace values
PBgenes['PGenes'] = PBgenes['PGenes'].map(lambda x: orthdictP.get(x, x))
PBgenes['BGenes'] = PBgenes['BGenes'].map(lambda x: orthdictB.get(x, x))

# Which rows carry a Holothuria gene name in each column (computed once)
P_is_holleu = PBgenes['PGenes'].str.contains('gene-HOLleu_')
B_is_holleu = PBgenes['BGenes'].str.contains('gene-HOLleu_')

# Select all values orthologous in both columns
Ancestor = PBgenes.loc[P_is_holleu & B_is_holleu]

Ancestor = Ancestor[['Chr', 'PGenes']]
Ancestor = Ancestor.rename(columns = {'Chr' : 'Chromosome',
                                      'PGenes' : 'Name'})

# Select all values orthologous only in one column
AncP = PBgenes.loc[P_is_holleu & (B_is_holleu == False)]
AncB = PBgenes.loc[B_is_holleu & (P_is_holleu == False)]

AncP = AncP[['Chr', 'PGenes']]
AncB = AncB[['Chr', 'BGenes']]

# Stack the three groups into one (Chromosome, Name) gene list
Ancestor = pd.concat([Ancestor, 
                      AncP.rename(columns = {'Chr' : 'Chromosome', 'PGenes' : 'Name'}), 
                      AncB.rename(columns = {'Chr' : 'Chromosome', 'BGenes' : 'Name'})])

All old

In [None]:
# Early script version of the ortholog-per-chromosome-pair analysis for
# Asterias rubens (A) and Holothuria leucospilota (H).
# NOTE(review): `ortholog`, `Adata`, `Hdata` and `val1` are not defined in any
# visible cell, so this cell cannot run on a fresh kernel as it stands.

# Make ortholog dictionaries
# (gene name -> ortholog ID; columns 1 and 2 hold the species A / H gene names)
Aorthdict = dict(zip(ortholog[ :,1], ortholog[ :,0]))
Horthdict = dict(zip(ortholog[ :,2], ortholog[ :,0]))

# Replace values with ortholog dictionary
Adata = Adata.replace({"Name": Aorthdict})

# Edit Hdata values (Hchr1 -> chr1)
val2 = ['Hchr1', 'Hchr2', 'Hchr3', 'Hchr4', 'Hchr5', 'Hchr6', 
        'Hchr7', 'Hchr8', 'Hchr9', 'Hchr10', 'Hchr11', 'Hchr12',
        'Hchr13', 'Hchr14', 'Hchr15', 'Hchr16', 'Hchr17', 
        'Hchr18', 'Hchr19', 'Hchr20', 'Hchr21', 'Hchr22', 'Hchr23']

# val2[i] is replaced by val1[i]; presumably val1 is ['chr1', ..., 'chr23']
# (see the comment above) — confirm, since val1 is defined elsewhere.
for i in range(0, 23):
    Hdata = Hdata.replace(val2[i], val1[i])

# Replace values with ortholog dictionary
Hdata = Hdata.replace({"Name" : Horthdict})

# Add ortholog column, value is 1 or 0
# (a name counts as an ortholog when it contains 'ortholog', case-insensitive)
Hdata["Ortholog"] = Hdata["Name"].apply(lambda x:1 if 'ortholog' in x.lower() else 0)
Adata["Ortholog"] = Adata["Name"].apply(lambda x:1 if 'ortholog' in x.lower() else 0)

# Make new dataframe with just the orthologs
# and a lookup from ortholog ID to the chromosome it sits on
Hortho = Hdata.loc[Hdata['Ortholog'] == 1]
Hdict = dict(zip(Hortho.Name, Hortho.Chromosome))

Aortho = Adata.loc[Adata['Ortholog'] == 1]
Adict = dict(zip(Aortho.Name, Aortho.Chromosome))

# Calculate number of orthologs for each chromosome
HChr = []
for i in val1:
    HChr.append(len(Hortho.loc[(Hortho['Chromosome'] == i)]))
print(HChr)

AChr = []
for i in val1:
    AChr.append(len(Aortho.loc[(Aortho['Chromosome'] == i)]))
print(AChr)

# Make new dataframe
# All three columns start as the ortholog ID; 'A' and 'H' are then mapped to
# chromosomes below.
Odata = pd.DataFrame()
Odata['Orthologs'] = ortholog[:, 0]
Odata['A'] = ortholog[:, 0]
Odata['H'] = ortholog[:, 0]

# Replace location in A and H with orthology dictionary keys
Odata['A'] = Odata['A'].map(Adict)
Odata['H'] = Odata['H'].map(Hdict)

# Calculate number of orthologs for each pair of chromosomes
# (after this, 'Orthologs' holds a count per (A, H) pair)
Odata = Odata.groupby(['A', 'H']).count()
Odata = Odata.reset_index()

# Plot
# One point per chromosome pair; size and colour both encode the count.
sns.scatterplot(data = Odata, x = 'A', y = 'H', size = 'Orthologs', hue = 'Orthologs', palette = "crest")
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=10)
plt.xlabel("Asterias rubens")
plt.ylabel("Holothuria leucospilota")
plt.xticks(rotation='vertical')

plt.show()

# Removing values <100:
minOdata = Odata.loc[Odata["Orthologs"] >= 100]

# Plot
# Same plot restricted to pairs with at least 100 orthologs. The rcParams
# change sets the default figure size for every later figure as well.
plt.rcParams['figure.figsize'] = [8, 8]
sns.scatterplot(data = minOdata, x = 'A', y = 'H', size = 'Orthologs', hue = 'Orthologs', palette = "crest")
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=10)
plt.xlabel("Asterias rubens")
plt.ylabel("Holothuria leucospilota")
plt.xticks(rotation='vertical')

plt.show()

# Hypergeometric test
def hypertest(chrA, chrB, dataset = Odata, speciesA = 'A', speciesB = 'B'):
    """
    Test whether two chromosomes share more orthologs than expected by chance.

    chrA: chromosome name looked up in the dataset's 'A' column
    chrB: chromosome name looked up in the dataset's 'H' column
    dataset: table with one row per chromosome pair and an 'Orthologs' count
    speciesA, speciesB: accepted for backward compatibility but not used;
                        the columns read are always 'A' and 'H'

    M: total number of orthologs in the dataset (population size)
    nA, nB: number of orthologs on chrA and chrB individually
    x: number of orthologs on both chrA and chrB

    Prints nA, nB, M, x and p, and returns p = P(X >= x).
    """
    nA = dataset.loc[(dataset['A'] == chrA), 'Orthologs'].sum()
    nB = dataset.loc[(dataset['H'] == chrB), 'Orthologs'].sum()
    x = dataset.loc[(dataset['A'] == chrA) & (dataset['H'] == chrB), 'Orthologs'].sum()

    # The population is every ortholog in the table, not just those on the two
    # chromosomes being compared (previously nA + nB was used).
    M = dataset['Orthologs'].sum()

    # sf(k) is P(X > k), so "at least x shared orthologs" needs sf(x - 1)
    p = stats.hypergeom.sf(x - 1, M, nA, nB)
     
    print(nA, nB, M, x, p)

    return p
    
hypertest('chr1', 'chr3')

In [None]:
    # Pick one rearrangement event according to r. Every branch rebinds
    # speciesA to the helper's first return value, stores the returned log
    # under the key 'EVENT_<event + 1>' and prints it.
    # NOTE(review): this cell is a fragment. The enclosing loop and the
    # definitions of r, event, events, ancestor, speciesA, fission,
    # translocation, fusion and syntenyloss are not in the visible notebook.
    # Presumably r is a random number in [0, 1), which would make the
    # thresholds event probabilities — confirm.
    if r <= 0.30:
        # Fission is skipped (next loop iteration) when `ancestor` has fewer
        # than two entries.
        if len(ancestor) < 2: continue
        speciesA, log = fission(speciesA)
        events['EVENT_' + str(event + 1)] = log
        print(log)
    
    # 0.30 < r <= 0.45
    elif r <= 0.45:
        speciesA, log = translocation(speciesA)
        events['EVENT_' + str(event + 1)] = log
        print(log)
    
    # 0.45 < r <= 0.70: fusion with the helper's default `mixing`
    elif r <= 0.70:
        speciesA, log = fusion(speciesA)
        events['EVENT_' + str(event + 1)] = log
        print(log)
    
    # 0.70 < r <= 0.95: fusion with mixing = 0.5
    elif r <= 0.95:
        speciesA, log = fusion(speciesA, mixing = 0.5)
        events['EVENT_' + str(event + 1)] = log
        print(log)
        
    # r > 0.95
    else:
        speciesA, log = syntenyloss(speciesA)
        events['EVENT_' + str(event + 1)] = log
        print(log)
        continue