In [135]:
# Import packages
import pandas as pd
import numpy as np
import scipy.stats as stats
import pingouin as pg
import matplotlib.pyplot as plt
import seaborn as sns

import Orthoscripts

# disable chained assignments
pd.options.mode.chained_assignment = None 

#### Import data

In [136]:
# Asterias rubens
Astrub = Orthoscripts.readBED("Data/Genelists/Asterias.rubens.genelist.bed")

# Holothuria leucospilota
Holleu = Orthoscripts.readBED("Data/Genelists/Holothuria.leucospilota.genelist.bed")

# Paracentrotus livides
Parliv = Orthoscripts.readBED("Data/Genelists/Paracentrotus.lividus.genelist.bed")

# Branchiostoma lanceolatum
Bralan = Orthoscripts.readBED("Data/Genelists/Branchiostoma.lanceolatum.genelist.bed")

# Branchiostoma floridae
Braflo = np.loadtxt("Braflo_info_ra.txt", dtype = "str")
Braflo = pd.DataFrame(Braflo, columns = ['Name', 'Chromosome', 'Start', 'End', 'Dot'])

# Marthasterias glacialis
Margla = Orthoscripts.readBED("Data/Genelists/Marthasterias.glacialis.genelist.bed")

# Pecten maximus
Pecmax = Orthoscripts.readBED("Data/Genelists/Pecten.maximus.genelist.bed")

# Stichopus chloronotus
Stichl = Orthoscripts.readBED("Data/Genelists/Stichopus.chloronotus.genelist.bed")

In [137]:
# Import orthologs
Astrub_Holleu = np.loadtxt("Data/Orthologs/Asterias.rubens+Holothuria.leucospilota.txt", dtype = "str")

Astrub_Parliv = np.loadtxt("Data/Orthologs/Asterias.rubens+Paracentrotus.lividus.txt", dtype = "str")

Holleu_Parliv = np.loadtxt("Data/Orthologs/Holothuria.leucospilota+Paracentrotus.lividus.txt", dtype = "str")

Margla_Bralan = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Branchiostoma.lanceolatum.txt", dtype = "str")

Margla_Pecmax = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Pecten.maximus.txt", dtype = "str")

Margla_Stichl = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Stichopus.chloronotus.txt", dtype = "str")

Pecmax_Bralan = np.loadtxt("Data/Orthologs/Pecten.maximus+Branchiostoma.lanceolatum.txt", dtype = "str")

Stichl_Bralan = np.loadtxt("Data/Orthologs/Stichopus.chloronotus+Branchiostoma.lanceolatum.txt", dtype = "str")

Stichl_Pecmax = np.loadtxt("Data/Orthologs/Stichopus.chloronotus+Pecten.maximus.txt", dtype = "str")

Pecmax_Holleu = np.loadtxt("orthology_pipeline/orthologs/Pecmax+Holleu_sensitive.txt", dtype = "str")

Holleu_Bralan = np.loadtxt("orthology_pipeline/orthologs/Holleu+Bralan_sensitive.txt", dtype = "str")

Pecmax_Bralan = np.loadtxt("orthology_pipeline/orthologs/Pecmax+Bralan_sensitive.txt", dtype = "str")

Pecmax_Braflo = np.loadtxt("orthology_pipeline/orthologs/Pecmax+Braflo_sensitive.txt", dtype = "str")

Holleu_Braflo = np.loadtxt("orthology_pipeline/orthologs/Holleu+Braflo_sensitive.txt", dtype = "str")

#### Sorting out the data

In [138]:
# Asterias data: removing non-chromosomal values
Astrub = Astrub.loc[Astrub['Chromosome'].str.contains('chr')]

# Paracentrotus data: select chromosomal scaffolds
Parliv_ortho = Parliv.groupby('Chromosome').size()
Parliv_ortho = Parliv_ortho.reset_index()

Parliv_ortho.columns = ['Chromosome', 'Count']
Parliv_ortho = Parliv_ortho.loc[Parliv_ortho['Count'] >= 100]

scaffolds = Parliv_ortho.Chromosome.tolist() # Remove all values from non-chromosome scaffolds
Parliv = Parliv.loc[Parliv['Chromosome'].isin(scaffolds)]

# Branchiostoma data: removing non-chromosomal values
Bralan = Bralan.loc[Bralan['Chromosome'].str.contains('BFL_')]
Braflo = Braflo.loc[Braflo['Chromosome'].str.contains('BFL_')]

# Pecten data: Calculate number of genes on each scaffold
Pecmax_ortho = Pecmax.groupby('Chromosome').size()
Pecmax_ortho = Pecmax_ortho.reset_index()

Pecmax_ortho.columns = ['Chromosome', 'Count']
Pecmax_ortho = Pecmax_ortho.loc[Pecmax_ortho['Count'] >= 300]

scaffolds = Pecmax_ortho.Chromosome.tolist() # Remove all values from non-chromosome scaffolds
Pecmax = Pecmax.loc[Pecmax['Chromosome'].isin(scaffolds)]

# Replace scaffold names with chromosome names
Pecmax = Pecmax.replace('HiC_scaffold_2', 'PYE_1')
Pecmax = Pecmax.replace('HiC_scaffold_18', 'PYE_2')
Pecmax = Pecmax.replace('HiC_scaffold_19', 'PYE_3')
Pecmax = Pecmax.replace('HiC_scaffold_16', 'PYE_4')
Pecmax = Pecmax.replace('HiC_scaffold_4', 'PYE_5')
Pecmax = Pecmax.replace('HiC_scaffold_7', 'PYE_6')
Pecmax = Pecmax.replace('HiC_scaffold_11', 'PYE_7')
Pecmax = Pecmax.replace('HiC_scaffold_15', 'PYE_8')
Pecmax = Pecmax.replace('HiC_scaffold_17', 'PYE_9')
Pecmax = Pecmax.replace('HiC_scaffold_10', 'PYE_10')
Pecmax = Pecmax.replace('HiC_scaffold_1', 'PYE_11')
Pecmax = Pecmax.replace('HiC_scaffold_3', 'PYE_12')
Pecmax = Pecmax.replace('HiC_scaffold_12', 'PYE_13')
Pecmax = Pecmax.replace('HiC_scaffold_5', 'PYE_14')
Pecmax = Pecmax.replace('HiC_scaffold_9', 'PYE_15')
Pecmax = Pecmax.replace('HiC_scaffold_14', 'PYE_16')
Pecmax = Pecmax.replace('HiC_scaffold_13', 'PYE_17')
Pecmax = Pecmax.replace('HiC_scaffold_8', 'PYE_18')
Pecmax = Pecmax.replace('HiC_scaffold_6', 'PYE_19')

# MB orthologies: remove prefixes on gene names
Margla_Bralan = pd.DataFrame(Margla_Bralan, columns = ['Code', 'A', 'P'])
Margla_Bralan['P'] = Margla_Bralan['P'].str.removeprefix('Parliv_')
Margla_Bralan = Margla_Bralan.to_numpy()

# MarBra orthologies: remove suffix
Margla_Bralan = pd.DataFrame(Margla_Bralan, columns = ['Code', 'M', 'B'])
Margla_Bralan['M'] = Margla_Bralan['M'].str.removesuffix('.1')
Margla_Bralan = Margla_Bralan.to_numpy()

# MS orthologies: remove suffix
Margla_Stichl = pd.DataFrame(Margla_Stichl, columns = ['Code', 'M', 'S'])
Margla_Stichl['M'] = Margla_Stichl['M'].str.removesuffix('.1')
Margla_Stichl['S'] = Margla_Stichl['S'].str.removesuffix('.1')
Margla_Stichl = Margla_Stichl.to_numpy()

# MP orthologies: remove suffix
Margla_Pecmax = pd.DataFrame(Margla_Pecmax, columns = ['Code', 'M', 'P'])
Margla_Pecmax['M'] = Margla_Pecmax['M'].str.removesuffix('.1')
Margla_Pecmax = Margla_Pecmax.to_numpy()

# AncBra orthology: 
Holleu_Bralan = pd.DataFrame(Holleu_Bralan, columns = ['Code', 'A', 'B'])
Holleu_Bralan['B'] = Holleu_Bralan['B'].str.rsplit('_').str.get(0)
Holleu_Bralan = Holleu_Bralan.to_numpy()

In [154]:
def orthofy(genelistA, genelistB, orthologies):
    
    """
    inputs:
    genelistA: gene list for species A
    genelistB: gene list for species B
    orthologies: orthology dataset
    
    outputs: dataframe with significant ortholog combinations 
             and their location in species A and B and p-Values
    """
    
    # Make ortholog dictionaries (ortholog : gene name)
    orthdictA = dict(zip(orthologies[:, 1], orthologies[:, 0]))
    orthdictB = dict(zip(orthologies[:, 2], orthologies[:, 0]))

    # Replace genelist values with ortholog dictionary keys
    genelistA['Name'] = genelistA['Name'].map(lambda x: orthdictA.get(x, x))
    genelistB['Name'] = genelistB['Name'].map(lambda x: orthdictB.get(x, x))
    
    # Make orthology location dictionaries (ortholog : chromosome)
    dictA = dict(zip(genelistA.loc[genelistA['Name'].str.contains('ortholog')].Name, 
                     genelistA.loc[genelistA['Name'].str.contains('ortholog')].Chromosome))
    dictB = dict(zip(genelistB.loc[genelistB['Name'].str.contains('ortholog')].Name, 
                     genelistB.loc[genelistB['Name'].str.contains('ortholog')].Chromosome))
    
    # Seperate all orthology entries into new dataframe
    AB_data = pd.DataFrame({'Orthologs': orthologies[:, 0],
                            'A' : orthologies[:, 0],
                            'B' : orthologies[:, 0]})
    
    # Replace location in A and B with ortholog location dictionary keys
    AB_data['A'] = AB_data['A'].map(dictA)
    AB_data['B'] = AB_data['B'].map(dictB)
    
    return AB_data

PB = orthofy(Pecmax, Braflo, Pecmax_Braflo)
PB = PB.dropna()

# Make matrix with corresponding chromosomes
Amp = ['BFL_11', 'BFL_10', 'BFL_16', 'BFL_8', 'BFL_3', 'BFL_1', 'BFL_18', 'BFL_14', 'BFL_15', 'BFL_5', 'BFL_7', 'BFL_3', 'BFL_17', 'BFL_3', 'BFL_19', 'BFL_12', 'BFL_1', 'BFL_13', 'BFL_2', 'BFL_2', 'BFL_6', 'BFL_9', 'BFL_4', 'BFL_4']
Sca = ['PYE_10', 'PYE_13', 'PYE_1', 'PYE_1', 'PYE_17', 'PYE_5', 'PYE_19', 'PYE_15', 'PYE_4', 'PYE_6', 'PYE_7', 'PYE_2', 'PYE_18', 'PYE_2', 'PYE_3', 'PYE_14', 'PYE_16', 'PYE_2', 'PYE_4', 'PYE_9', 'PYE_8', 'PYE_3', 'PYE_11', 'PYE_12']
Anc = ['G', 'B1', 'B2', 'M', 'C2', 'A1aA1b', 'B3', 'P', 'L', 'EaEb', 'F', 'QbQa', 'J1', 'QcQd', 'O2', 'N', 'A2', 'H', 'J2', 'C1', 'D', 'K', 'I', 'O1']
ChrCorr = np.column_stack((Sca, Amp, Anc))

# Make dataframe with corresponding chromosomes
PBgenes = pd.DataFrame()
for i in range (0, 24): 
    PBorthologs = PB.loc[(PB['A'] == ChrCorr[i, 0]) & (PB['B'] == ChrCorr[i, 1])]
    PBorthologs['Chr'] = ChrCorr[i, 2]

    PBgenes = pd.concat([PBgenes, PBorthologs])

# Manually add PYE_12
PBorthologs = PB.loc[(PB['A'] == 'PYE_12') & (PB['B'] != 'BFL_4')]
PBorthologs['Chr'] = 'R'
PBgenes = pd.concat([PBgenes, PBorthologs])

PBgenes['BGenes'] = PBgenes.loc[:, 'Orthologs']
PBgenes = PBgenes.rename(columns = {'Orthologs' : 'PGenes'})
PBgenes = PBgenes[['Chr', 'A', 'PGenes', 'B', 'BGenes']]

# Make reverse ortholog dictionaries (ortholog : gene name)
orthdictA = dict(zip(Pecmax_Braflo[:, 0], Pecmax_Braflo[:, 1]))
orthdictB = dict(zip(Pecmax_Braflo[:, 0], Pecmax_Braflo[:, 2]))

# Replace values
PBgenes['PGenes'] = PBgenes['PGenes'].map(lambda x: orthdictA.get(x, x))
PBgenes['BGenes'] = PBgenes['BGenes'].map(lambda x: orthdictB.get(x, x))

# Make dictionaries (H gene name : P/B gene name)
orthdictP = dict(zip(Pecmax_Holleu[:, 1], Pecmax_Holleu[:, 2]))
orthdictB = dict(zip(Holleu_Braflo[:, 2], Holleu_Braflo[:, 1]))

# Replace values
PBgenes['PGenes'] = PBgenes['PGenes'].map(lambda x: orthdictP.get(x, x))
PBgenes['BGenes'] = PBgenes['BGenes'].map(lambda x: orthdictB.get(x, x))

# Select all values orthologous in both columns
Ancestor = PBgenes.loc[(PBgenes['PGenes'].str.contains('gene-HOLleu_')) & 
                       (PBgenes['BGenes'].str.contains('gene-HOLleu_'))]

Ancestor = Ancestor.rename(columns = {'Chr' : 'Chromosome',
                                      'PGenes' : 'Name', 
                                      'A' : 'Pchr',
                                      'B' : 'Bchr'})
Ancestor = Ancestor[['Chromosome', 'Name', 'Pchr', 'Bchr']]

In [155]:
Ancestor

Unnamed: 0,Chromosome,Name,Pchr,Bchr
31,G,gene-HOLleu_09740,PYE_10,BFL_11
36,G,gene-HOLleu_10130,PYE_10,BFL_11
42,G,gene-HOLleu_10648,PYE_10,BFL_11
53,G,gene-HOLleu_11649,PYE_10,BFL_11
57,G,gene-HOLleu_10002,PYE_10,BFL_11
...,...,...,...,...
6127,R,gene-HOLleu_02690,PYE_12,BFL_5
6130,R,gene-HOLleu_04213,PYE_12,BFL_3
6131,R,gene-HOLleu_00369,PYE_12,BFL_11
6132,R,gene-HOLleu_00278,PYE_12,BFL_3


In [156]:
def orthofy(genelistA, genelistB, orthologies):
    
    """
    inputs:
    genelistA: gene list for species A
    genelistB: gene list for species B
    orthologies: orthology dataset
    
    outputs: dataframe with significant ortholog combinations 
             and their location in species A and B and p-Values
    """
    
    # Make ortholog dictionaries (ortholog : gene name)
    orthdictA = dict(zip(orthologies[:, 1], orthologies[:, 0]))
    orthdictB = dict(zip(orthologies[:, 2], orthologies[:, 0]))

    # Replace genelist values with ortholog dictionary keys
    genelistA['Name'] = genelistA['Name'].map(lambda x: orthdictA.get(x, x))
    genelistB['Name'] = genelistB['Name'].map(lambda x: orthdictB.get(x, x))
    
    # Make orthology location dictionaries (ortholog : chromosome)
    dictA = dict(zip(genelistA.loc[genelistA['Name'].str.contains('ortholog')].Name, 
                     genelistA.loc[genelistA['Name'].str.contains('ortholog')].Chromosome))
    dictB = dict(zip(genelistB.loc[genelistB['Name'].str.contains('ortholog')].Name, 
                     genelistB.loc[genelistB['Name'].str.contains('ortholog')].Chromosome))
    
    # Seperate all orthology entries into new dataframe
    AB_data = pd.DataFrame({'Orthologs': orthologies[:, 0],
                            'A' : orthologies[:, 0],
                            'B' : orthologies[:, 0]})
    
    # Replace location in A and B with ortholog location dictionary keys
    AB_data['A'] = AB_data['A'].map(dictA)
    AB_data['B'] = AB_data['B'].map(dictB)

    # Calculate number of orthologs for each pair of chromosomes
    AB_data = AB_data.groupby(['A', 'B']).count().reset_index()
    
    return AB_data

Unnamed: 0,A,B,Orthologs
0,A1aA1b,BFL_1,6
1,A1aA1b,BFL_10,74
2,A1aA1b,BFL_11,1
3,A1aA1b,BFL_12,3
4,A1aA1b,BFL_13,1
...,...,...,...
361,R,BFL_3,3
362,R,BFL_4,7
363,R,BFL_5,3
364,R,BFL_6,4


Figure out why the function messes up chromosome correspondance

In [182]:
genelistA = Ancestor
genelistB = Braflo
orthologies = Holleu_Braflo

# Make ortholog dictionaries (ortholog : gene name)
orthdictA = dict(zip(orthologies[:, 1], orthologies[:, 0]))
orthdictB = dict(zip(orthologies[:, 2], orthologies[:, 0]))

# Replace genelist values with ortholog dictionary keys
genelistA['Name'] = genelistA['Name'].map(lambda x: orthdictA.get(x, x))
genelistB['Name'] = genelistB['Name'].map(lambda x: orthdictB.get(x, x))

# Make orthology location dictionaries (ortholog : chromosome)
dictA = dict(zip(genelistA.loc[genelistA['Name'].str.contains('ortholog')].Name, 
                    genelistA.loc[genelistA['Name'].str.contains('ortholog')].Chromosome))
dictB = dict(zip(genelistB.loc[genelistB['Name'].str.contains('ortholog')].Name, 
                    genelistB.loc[genelistB['Name'].str.contains('ortholog')].Chromosome))

# Seperate all orthology entries into new dataframe
AB_data = pd.DataFrame({'Orthologs': orthologies[:, 0],
                        'A' : orthologies[:, 0],
                        'B' : orthologies[:, 0]})

# Replace location in A and B with ortholog location dictionary keys
AB_data['A'] = AB_data['A'].map(dictA)
AB_data['B'] = AB_data['B'].map(dictB)

# Calculate number of orthologs for each pair of chromosomes
AB_data = AB_data.groupby(['A', 'B']).count().reset_index()

In [183]:
AB_data

Unnamed: 0,A,B,Orthologs
0,A1aA1b,BFL_1,6
1,A1aA1b,BFL_10,74
2,A1aA1b,BFL_11,1
3,A1aA1b,BFL_12,3
4,A1aA1b,BFL_13,1
...,...,...,...
361,R,BFL_3,3
362,R,BFL_4,7
363,R,BFL_5,3
364,R,BFL_6,4
