## Imports

In [1]:
# Import packages
import pandas as pd
import numpy as np

import scipy.stats as stats
import pingouin as pg

import random
from random import randrange

import matplotlib.pyplot as plt
import seaborn as sns

import argparse

import Orthoscripts

pd.options.mode.chained_assignment = None 

#### Import genelists

In [2]:
# Asterias rubens
Astrub = Orthoscripts.readBED("Data/Genelists/Asterias.rubens.genelist.bed")

# Holothuria leucospilota
Holleu = Orthoscripts.readBED("Data/Genelists/Holothuria.leucospilota.genelist.bed")

# Paracentrotus livides
Parliv = Orthoscripts.readBED("Data/Genelists/Paracentrotus.lividus.genelist.bed")

# Branchiostoma lanceolatum
Bralan = Orthoscripts.readBED("Data/Genelists/Branchiostoma.lanceolatum.genelist.bed")

# Branchiostoma floridae
Braflo = Orthoscripts.readBED("Data/Genelists/Branchiostoma.floridae.genelist.bed", 's')

# Marthasterias glacialis
Margla = Orthoscripts.readBED("Data/Genelists/Marthasterias.glacialis.genelist.bed")

# Pecten maximus
Pecmax = Orthoscripts.readBED("Data/Genelists/Pecmax.genelist.bed", 's')

# Stichopus chloronotus
Stichl = Orthoscripts.readBED("Data/Genelists/Stichopus.chloronotus.genelist.bed")

# Amphiura filiformis 
Ampfil = Orthoscripts.readBED("Data/Genelists/Amphiura.filiformis.genelist.bed")

# Ephydatia muelleri
Ephmue = Orthoscripts.readBED("Data/Genelists/Ephmue.genelist.bed", 's')

# Ancestor 
AniAnc = Orthoscripts.readBED("Data/Genelists/AniAnc.genelist.bed", 's')
BilAnc = Orthoscripts.readBED("Data/Genelists/BilAnc.genelist.bed", 's')

#### Import ortholog files

In [3]:
# Import orthologs
Astrub_Holleu = np.loadtxt("Data/Orthologs/Asterias.rubens+Holothuria.leucospilota.txt", dtype = "str")

Astrub_Parliv = np.loadtxt("Data/Orthologs/Asterias.rubens+Paracentrotus.lividus.txt", dtype = "str")

Holleu_Parliv = np.loadtxt("Data/Orthologs/Holothuria.leucospilota+Paracentrotus.lividus.txt", dtype = "str")

Margla_Bralan = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Branchiostoma.lanceolatum.txt", dtype = "str")

Margla_Pecmax = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Pecten.maximus.txt", dtype = "str")

Margla_Stichl = np.loadtxt("Data/Orthologs/Marthasterias.glacialis+Stichopus.chloronotus.txt", dtype = "str")

Pecmax_Bralan = np.loadtxt("Data/Orthologs/Pecten.maximus+Branchiostoma.lanceolatum.txt", dtype = "str")

Stichl_Bralan = np.loadtxt("Data/Orthologs/Stichopus.chloronotus+Branchiostoma.lanceolatum.txt", dtype = "str")

Stichl_Pecmax = np.loadtxt("Data/Orthologs/Stichopus.chloronotus+Pecten.maximus.txt", dtype = "str")

Pecmax_Holleu = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Holleu_sensitive.txt", dtype = "str")

Holleu_Bralan = np.loadtxt("Orthology pipeline/orthologs/Holleu+Bralan_sensitive.txt", dtype = "str")

Pecmax_Bralan = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Bralan_sensitive.txt", dtype = "str")

Pecmax_Braflo = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Braflo_sensitive.txt", dtype = "str")

Holleu_Braflo = np.loadtxt("Orthology pipeline/orthologs/Holleu+Braflo_sensitive.txt", dtype = "str")

Holleu_Ampfil = np.loadtxt("Data/Orthologs/Holothuria.leucospilota+Amphiura.filiformis.txt", dtype = "str")

Braflo_Ephmue = np.loadtxt("Orthology pipeline/orthologs/Braflo+Ephmue_sensitive.txt", dtype = "str")

Holleu_Ephmue = np.loadtxt("Orthology pipeline/orthologs/Holleu+Ephmue_sensitive.txt", dtype = "str")

Pecmax_Ephmue = np.loadtxt("Orthology pipeline/orthologs/Pecmax+Ephmue_sensitive.txt", dtype = "str")

#### Sorting out the data

In [4]:
Astrub = Astrub.loc[Astrub['Chromosome'].str.contains('chr')]
Bralan = Bralan.loc[Bralan['Chromosome'].str.contains('BFL_')]
Braflo = Braflo.loc[Braflo['Chromosome'].str.contains('BFL_')]
Pecmax = Pecmax.loc[Pecmax['Chromosome'].str.contains('PYE_')]
Ephmue = Ephmue.loc[Ephmue['Chromosome'].str.contains('EMU_')]

# Ephmue genelist: remove suffix
Ephmue['Name'] = Ephmue['Name'].str.rsplit('.t1').str.get(0)

# Parliv genelist: select chromosomal scaffolds
Parliv = Orthoscripts.unscaff(Parliv, 100)
Ampfil = Orthoscripts.unscaff(Ampfil, 100)
Ephmue = Orthoscripts.unscaff(Ephmue, 600)

Astrub_Parliv = Orthoscripts.orthFix(Astrub_Parliv, 'B', 'Parliv_', 1)
Margla_Bralan = Orthoscripts.orthFix(Margla_Bralan, 'A', '.1', 0)
Margla_Stichl = Orthoscripts.orthFix(Margla_Stichl, 'A', '.1', 0)
Margla_Stichl = Orthoscripts.orthFix(Margla_Stichl, 'B', '.1', 0)
Margla_Pecmax = Orthoscripts.orthFix(Margla_Pecmax, 'B', '.1', 0)
Holleu_Ampfil = Orthoscripts.orthFix(Holleu_Ampfil, 'B', '.1', 0)
Holleu_Bralan = Orthoscripts.orthFix(Holleu_Bralan, 'B', '_', 0)

### 
-----

### Plots

In [None]:
data = Orthoscripts.orthofind(Braflo, Ephmue, Braflo_Ephmue)
Orthoscripts.orthoplot(data, 'Amphioxus', 'Sponge', 'A', 'B')

------------------------------

### Test simulations

In [42]:
def rearrangements(data):
    # Converts table into dotplot
    fissions = data.pivot(index = 'A', columns='B', values = 'Orthologs')
    
    # Picks out all rows and columns with more than one dot
    fissions = fissions.loc[(fissions.where(fissions.isnull(), 1).sum(axis=1) > 1) | (fissions.sum(axis=0) > 1)]
    fissions = fissions.stack(dropna = True).reset_index().groupby('A')['B'].apply(list).reset_index(name = 'B')
    fissions['B'] = [', '.join(map(str, l)) for l in fissions['B']] # Convert list to str

    # Identify all translocations
    translocations = fissions.groupby('B').filter(lambda g: len(g) > 1)
    # Remove all translocations from list of fissions
    fissions = fissions[~ fissions.isin(translocations)]
    fissions.dropna(inplace = True)

    # Converts table into dotplot
    fusions = data.pivot(index = 'B', columns = 'A', values = 'Orthologs')
    fusions = fusions.loc[(fusions.where(fusions.isnull(), 1).sum(axis=1) > 1) | (fusions.sum(axis=0) > 1)]
    fusions = fusions.stack(dropna = True).reset_index()
    
    # Picks out all rows and columns with more than one dot
    translocations = fusions.groupby('A').filter(lambda g: len(g) > 1)
    
    # Remove all translocations from list of fissions
    fusions = fusions[~ fusions.isin(translocations)]

    # Identify and isolate the translocations
    translocations = translocations.groupby('B')['A'].apply(list).reset_index()
    translocations['A'] = [', '.join(map(str, l)) for l in translocations['A']]
    translocations = translocations.groupby('A')['B'].apply(list).reset_index()
    translocations['B'] = [', '.join(map(str, l)) for l in translocations['B']]

    fusions = fusions.groupby('B')['A'].apply(list).reset_index(name = 'A')
    fusions['A'] = [', '.join(map(str, l)) for l in fusions['A']]

    events = []
    for index, row in fissions.iterrows():
        events.append(''.join(('Fission of ancestral chromosome ', row['A'], ' into ', row['B'])))

    for index, row in fusions.iterrows():
        events.append(''.join(('Fusion of ancestral chromosome ', row['A'], ' into ', row['B'])))
        
    for index, row in translocations.iterrows():
        events.append(''.join(('Translocation of ancestral chromosome ', row['A'], ' into ', row['B'])))
    
    return events

In [43]:
rearrangements(simdata)

['Fission of ancestral chromosome AncChr1 into Chr1_1, Chr1_2',
 'Fission of ancestral chromosome AncChr10 into Chr10_1, Chr10_2',
 'Fission of ancestral chromosome AncChr18 into Chr18_1, Chr18_2',
 'Fission of ancestral chromosome AncChr2 into Chr2_1, Chr2_2',
 'Fusion of ancestral chromosome Chr12x9 into AncChr12, AncChr9',
 'Fusion of ancestral chromosome Chr17x13 into AncChr13, AncChr17',
 'Fusion of ancestral chromosome Chr3+16 into AncChr16, AncChr3',
 'Fusion of ancestral chromosome Chr6+11 into AncChr11, AncChr6',
 'Fusion of ancestral chromosome Chr8x7 into AncChr7, AncChr8',
 'Translocation of ancestral chromosome AncChr14, AncChr19 into Chr14;19, Chr19;14']

In [63]:
# counts the rearrangements
for i in range(100+1):
    input = 'Simulations/Ancestor_' + str(i) + '.bed'
    ancestor = Orthoscripts.readBED(input, 's')
    input = 'Simulations/SpeciesA_' + str(i) + '.bed'
    speciesA = Orthoscripts.readBED(input, 's')
    input = 'Simulations/Ancestor+SpeciesA_' + str(i) + '.txt'
    orthos = np.loadtxt(input, dtype = "str")

    data = Orthoscripts.orthologies(ancestor, speciesA, orthos)
    
    outfile = 'Simulations/Rearrangements_' + str(i) + '.txt'
    Orthoscripts.rearrangements(data, outfile)

In [13]:
def simulator(Nchr = 20, Ngene = 100, Nevents = 10, Nruns = 1):
    def makeancestor(Nchr, Ngene):
        ancestor = pd.DataFrame(columns = ['Chromosome'])
        for i in range(Nchr):
            row = {'Chromosome' : (i + 1)}
            for i in range(Ngene):
                    ancestor = pd.concat([ancestor, pd.DataFrame([row])], ignore_index = True)
        ancestor['Name'] = (ancestor.reset_index().index + 1)

        return ancestor

    # Dummy BED files :: type 'anc' for ancestor, 'des' for descendant
    def dummyBED(genome, type):
        if type == 'anc':
            genome['Chromosome'] = 'AncChr' + genome['Chromosome'].astype(str)
            genome['Name'] = 'ancg_' + genome['Name'].astype(str)
            
        if type == 'des':
            genome['Chromosome'] = 'Chr' + genome['Chromosome'].astype(str)
            genome['Name'] = 'g_' + genome['Name'].astype(str)
        
        genome['Start'] = np.arange(len(genome))
        genome['End'] = np.arange(len(genome)) + 5
        
        genome = genome[['Chromosome', 'Start', 'End', 'Name']]
            
        return genome

    # Dummy ortholog file
    def dummyOrthologs(genome):
        orthologs = pd.DataFrame()
        
        orthologs['Orthologs'] = np.arange(len(genome)) + 1
        orthologs['speciesA'] = np.arange(len(genome)) + 1
        orthologs['speciesB'] = np.arange(len(genome)) + 1
        
        orthologs['Orthologs'] = 'orthologs_' + orthologs['Orthologs'].astype(str)
        orthologs['speciesA'] = 'ancg_' + orthologs['speciesA'].astype(str)
        orthologs['speciesB'] = 'g_' + orthologs['speciesB'].astype(str)
        
        orthologs = orthologs.to_numpy()
        
        return orthologs

    def mixing(genome, mixing):
        genes = genome['Name'].to_numpy()
        n = len(genes)
        for i in range(int(mixing * n)):
            g1, g2 = randrange(n), randrange(n)
            genes[g2], genes[g1] = genes[g1], genes[g2]

            genome['Name'] = genes
            # genome['Chromosome'] = f'{fuse1}x{fuse2}'
            
    def fusion(genome, chr, mixing = 0):
        '''
        inputs: 
        ancestor : df with chromosome name | gene name
        mixing : float between 0 and 1, where 1 implies extreme mixing and 0 implies no mixing
        '''
        
        # Randomly select two chromosomes to fuse
        A = random.choice(chr)
        B = random.choice(chr)
        
        chr = [x for x in chr if x not in (A, B)]
        
        if A == B: # Just so the same chromosome isn't selected twice
            B = random.choice(chr)

        fusion = ancestor.loc[ancestor['Chromosome'].isin([A, B])]
        
        # Apply mixing if required
        if mixing > 0:
            genes = fusion['Name'].to_numpy()
            n = len(genes)
            for i in range(int(mixing * n)):
                g1, g2 = randrange(n), randrange(n)
                genes[g2], genes[g1] = genes[g1], genes[g2]

            fusion['Name'] = genes
            fusion['Chromosome'] = f'{A}x{B}'
            log = f'Fusion of ancestral chromosome AncChr{A}, AncChr{B} into Chr{A}x{B}'
            
        else:
            fusion['Chromosome'] = f'{A}+{B}'
            log = f'Fusion of ancestral chromosome AncChr{A}, AncChr{B} into Chr{A}+{B}'
        
        # Remove the unfused chromosomes
        genome.drop(genome[genome['Chromosome'].isin([A, B])].index, inplace = True)
        genome = pd.concat([genome, fusion])
        
        return genome, log, chr

    def fission(genome, chr):
        # Randomly select a chromosome for fission
        A = random.choice(chr)
        fission = genome.loc[genome['Chromosome'] == A]
        chr.remove(A)
        
        pos = random.choice(range(1, Ngene))

        # Add the new chromosomes back into the genome
        chr1 = fission.iloc[: pos]
        chr1['Chromosome'] = f'{A}_1'
        
        chr2 = fission.iloc[pos :]
        chr2['Chromosome'] = f'{A}_2'
        
        # Remove the fission chromosome from the genome
        genome = pd.concat([genome, chr1, chr2])
        genome = genome[genome.Chromosome != A]
        
        log = f'Fission of ancestral chromosome AncChr{A} into Chr{A}_1, Chr{A}_2'
        
        return genome, log, chr

    def translocation(genome, chr):
        # Randomly select two chromosomes for translocation
        A = random.choice(chr)
        B = random.choice(chr)
        
        if A == B: # Just so the same chromosome isn't selected twice
            B = random.choice(chr)
        
        chr = [x for x in chr if x not in (A, B)]
        
        chrA = genome.loc[genome['Chromosome'] == A]
        chrB = genome.loc[genome['Chromosome'] == B]
        
        # Randomly select two break point positions
        posA = random.choice(range(1, Ngene))
        posB = random.choice(range(1, Ngene))
        
        # Join the fragments to form recombinant chromosomes
        chr1 = pd.concat([chrA.iloc[: posA], chrB.iloc[posB :]])
        chr1['Chromosome'] = f'{A};{B}'
        chr2 = pd.concat([chrB.iloc[: posB], chrA.iloc[posA :]])
        chr2['Chromosome'] = f'{B};{A}'
        
        # Remove the original chromosomes from the genome
        genome = pd.concat([genome, chr1, chr2]).drop(genome[(genome['Chromosome'] == A) & (genome['Chromosome'] == B)].index)
        
        log = f'Translocation of ancestral chromosomes AncChr{A}, AncChr{B} into Chr{A};{B}, Chr{B};{A}'
        
        return genome, log, chr

    def syntenyloss(genome, chr):
        A = random.choice(chr)
        syn = genome.loc[genome['Chromosome'] == A]
        genome = genome[genome.Chromosome != syn]
        
        chr.remove(A)
        
        # Assign all elements to a random chromosome
        syn['Chromosome'] = random.choices(genome.Chromosome.unique(), k = len(syn))
        
        # Add back into the genome
        genome = pd.concat([genome, syn])
        
        log = f'Synteny loss of AncChr{A}'

        return genome, log, chr

    # Apply macro-rearrangements to the ancestor
    ancestor = makeancestor(Nchr, Ngene)
    chr = ancestor.Chromosome.unique().tolist()
    speciesA = ancestor.copy()

    events = []
    for event in range(Nevents):
        r = np.random.uniform()
        
        if r <= 0.30:
            if len(ancestor) < 2: continue
            speciesA, log, chr = fission(speciesA, chr)
            events.append(log)
        
        elif r <= 0.45:
            speciesA, log, chr = translocation(speciesA, chr)
            events.append(log)
        
        elif r <= 0.70:
            speciesA, log, chr = fusion(speciesA, chr)
            events.append(log)
        
        elif r <= 0.99:
            speciesA, log, chr = fusion(speciesA, chr, mixing = 0.5)
            events.append(log)
            
        else:
            # speciesA, log, chr = syntenyloss(speciesA, chr)
            # events.append(log)
            # print(log)
            continue
        
    ancestor = dummyBED(ancestor, 'anc')
    speciesA = dummyBED(speciesA, 'des')
    orthologs = dummyOrthologs(ancestor)
    
    return ancestor, speciesA, orthologs, events

In [44]:
simancestor, simspeciesA, simorthologs, simevents = simulator()
simdata = Orthoscripts.orthologies(simancestor, simspeciesA, simorthologs)
algoevents = rearrangements(simdata)

In [46]:
simevents

['Fusion of ancestral chromosome AncChr14, AncChr18 into Chr14+18',
 'Fission of ancestral chromosome AncChr10 into Chr10_1, Chr10_2',
 'Fission of ancestral chromosome AncChr12 into Chr12_1, Chr12_2',
 'Fusion of ancestral chromosome AncChr4, AncChr20 into Chr4+20',
 'Translocation of ancestral chromosomes AncChr11, AncChr3 into Chr11;3, Chr3;11',
 'Fission of ancestral chromosome AncChr19 into Chr19_1, Chr19_2',
 'Fusion of ancestral chromosome AncChr9, AncChr15 into Chr9x15',
 'Fusion of ancestral chromosome AncChr6, AncChr13 into Chr6x13',
 'Fusion of ancestral chromosome AncChr17, AncChr7 into Chr17x7',
 'Fusion of ancestral chromosome AncChr1, AncChr5 into Chr1x5']

In [45]:
algoevents

['Fission of ancestral chromosome AncChr10 into Chr10_1, Chr10_2',
 'Fission of ancestral chromosome AncChr12 into Chr12_1, Chr12_2',
 'Fission of ancestral chromosome AncChr19 into Chr19_1, Chr19_2',
 'Fusion of ancestral chromosome Chr14+18 into AncChr14, AncChr18',
 'Fusion of ancestral chromosome Chr17x7 into AncChr17, AncChr7',
 'Fusion of ancestral chromosome Chr1x5 into AncChr1, AncChr5',
 'Fusion of ancestral chromosome Chr4+20 into AncChr20, AncChr4',
 'Fusion of ancestral chromosome Chr6x13 into AncChr13, AncChr6',
 'Fusion of ancestral chromosome Chr9x15 into AncChr15, AncChr9',
 'Translocation of ancestral chromosome AncChr11, AncChr3 into Chr11;3, Chr3;11']

In [47]:
list(set(simevents).difference(algoevents))

['Translocation of ancestral chromosomes AncChr11, AncChr3 into Chr11;3, Chr3;11',
 'Fusion of ancestral chromosome AncChr6, AncChr13 into Chr6x13',
 'Fusion of ancestral chromosome AncChr4, AncChr20 into Chr4+20',
 'Fusion of ancestral chromosome AncChr9, AncChr15 into Chr9x15',
 'Fusion of ancestral chromosome AncChr1, AncChr5 into Chr1x5',
 'Fusion of ancestral chromosome AncChr17, AncChr7 into Chr17x7',
 'Fusion of ancestral chromosome AncChr14, AncChr18 into Chr14+18']

In [19]:
input = 'Simulations/Ancestor_' + str(4) + '.bed'
ancestor = Orthoscripts.readBED(input, 's')
input = 'Simulations/SpeciesA_' + str(4) + '.bed'
speciesA = Orthoscripts.readBED(input, 's')
input = 'Simulations/Ancestor+SpeciesA_' + str(4) + '.txt'
orthos = np.loadtxt(input, dtype = "str")

speciesA

Unnamed: 0,Chromosome,Start,End,Name
0,Chr3,0,5,g_201
1,Chr3,1,6,g_202
2,Chr3,2,7,g_203
3,Chr3,3,8,g_204
4,Chr3,4,9,g_205
...,...,...,...,...
2395,Chr10x2,2395,2400,g_982
2396,Chr10x2,2396,2401,g_943
2397,Chr10x2,2397,2402,g_998
2398,Chr10x2,2398,2403,g_945
