In [80]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import msprime, collections, tskit
import tempfile
import scipy.stats

from Bio import Phylo, AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Phylo.Consensus import *

# Incomplete lineage sorting (ILS)

In [81]:
def ils_quartet(sequences):
    """Count the three two-by-two site patterns in a quartet of sequences.

    Every position (taken from the length of the first sequence) is
    inspected across the four sequences. A position contributes to a
    pattern only when the four characters split into two pairs of equal
    characters and the two pairs differ from each other:

    * ``xxyy`` - sequences 0 and 1 agree, sequences 2 and 3 agree
    * ``xyxy`` - sequences 0 and 2 agree, sequences 1 and 3 agree
    * ``xyyx`` - sequences 0 and 3 agree, sequences 1 and 2 agree

    Parameters
    ----------
    sequences : sequence of four indexable sequences (e.g. strings)
        All four are indexed up to ``len(sequences[0])``.

    Returns
    -------
    list of int
        ``[n_xxyy, n_xyxy, n_xyyx]``.

    Raises
    ------
    AssertionError
        If ``sequences`` does not contain exactly four entries.
    """
    assert len(sequences) == 4

    n_xxyy = 0
    n_xyxy = 0
    n_xyyx = 0

    for pos in range(len(sequences[0])):
        column = [haplotype[pos] for haplotype in sequences]
        a, b, c, d = column[0], column[1], column[2], column[3]

        if a == b:
            # With a == b only the xxyy grouping can still apply
            if c == d and a != c:
                n_xxyy += 1
        elif a == c:
            # Here a != b, so matching b with d completes xyxy
            if b == d:
                n_xyxy += 1
        elif a == d:
            # Here a != b, so matching b with c completes xyyx
            if b == c:
                n_xyyx += 1

    return [n_xxyy, n_xyxy, n_xyyx]

# Simulate without migration

In [82]:
# Species tree in Newick format; branch lengths are interpreted with the
# time_units passed to from_species_tree below ("myr").
great_apes_species_tree = "(((human:6, chimp:6):4,gorilla:10):6,orang:16)"

# Population sizes keyed by population name. Any name that is not listed
# explicitly (here: "orang" and whatever other populations the species tree
# gives rise to) falls back to 50000 through the default factory.
great_apes_initial_size = collections.defaultdict(
    lambda: 50000,
    human=20000,
    chimp=50000,
    gorilla=50000,
)

# generation_time=20 is passed together with time_units="myr" so that the
# branch lengths can be converted to generations.
great_apes_demography = msprime.Demography.from_species_tree(
    great_apes_species_tree,
    great_apes_initial_size,
    time_units="myr",
    generation_time=20,
)

In [83]:
# Simulate one haploid sample from each of the four species under the
# no-migration demography, then overlay Jukes-Cantor mutations.
#
# Both steps are stochastic. A fixed random_seed is passed to each call so
# that re-running the notebook from a fresh kernel reproduces the same tree
# sequence (and therefore the same site-pattern counts downstream).
ts = msprime.sim_ancestry(
    samples={"human": 1, "chimp": 1, "gorilla": 1, "orang": 1},
    ploidy=1,
    demography=great_apes_demography,
    sequence_length=int(1e7),
    discrete_genome=True,
    recombination_rate=1e-8,
    random_seed=42,
)

ts = msprime.sim_mutations(
    tree_sequence=ts,
    rate=1e-8,
    model="jc69",
    discrete_genome=True,
    random_seed=42,
)

In [84]:
# Show the notebook's rich summary of the simulated tree sequence
ts

Tree Sequence,Unnamed: 1
Trees,37209
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,4
Total Size,19.6 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,99370,3.0 MiB,
Individuals,4,136 Bytes,
Migrations,0,8 Bytes,
Mutations,258928,9.1 MiB,
Nodes,21897,598.8 KiB,
Populations,7,472 Bytes,✅
Provenances,2,3.6 KiB,
Sites,255602,6.1 MiB,


In [20]:
# Materialise the haplotypes into a list so that ils_quartet can take len()
# of it and index into it.
# NOTE(review): presumably one string per sample, in sample order
# (human, chimp, gorilla, orang) - confirm against the tskit docs.
sequences = list(ts.haplotypes())

# Simulate with migration

In [131]:
# Same species tree and population sizes as the no-migration model; a fresh
# Demography object is built so that the event added below does not touch
# great_apes_demography.
migration_great_apes_demography = msprime.Demography.from_species_tree(
    great_apes_species_tree,
    great_apes_initial_size,
    time_units="myr",
    generation_time=20,
)

# Single mass-migration event at 3e5 / 20 = 15000 time units, moving 10% of
# lineages from "human" to "gorilla".
# NOTE(review): msprime specifies events backwards in time, so forwards in
# time this presumably corresponds to gene flow from gorilla into human -
# confirm against the msprime demography documentation.
pulse_time = 3e5 / 20
migration_great_apes_demography.add_mass_migration(
    time=pulse_time,
    source="human",
    dest="gorilla",
    proportion=0.1,
)

# The event was appended after the species-tree events were created, so the
# event list is re-sorted before simulating.
migration_great_apes_demography.sort_events()



In [132]:
# Simulate the same four haploid samples under the demography that includes
# the human/gorilla mass migration, recording migration events, and then
# overlay Jukes-Cantor mutations.
#
# Both steps are stochastic. A fixed random_seed is passed to each call so
# that re-running the notebook from a fresh kernel reproduces the same tree
# sequence (and therefore the same site-pattern counts downstream).
#
# NOTE(review): sequence_length here is 1e6, ten times shorter than the 1e7
# used for the no-migration simulation - confirm this is intentional.
mts = msprime.sim_ancestry(
    samples={"human": 1, "chimp": 1, "gorilla": 1, "orang": 1},
    ploidy=1,
    demography=migration_great_apes_demography,
    sequence_length=int(1e6),
    discrete_genome=True,
    recombination_rate=1e-8,
    record_migrations=True,
    random_seed=42,
)

mts = msprime.sim_mutations(
    tree_sequence=mts,
    rate=1e-8,
    model="jc69",
    discrete_genome=True,
    random_seed=42,
)

In [133]:
# Number of migration records stored in the tree sequence
mts.num_migrations

5141

In [134]:
# Materialise the haplotypes of the migration simulation into a list so that
# ils_quartet can take len() of it and index into it.
# NOTE(review): presumably one string per sample, in sample order
# (human, chimp, gorilla, orang) - confirm against the tskit docs.
migrated_sequences = list(mts.haplotypes())

# ABBA-BABA

In [135]:
# Site-pattern counts for the no-migration simulation, ordered
# [xxyy, xyxy, xyyx].
patterns = ils_quartet(sequences)
print(patterns)

# Two-sided binomial test (default p=0.5) of the xyxy count against the
# combined xyxy + xyyx count.
print(scipy.stats.binomtest(k=patterns[1], n=sum(patterns[1:3])))

[19764, 201, 228]
BinomTestResult(k=201, n=429, alternative='two-sided', statistic=0.46853146853146854, pvalue=0.2093220851757111)


In [136]:
# Site-pattern counts for the simulation with migration, ordered
# [xxyy, xyxy, xyyx].
patterns = ils_quartet(migrated_sequences)
print(patterns)

# Two-sided binomial test (default p=0.5) of the xyxy count against the
# combined xyxy + xyyx count.
print(scipy.stats.binomtest(k=patterns[1], n=sum(patterns[1:3])))

[1721, 596, 25]
BinomTestResult(k=596, n=621, alternative='two-sided', statistic=0.9597423510466989, pvalue=6.36754487974895e-143)
