In [None]:
import os
import pandas as pd

from jacksonii_analyses import vcf_parser, clustering

# Path to the filtered variant calls (bgzipped VCF, going by the extension);
# this is the input parsed and clustered by the cells below.
vcf_path = "../data/var/filtered_variants.vcf.gz"

In [None]:
# Output directory for everything this notebook writes (BED file and the
# per-locus alignments); created up front so later cells can write into it.
phylo_dir = "../data/phylo"
os.makedirs(phylo_dir, exist_ok=True)

In [None]:
# Development convenience: re-import the local analysis modules so that edits
# to their source files take effect in this kernel without a restart.
from importlib import reload
reload(vcf_parser)
reload(clustering)

In [None]:
# Parse the VCF with vcf_parser.vcf_to_chr_pos_df.
# NOTE(review): judging by the helper's name this yields a DataFrame of
# chromosome/position per variant — confirm against vcf_parser.
data = vcf_parser.vcf_to_chr_pos_df(vcf_path)
data

In [None]:
# Group variants into clusters by distance, using a 500 bp threshold.
distance_threshold_bp = 500
dcluster_data = clustering.cluster_variants_by_distance(
    data,
    distance_threshold_bp,
)
dcluster_data

In [None]:
# Convert the clustered variants into intervals via
# clustering.cluster_data_to_intervals.
dcluster_intervals = clustering.cluster_data_to_intervals(dcluster_data)
# Display only: show the intervals with the most variants first. The sorted
# copy is not stored, so `dcluster_intervals` keeps its original order.
dcluster_intervals.sort_values("n_variants", ascending=False)

Apply filters to clustered variants (i.e., loci).

In [None]:
# Keep only loci with at least 10 variants and a size between 800 and 8000 bp
# (both bounds inclusive).
# At the 10-variant minimum this gives a theta range between 0.0125
# (10 / 800) and 0.00125 (10 / 8000).
has_enough_variants = dcluster_intervals["n_variants"] >= 10
size_in_range = dcluster_intervals["size"].between(800, 8000)
loci_size_filtered = dcluster_intervals[
    has_enough_variants & size_in_range
].reset_index(drop=True)

# Add each locus's distance from the preceding locus, computed separately for
# every chromosome by clustering.distance_for_group.
loci_with_distance = (
    loci_size_filtered
    .groupby("chrom")
    .apply(clustering.distance_for_group, include_groups=True)
    .reset_index(drop=True)
)

# To mitigate issues with linkage disequilibrium, keep only loci that are
# strictly more than 1000 bp from the preceding locus (a distance of exactly
# 1000 bp is dropped). Rows with distance == 0 are kept as well.
# NOTE(review): distance == 0 presumably marks the first locus on each
# chromosome — confirm against clustering.distance_for_group.
is_far_enough = loci_with_distance["distance"] > 1000
is_zero_distance = loci_with_distance["distance"] == 0
dcluster_intervals_filtered = loci_with_distance[
    is_far_enough | is_zero_distance
].reset_index(drop=True)
dcluster_intervals_filtered

In [None]:
# Write the filtered loci as a tab-separated file with no header row and no
# index column; every column of the frame is written, in its current order.
loci_bed_path = "../data/phylo/loci.bed"
dcluster_intervals_filtered.to_csv(
    loci_bed_path,
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Load the table of admixed individuals and collapse its "sample" column into
# a single comma-separated string for use on a command line later on.
admixed_samples = pd.read_csv("../data/var/admixture/admixed_individuals.csv")
admixed_sample_names = admixed_samples["sample"].tolist()
csv_samples = ",".join(admixed_sample_names)
csv_samples

Produce FASTA alignments for all loci.

In [None]:
import subprocess

# Inputs and outputs for the two external tools run below.
ref_path = "../data/genome/reference.fasta.gz"
vcf_path = "../data/var/filtered_variants.vcf.gz"
vcf_clean_path = vcf_path.replace(".vcf.gz", "_clean.vcf.gz")
bed_path = "../data/phylo/loci.bed"

# Step 1: write a compressed, indexed copy of the VCF without the admixed
# samples. In `bcftools view -s`, a leading "^" turns the comma-separated list
# into an exclusion list. The option is only added when there is at least one
# sample to exclude: with an empty `csv_samples` the argument would be a bare
# "^" (an empty exclusion list), so in that case all samples are kept as-is.
bcftools_cmd = ["bcftools", "view"]
if csv_samples:
    bcftools_cmd += ["-s", f"^{csv_samples}"]
bcftools_cmd += [
    "-Oz",
    "-o", vcf_clean_path,
    vcf_path,
    "--write-index",
]
# check=True raises CalledProcessError if bcftools exits non-zero, so the
# second step never runs against a missing or partial VCF.
subprocess.run(bcftools_cmd, check=True)

# Step 2: run vcf2fasta with the reference, the cleaned VCF and the BED file
# of loci, writing its output under ../data/phylo/loci.
vcf2fasta_cmd = [
    "vcf2fasta",
    "--fasta", ref_path,
    "--vcf", vcf_clean_path,
    "--bed", bed_path,
    "--out", "../data/phylo/loci",
    "--force",
]
subprocess.run(vcf2fasta_cmd, check=True)