In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pysam
from tqdm import tqdm

In [None]:
def _parse_sample_label(label):
    """Split a ``<generation>.<rep>_<treatment>`` label into a list of its component strings."""
    pieces = label.split("_")
    return pieces[0].split(".") + [pieces[1]]


data = pd.read_excel("./data/genomes/metadata_whole_genome.xlsx", engine='openpyxl')
data = data.drop(columns=["Generation", "rep"])

# The sample named "RMF27" does not follow the <generation>.<rep>_<treatment> pattern;
# relabel it "0.0_-" so it parses to generation "0", rep "0", treatment "-".
is_rmf27 = data["samples.1"] == "RMF27"
data.loc[is_rmf27, "samples.1"] = "0.0_-"

# One row of [generation, rep, treatment] strings per metadata row, in the same order.
sample_details = pd.DataFrame(
    [_parse_sample_label(label) for label in data["samples.1"]],
    columns=["generation", "rep", "treatment"],
)
data = pd.concat([data, sample_details], axis=1)
data.head()

In [None]:
# Directory whose entries are opened as VCF files further down the notebook.
VCF_DIR = "./results/vcf/"
# os.listdir returns entries in arbitrary order; sort them so the sample order
# (and therefore the row/column order of anything built from it) is reproducible.
files = sorted(os.listdir(VCF_DIR))

In [None]:
# File names are strings, so the BGI_ID column must be string-typed to compare against them.
data = data.astype({"BGI_ID": str})

In [None]:
# Maps a sample's (generation, rep, treatment) tuple to the set of
# (chromosome, position) pairs found in that sample's VCF file.
variant_dict = {}

for file in files:

    bgi_id = file.replace(".vcf.gz", "")

    # Keep only files whose BGI id matches exactly one metadata row.
    sample_info = data.loc[data.BGI_ID == bgi_id, ["generation", "rep", "treatment"]]
    if len(sample_info) != 1:
        continue
    sample_info = tuple(sample_info.iloc[0].to_list())

    # Best effort: an unreadable file is reported and skipped rather than aborting
    # the whole scan. `Exception` (not a bare except) lets Ctrl-C still interrupt.
    try:
        vcf_file = pysam.VariantFile(os.path.join(VCF_DIR, file))
    except Exception as err:
        print(f"Skipping {file}: could not open as VCF ({err})")
        continue

    # The context manager closes the file handle once its records have been read.
    with vcf_file:
        variant_dict[sample_info] = {(record.chrom, record.pos) for record in vcf_file.fetch()}

Compute the Jaccard similarity (size of the intersection divided by size of the union) of the variant-position sets for every pair of samples:

In [None]:
def _jaccard(set_a, set_b):
    """Return |A ∩ B| / |A ∪ B|; two empty sets are treated as identical (1.0)."""
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Avoids ZeroDivisionError for samples without any variant and keeps
        # the diagonal of the matrix equal to 1.
        return 1.0
    return len(set_a & set_b) / union_size


# One label per sample, e.g. "71_1_MS", in the iteration order of variant_dict.
colnames = ["_".join(k) for k in variant_dict]

# intersection[i][j] is the Jaccard similarity between sample i and sample j.
intersection = [
    [_jaccard(variants_i, variants_j) for variants_j in variant_dict.values()]
    for variants_i in tqdm(variant_dict.values())
]

# The reshape keeps the array two-dimensional even when no sample was loaded.
intersection_df = pd.DataFrame(
    np.array(intersection, dtype=float).reshape(len(colnames), len(colnames)),
    index=colnames,
    columns=colnames,
)

In [None]:
# Rank every sample by the value in the '71_1_MS' column, highest first.
intersection_df.loc[:, '71_1_MS'].sort_values(ascending=False)

In [None]:
from scipy.cluster.hierarchy import linkage, leaves_list

# Average-linkage hierarchical clustering in which every row of the similarity
# matrix is one observation vector (Euclidean distance, scipy's default metric).
corr_linkage = linkage(intersection_df, method='average', metric='euclidean')

# Leaf order of the dendrogram, applied to rows and then to columns so that
# samples clustered together end up adjacent.
idx = leaves_list(corr_linkage)
ordered_matrix = intersection_df.iloc[idx].iloc[:, idx]

In [None]:
# Rank every sample by the value in the "0_0_-" column, highest first.
ordered_matrix.loc[:, "0_0_-"].sort_values(ascending=False)

In [None]:
# Reference sample and the width (in generations) of the window plotted around it.
rep = 1
gen_ref = 49
WINDOW_LEN = 60
treatment = "MS"
label_ref = f"{gen_ref}_{rep}_{treatment}"

# Labels for the generations in [gen_ref - WINDOW_LEN//2, gen_ref + WINDOW_LEN//2),
# kept in increasing generation order and restricted to samples present in the matrix.
# (Sorting the label strings would be lexicographic and put e.g. "100_..." before "19_...".)
window_labels = (
    f"{gen}_{rep}_{treatment}"
    for gen in range(gen_ref - WINDOW_LEN // 2, gen_ref + WINDOW_LEN // 2)
)
kk = [label for label in window_labels if label in ordered_matrix.index]

# Similarity of each sample in the window to the reference sample.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(ordered_matrix.loc[kk, label_ref])
ax.tick_params(axis='x', labelrotation=60)
# Dashed red line marks the reference sample itself.
ax.axvline(x=label_ref, color='red', linestyle='dashed')
ax.set_xlabel("sample (generation_rep_treatment)")
ax.set_ylabel(f"Jaccard similarity to {label_ref}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the cluster-ordered matrix.
# NOTE: the values are the pairwise Jaccard similarities computed earlier,
# not correlations, despite the wording of the title.
heatmap_values = ordered_matrix.to_numpy()

plt.figure(figsize=(8, 6))
plt.imshow(heatmap_values)
plt.title("Ordered Correlation Matrix by Similarity")
plt.show()

In [None]:
# for record in vcf_file.fetch():
#     print(f"Chromosome: {record.chrom}")
#     print(f"Position: {record.pos}")
#     print(f"Reference Allele: {record.ref}")
#     print(f"Alternative Alleles: {record.alts}")
#     print(f"Quality: {record.qual}")
#     print(f"Info: {dict(record.info)}")