In [1]:
# Project helper module, star-imported. read_snp_pop (called further down) is
# not defined anywhere in this notebook, so it presumably comes from here --
# TODO confirm and switch to an explicit import.
from CB_02704 import *
# NOTE(review): `path` is not referenced by any cell visible in this notebook.
path = "../02704_data"
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def calc_corr(X, Y): #*
    '''Pearson correlation of every row of X against every row of Y.

    X and Y may be masked arrays; 1-D inputs are promoted to a single row.
    For each (row of X, row of Y) pair, only the positions that are unmasked
    in both rows contribute to the sums.

    Returns a tuple (r, N):
      r -- array of shape (rows of X, rows of Y) holding the correlations
      N -- array of the same shape holding the number of positions used
    '''
    # Promote to 2-D so the matrix products below are always defined.
    X = np.ma.atleast_2d(X)
    Y = np.ma.atleast_2d(Y)

    # Integer indicators: 1 where a value is present, 0 where it is masked.
    x_present = np.logical_not(np.ma.getmaskarray(X)) * 1
    y_present = np.logical_not(np.ma.getmaskarray(Y)) * 1

    # Count of jointly present positions for each row pair.
    N = np.dot(x_present, y_present.T)

    # Sums of x and x^2, restricted to positions where the Y row is present.
    y_present_t = y_present.T
    x_total = np.ma.dot(X, y_present_t).filled()
    x_sq_total = np.ma.dot(X**2, y_present_t).filled()

    # Sums of y and y^2, restricted to positions where the X row is present.
    y_t = Y.T
    y_total = np.ma.dot(x_present, y_t).filled()
    y_sq_total = np.ma.dot(x_present, y_t**2).filled()

    # Cross-product sum; multiplying by 1. makes the product floating point.
    xy_total = np.ma.dot(X, y_t * 1.).filled()

    # r = (N*Sxy - Sx*Sy) / sqrt(N*Sxx - Sx^2) / sqrt(N*Syy - Sy^2)
    numerator = N*xy_total - x_total*y_total
    x_scale = np.sqrt(N*x_sq_total - x_total**2)
    y_scale = np.sqrt(N*y_sq_total - y_total**2)
    r = numerator / x_scale / y_scale

    return r, N

In [2]:

# Genotype matrices for the three populations. np.ma.filled replaces any
# masked entries with 0 and hands back a plain array.
ceu_geno = np.ma.filled(np.load("geno_files/ceu_geno.npy"), 0)
yri_geno = np.ma.filled(np.load("geno_files/yri_geno.npy"), 0)
asw_geno = np.ma.filled(np.load("geno_files/asw_geno.npy"), 0)

# Simulated genotypes, copied into a 64-bit integer array.
sim_geno = np.array(np.load("geno_files/sim_geno.npy"), dtype=np.int64)

# SNP table for CEU; its index as returned by read_snp_pop is kept as snp_ids.
ceu_snp = read_snp_pop("ceu")
snp_ids = ceu_snp.index

In [4]:
# Correlation of each CEU genotype row with the row directly before it.
# The first row has no predecessor, so its slot is seeded with 0.
r_vals = [0]
for earlier_row, later_row in zip(ceu_geno[:-1], ceu_geno[1:]):
    pair_r, _ = calc_corr(earlier_row, later_row)
    # calc_corr returns a 1x1 matrix for two single rows; keep the scalar.
    r_vals.append(pair_r[0][0])

In [5]:
# Magnitude of each adjacent-row correlation, as a numpy array.
r_vals_abs = np.abs(np.array(r_vals))

In [4]:

# Relabel the SNP table 0..n-1 so that row labels coincide with row positions.
# snp_ids was taken from the index before this point, so it still holds the
# labels the table was loaded with.
ceu_snp.index = np.arange(len(ceu_snp))

In [7]:
# Genetic map for the SNP rows that come before the first row whose
# chromosome equals 2 (all rows if there is no such row). Every genetic
# distance is written as the constant 0.
#
# Columns are read directly rather than through DataFrame.iterrows(): iterrows
# builds a Series per row, which is slow and does not preserve per-column
# dtypes.
is_chr2 = (ceu_snp["chromosome"] == 2).to_numpy()
chr2_rows = np.flatnonzero(is_chr2)
n_leading = int(chr2_rows[0]) if chr2_rows.size else len(ceu_snp)
leading_snps = ceu_snp.iloc[:n_leading]

hm3_genetic_map = {
    "CHROM": leading_snps["chromosome"].tolist(),
    "POS": leading_snps["position"].tolist(),
    "GENETIC_DIST": [0] * n_leading,
}

# Tab-separated file with a header row and no index column.
hm3_genetic_map_df = pd.DataFrame(hm3_genetic_map)
hm3_genetic_map_df.to_csv("genetic_map.txt", sep="\t",index=False)

### GENERATING VCF FILE FOR STACKED CEU AND YRI GENOTYPE DATA

In [None]:
# Genotype code -> VCF GT field. A code of 1 is always written as '1|0';
# the phase is fixed here, not derived from the data.
GT_BY_DOSAGE = {0: '0|0', 1: '1|0', 2: '1|1'}

header = "##fileformat=VCFv4.2\n"
header += "##fileDate=20241120\n"
header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
header += "##INFO=<ID=H3,Number=0,Type=Flag,Description=\"HapMap3 membership\">\n"
header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"

# One sample column per genotype column: CEU columns first, then YRI.
N = np.shape(ceu_geno)[1] + np.shape(yri_geno)[1]
header += "".join(f"\tR_SAMPLE{k + 1}" for k in range(N))

stacked_geno = np.ma.hstack((ceu_geno, yri_geno))

for i in range(1, 23):
    print(f"Generating VCF file for chromosome {i}")

    # Row positions of this chromosome's SNPs. Positions (not index labels)
    # are used so the lookups below do not depend on how ceu_snp is indexed.
    snp_rows = np.flatnonzero((ceu_snp["chromosome"] == i).to_numpy())
    chrom_snps = ceu_snp.iloc[snp_rows]

    records = []
    prev_pos = -1
    for index, chrom, pos, ref, alt in zip(snp_rows,
                                           chrom_snps["chromosome"],
                                           chrom_snps["position"],
                                           chrom_snps["ref"],
                                           chrom_snps["alt"]):
        # Skip a SNP that repeats the position of the last record written.
        if int(pos) == prev_pos:
            continue

        # Code 9 is recoded to 0. row_geno is a view, so this also updates
        # stacked_geno itself.
        row_geno = stacked_geno[index]
        row_geno[np.where(row_geno == 9)[0]] = 0

        # Any other code would leave a sample without a GT field and shift
        # every later column, so stop instead of writing a misaligned line.
        dosages = row_geno.tolist()
        unexpected = sorted({repr(d) for d in dosages if d not in GT_BY_DOSAGE})
        if unexpected:
            raise ValueError(
                f"SNP {snp_ids[index]} (row {index}) has genotype codes "
                f"outside 0/1/2/9: {', '.join(unexpected)}"
            )
        genotypes = '\t'.join(GT_BY_DOSAGE[d] for d in dosages)

        records.append(
            f"\n{chrom}\t{pos}\t{snp_ids[index]}\t{ref}\t{alt}"
            f"\t100\tPASS\tH3\tGT\t{genotypes}"
        )
        prev_pos = int(pos)

    # Each record starts with its own newline; the file ends with one more.
    with open(f'vcf_files/chromosome{i}/ceu_yri_geno.vcf', 'w') as f:
        f.write(header + "".join(records) + "\n")

### GENERATING SAMPLE MAP FOR CEU AND YRI DATA

In [9]:
# Sample map: one "<sample name>\t<population>" line per sample, with the
# sample columns ordered CEU first, then YRI. Shapes are read before the file
# is opened so a failure here does not leave a truncated file behind.
n_ceu = np.shape(ceu_geno)[1]
N = n_ceu + np.shape(yri_geno)[1]

with open("ceu_yri_sample.txt", "w") as f:
    for i in range(N):
        population = "CEU" if i < n_ceu else "YRI"
        f.write(f"R_SAMPLE{i+1}\t{population}\n")

### GENERATING VCF FILE FOR ASW GENOTYPE DATA

In [None]:
# Genotype code -> VCF GT field. A code of 1 is always written as '1|0';
# the phase is fixed here, not derived from the data.
GT_BY_DOSAGE = {0: '0|0', 1: '1|0', 2: '1|1'}

header = "##fileformat=VCFv4.2\n"
header += "##fileDate=20241120\n"
header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
header += "##INFO=<ID=H3,Number=0,Type=Flag,Description=\"HapMap3 membership\">\n"
header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"

# One sample column per ASW genotype column.
N = np.shape(asw_geno)[1]
header += "".join(f"\tQ_SAMPLE{k + 1}" for k in range(N))

for i in range(1, 23):
    print(f"Generating VCF file for chromosome {i}")

    # Row positions of this chromosome's SNPs. Positions (not index labels)
    # are used so the lookups below do not depend on how ceu_snp is indexed.
    snp_rows = np.flatnonzero((ceu_snp["chromosome"] == i).to_numpy())
    chrom_snps = ceu_snp.iloc[snp_rows]

    records = []
    prev_pos = -1
    for index, chrom, pos, ref, alt in zip(snp_rows,
                                           chrom_snps["chromosome"],
                                           chrom_snps["position"],
                                           chrom_snps["ref"],
                                           chrom_snps["alt"]):
        # Skip a SNP that repeats the position of the last record written.
        if int(pos) == prev_pos:
            continue

        # Code 9 is recoded to 0. row_geno is a view, so this also updates
        # asw_geno itself.
        row_geno = asw_geno[index]
        row_geno[np.where(row_geno == 9)[0]] = 0

        # Any other code would leave a sample without a GT field and shift
        # every later column, so stop instead of writing a misaligned line.
        dosages = row_geno.tolist()
        unexpected = sorted({repr(d) for d in dosages if d not in GT_BY_DOSAGE})
        if unexpected:
            raise ValueError(
                f"SNP {snp_ids[index]} (row {index}) has genotype codes "
                f"outside 0/1/2/9: {', '.join(unexpected)}"
            )
        genotypes = '\t'.join(GT_BY_DOSAGE[d] for d in dosages)

        records.append(
            f"\n{chrom}\t{pos}\t{snp_ids[index]}\t{ref}\t{alt}"
            f"\t100\tPASS\tH3\tGT\t{genotypes}"
        )
        prev_pos = int(pos)

    # Each record starts with its own newline; the file ends with one more.
    with open(f'vcf_files/chromosome{i}/asw_geno.vcf', 'w') as f:
        f.write(header + "".join(records) + "\n")

In [None]:
# VCF files for the simulated genotypes (sim_geno), one file per chromosome.

# Genotype code -> VCF GT field. A code of 1 is always written as '1|0';
# the phase is fixed here, not derived from the data.
GT_BY_DOSAGE = {0: '0|0', 1: '1|0', 2: '1|1'}

header = "##fileformat=VCFv4.2\n"
header += "##fileDate=20241120\n"
header += "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
header += "##INFO=<ID=H3,Number=0,Type=Flag,Description=\"HapMap3 membership\">\n"
header += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"

# One sample column per simulated genotype column.
N = np.shape(sim_geno)[1]
header += "".join(f"\tQ_SAMPLE{k + 1}" for k in range(N))

for i in range(1, 23):
    print(f"Generating VCF file for chromosome {i}")

    # Row positions of this chromosome's SNPs. Positions (not index labels)
    # are used so the lookups below do not depend on how ceu_snp is indexed.
    snp_rows = np.flatnonzero((ceu_snp["chromosome"] == i).to_numpy())
    chrom_snps = ceu_snp.iloc[snp_rows]

    records = []
    prev_pos = -1
    for index, chrom, pos, ref, alt in zip(snp_rows,
                                           chrom_snps["chromosome"],
                                           chrom_snps["position"],
                                           chrom_snps["ref"],
                                           chrom_snps["alt"]):
        # Skip a SNP that repeats the position of the last record written.
        if int(pos) == prev_pos:
            continue

        # Code 9 is recoded to 0. row_geno is a view, so this also updates
        # sim_geno itself.
        row_geno = sim_geno[index]
        row_geno[np.where(row_geno == 9)[0]] = 0

        # Any other code would leave a sample without a GT field and shift
        # every later column, so stop instead of writing a misaligned line.
        dosages = row_geno.tolist()
        unexpected = sorted({repr(d) for d in dosages if d not in GT_BY_DOSAGE})
        if unexpected:
            raise ValueError(
                f"SNP {snp_ids[index]} (row {index}) has genotype codes "
                f"outside 0/1/2/9: {', '.join(unexpected)}"
            )
        genotypes = '\t'.join(GT_BY_DOSAGE[d] for d in dosages)

        records.append(
            f"\n{chrom}\t{pos}\t{snp_ids[index]}\t{ref}\t{alt}"
            f"\t100\tPASS\tH3\tGT\t{genotypes}"
        )
        prev_pos = int(pos)

    # Each record starts with its own newline; the file ends with one more.
    with open(f'vcf_files/chromosome{i}/sim_geno.vcf', 'w') as f:
        f.write(header + "".join(records) + "\n")