# Data Preparation

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import warnings

from pysam import VariantFile

# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed for the rest of the session.
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Input files, given relative to the notebook's working directory.
# (The names end in `_dir`, but each one points at a single file.)
superpopulation_dir = "./data/superpopulation_data.tsv"
panel_dir = "./data/phase1_integrated_calls.20101123.ALL.panel"
vcf_dir = "./data/ALL.chr21.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"

# Sampling parameters for the pass over the VCF:
#   skip_every  - keep one record out of every `skip_every`
#   early_end   - record count after which the pass stops
#   total_count - presumably the number of records in the VCF; TODO confirm
skip_every = 25
early_end = 494328
total_count = 494328

# Lookup table: "Population Code" -> "Super Population", read from the TSV.
superpopulation_df = pd.read_csv(superpopulation_dir, sep="\t")
superpopulation_map = (
    superpopulation_df
    .set_index("Population Code")["Super Population"]
    .to_dict()
)

# Open the VCF and take the sample names from its header, in header order.
vcf_file = VariantFile(vcf_dir)
samples = [sample_name for sample_name in vcf_file.header.samples]

In [3]:
# Walk the VCF in file order and keep every `skip_every`-th record. For each
# kept record, store its identifier in `variant_ids` and the per-sample allele
# indices (one tuple per entry of `samples`) in `genotypes`.
counter = 0  # stays 0 if the file yields no records
variant_ids = []
genotypes = []
for counter, record in enumerate(tqdm(vcf_file.fetch(), total = early_end), start = 1):
    if counter % skip_every == 0:
        # Look samples up by name so each row lines up with `samples`.
        genotypes.append([record.samples[x].allele_indices for x in samples])

        # pysam reports a record without an ID as None; fall back to a
        # "chrom:pos" label so every kept variant gets a usable column name.
        if record.id is not None:
            variant_ids.append(record.id)
        else:
            variant_ids.append(f"{record.chrom}:{record.pos}")

    # Stop once exactly `early_end` records have been read. The previous
    # `counter > early_end` test only fired after record `early_end + 1`
    # had already been read (and possibly collected).
    if counter >= early_end:
        break

  0%|          | 0/494328 [00:00<?, ?it/s]

In [4]:
# Build a lookup from the first tab-separated column of the panel file to the
# second one (used later to map each sample to its population code).
labels = {}
with open(panel_dir) as panel_file:
    for raw_line in panel_file:
        fields = raw_line.strip().split('\t')

        # A blank or one-column line has no second field; skip it instead of
        # failing with IndexError.
        if len(fields) < 2:
            continue

        labels[fields[0]] = fields[1]

In [5]:
# `genotypes` is indexed as [kept variant][sample][allele]. Swap the first two
# axes so samples come first, then add up the allele indices of each call,
# leaving one integer per (sample, variant).
# NOTE(review): this assumes every call has the same number of alleles and no
# missing (None) entries - confirm for inputs other than this VCF.
allele_index_array = np.array(genotypes)
genotypes_repr = np.transpose(allele_index_array, (1, 0, 2)).sum(axis = 2)

In [6]:
# One row per sample and one column per kept variant; the sample names start
# out as the index and are then moved into an ordinary "sample" column.
genotype_table = pd.DataFrame(
    data = genotypes_repr,
    index = samples,
    columns = variant_ids,
)
df = genotype_table.reset_index().rename(columns = {"index": "sample"})

# Annotate each sample with its population code, then translate that code to
# a super population. Keys missing from either lookup come out as NaN.
df["population_code"] = df["sample"].map(labels)
df["superpopulation_code"] = df["population_code"].map(superpopulation_map)

df

Unnamed: 0,sample,rs190723053,rs184643004,rs184107193,rs78466144,rs200750454,rs149876288,rs201969461,rs140138610,rs77203822,...,rs115279472,rs400140,rs28637180,rs140094480,rs114462634,rs111704908,rs114631634,rs141938398,population_code,superpopulation_code
0,HG00096,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,GBR,EUR
1,HG00097,0,2,0,2,2,2,0,1,1,...,0,1,0,0,0,0,0,0,GBR,EUR
2,HG00099,0,2,0,1,1,1,2,2,2,...,0,2,0,0,0,0,0,0,GBR,EUR
3,HG00100,0,0,0,0,0,2,1,0,1,...,0,1,0,0,0,0,0,0,GBR,EUR
4,HG00101,0,1,0,1,1,1,2,1,2,...,0,1,0,0,0,0,0,0,GBR,EUR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087,NA20816,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,TSI,EUR
1088,NA20818,0,0,0,1,0,2,1,0,2,...,0,0,0,0,0,0,0,0,TSI,EUR
1089,NA20819,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,TSI,EUR
1090,NA20826,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,TSI,EUR


In [7]:
# Persist the assembled table as CSV; the positional row index is not written.
df.to_csv(path_or_buf = "./data/clean_data.csv", index = False)