In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects
tqdm.pandas()

In [2]:
from pandarallel import pandarallel
# start pandarallel with 6 workers and a progress bar for each parallel_* call
pandarallel.initialize(
    nb_workers=6,
    progress_bar=True,
)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
### load the tab-separated cohort genotype table - contains data of all samples
df = pd.read_csv(
    "../../WGS/vcf_cohorts/cohort_genotype_vcf_snps.tsv",
    sep="\t",
)

In [4]:
# preview the first rows of the table as loaded (sample columns still carry their raw ids)
df.head()

Unnamed: 0,CHROM,POS,TYPE,REF,ALT,H5YTLCCXY-4_S0_L004.GT,H5YTLCCXY-5_S0_L005.GT,H5YTLCCXY-6_S0_L006.GT,H5YTLCCXY-7_S0_L007.GT,H5YTLCCXY-8_S0_L008.GT,H735GCCXY-1_S0_L001.GT,H735GCCXY-2_S0_L002.GT,H735GCCXY-3_S0_L003.GT,H735GCCXY-4_S0_L004.GT,H7WM2CCXY-2_S0_L002.GT,H7WM2CCXY-3_S0_L003.GT,H7WM2CCXY-8_S0_L008.GT
0,NC_044976.1,48,SNP,C,G,C/C,C/G,C/C,C/C,C/C,C/C,C/C,C/G,C/C,C/C,C/C,C/G
1,NC_044976.1,92,SNP,G,A,G/G,G/A,G/G,G/G,G/G,G/G,G/G,G/A,G/G,G/G,G/G,G/A
2,NC_044976.1,101,SNP,T,C,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/C,T/T,T/T,T/T,T/T
3,NC_044976.1,148,SNP,C,A,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/A,C/C,C/C,C/C,C/A
4,NC_044976.1,188,SNP,T,C,T|C,T|C,T/T,T/T,T/T,T/T,T/T,T|C,T/T,T/T,T/T,T|C


In [5]:
## Build the raw-id -> sample-name lookup from the mapping CSV.
## Each line is read as "<sample name>,<raw id>,...": the second field becomes
## the key and the first field the value.
sample_map = {}
with open("../../sample_name_mapping.csv", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        fields = line.split(",")
        sample_map[fields[1]] = fields[0]

In [6]:
## old column name -> sample name, for every column whose name starts with "H";
## the lookup key is the part of the column name before the first "_S"
cols = {
    column: sample_map[column.split("_S")[0]]
    for column in df.columns
    if column.startswith("H")
}

In [7]:
## give the genotype columns their sample names (modifies df in place)
df.rename(cols, axis="columns", inplace=True)

In [8]:
## read in refseqid to chromosome name mapping
## Each line is read as "<chromosome name>\t<refseq id>...": the second field
## becomes the key and the first field the value.
chr_name_map = {}
with open("../../reseq_to_chr_names.txt", 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        fields = line.split("\t")
        chr_name_map[fields[1]] = fields[0]

In [9]:
## keep only the rows whose CHROM value is a key of chr_name_map
df = df.loc[df['CHROM'].isin(chr_name_map)]

In [10]:
## keep the original sequence id under `refseqid` and store the mapped chromosome name in `CHROM`
if "refseqid" not in df.columns:
    # guard: without it, re-running this cell would rename the new `CHROM` column
    # as well and leave two columns called `refseqid`
    df.rename(columns={"CHROM":"refseqid"}, inplace=True)
# vectorised dictionary lookup instead of a Python-level lambda per row;
# ids that are not keys of chr_name_map would become NaN, so this expects the
# rows to have been restricted to mapped ids beforehand
df['CHROM'] = df['refseqid'].map(chr_name_map)

In [11]:
# (rows, columns) of the table at this point
df.shape

(17046278, 18)

In [12]:
# preview again: sample columns are now named ZT*, and `refseqid` / `CHROM` are present
df.head()

Unnamed: 0,refseqid,POS,TYPE,REF,ALT,ZT8,ZT14,ZT0,ZT16,ZT2,ZT12,ZT18,ZT4,ZT20,ZT6,ZT22,ZT10,CHROM
0,NC_044976.1,48,SNP,C,G,C/C,C/G,C/C,C/C,C/C,C/C,C/C,C/G,C/C,C/C,C/C,C/G,1
1,NC_044976.1,92,SNP,G,A,G/G,G/A,G/G,G/G,G/G,G/G,G/G,G/A,G/G,G/G,G/G,G/A,1
2,NC_044976.1,101,SNP,T,C,T/T,T/T,T/T,T/T,T/T,T/T,T/T,T/C,T/T,T/T,T/T,T/T,1
3,NC_044976.1,148,SNP,C,A,C/C,C/C,C/C,C/C,C/C,C/C,C/C,C/A,C/C,C/C,C/C,C/A,1
4,NC_044976.1,188,SNP,T,C,T|C,T|C,T/T,T/T,T/T,T/T,T/T,T|C,T/T,T/T,T/T,T|C,1


### Now drop unnecessary columns to make the df lighter to work with

In [13]:
# Sample columns: the even time points ZT0, ZT2, ..., ZT22.
samples = [f"ZT{i}" for i in range(0, 24, 2)]


def count_alleles(x, ignore=()):
    """Count the distinct allele tokens called across all samples at one site.

    Parameters
    ----------
    x : pandas.Series
        One row of the genotype table. It must contain every label listed in
        ``samples``, each holding a genotype string such as ``"C/G"`` or
        ``"T|C"``.
    ignore : iterable of str, optional
        Tokens to leave out of the count. Empty by default, so every token
        found in the genotype strings is counted.

    Returns
    -------
    int
        Number of distinct tokens (after removing ``ignore``) over all samples.

    Notes
    -----
    NOTE(review): the notebook's value counts include sites with 5 and 6
    distinct tokens, more than the four bases, so placeholder tokens are
    presumably being counted as alleles - confirm, and pass them via
    ``ignore`` if they should not count.
    """
    alleles = set()
    for genotype in x[samples]:
        # phased ("|") and unphased ("/") separators are treated alike
        alleles.update(genotype.replace("|", "/").split("/"))
    return len(alleles.difference(ignore))

In [14]:
# count_alleles only reads the sample columns, so hand the workers just those
# columns instead of every column of each row; the result is aligned back to df by index
df['#alleles'] = df[samples].parallel_apply(count_alleles, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2841047), Label(value='0 / 2841047…

In [15]:
# quick check that the new `#alleles` column is present
df.head(2)

Unnamed: 0,refseqid,POS,TYPE,REF,ALT,ZT8,ZT14,ZT0,ZT16,ZT2,ZT12,ZT18,ZT4,ZT20,ZT6,ZT22,ZT10,CHROM,#alleles
0,NC_044976.1,48,SNP,C,G,C/C,C/G,C/C,C/C,C/C,C/C,C/C,C/G,C/C,C/C,C/C,C/G,1,2
1,NC_044976.1,92,SNP,G,A,G/G,G/A,G/G,G/G,G/G,G/G,G/G,G/A,G/G,G/G,G/G,G/A,1,2


In [16]:
# number of sites for each distinct-allele count
# NOTE(review): counts above 4 exceed the four bases, so non-base tokens are
# presumably being counted by count_alleles - confirm before filtering on this column
df['#alleles'].value_counts()

2    16182685
3      537393
1      282303
4       43373
5         516
6           8
Name: #alleles, dtype: int64