In [1]:
import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("paper", font_scale=1.5)
sns.set_style("ticks")

In [3]:
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True, nb_workers=6)

#### ReadME
First part of this notebook is concatenating and melting the files from wide to narrow format.
The second part is actual data processing. So go the second part directly.

#### Concatenate and melt the files

In [4]:
# Load the merged WhatsHap table (tab-separated). It is in wide format: one
# <sample>.PS and one <sample>.GT column per sample next to CHROM/POS/REF/ALT.
df = pd.read_csv("../../phasing_12_genomes/whatshap_phased_vcf/merged_all_samples.table", sep="\t")

In [5]:
# Preview the first two rows of the wide table.
df.head(2)

Unnamed: 0,CHROM,POS,REF,ALT,H5YTLCCXY-4_S0_L004.PS,H5YTLCCXY-4_S0_L004.GT,H5YTLCCXY-5_S0_L005.PS,H5YTLCCXY-5_S0_L005.GT,H5YTLCCXY-6_S0_L006.PS,H5YTLCCXY-6_S0_L006.GT,...,H735GCCXY-3_S0_L003.PS,H735GCCXY-3_S0_L003.GT,H735GCCXY-4_S0_L004.PS,H735GCCXY-4_S0_L004.GT,H7WM2CCXY-2_S0_L002.PS,H7WM2CCXY-2_S0_L002.GT,H7WM2CCXY-3_S0_L003.PS,H7WM2CCXY-3_S0_L003.GT,H7WM2CCXY-8_S0_L008.PS,H7WM2CCXY-8_S0_L008.GT
0,NC_044976.1,194,C,T,,./.,,./.,,./.,...,194.0,T|C,,./.,,./.,,./.,,./.
1,NC_044976.1,204,T,A,204.0,T|A,204.0,A|T,,T/A,...,194.0,A|T,204.0,T|A,204.0,A|T,204.0,T|A,204.0,A|T


In [6]:
# Build a lookup from the second CSV field to the first one. The second field is
# used below as the sequencing-run prefix of a column name, and the first field is
# the sample label that replaces it.
sample_map = {}
with open("../../sample_name_mapping.csv", "r") as mapping_file:
    for record in mapping_file:
        # Each record is expected to hold exactly two comma-separated fields.
        sample_label, run_prefix = record.strip().split(",")
        sample_map[run_prefix] = sample_label

In [7]:
# Rename the per-sample columns: the part before "_S0" is looked up in sample_map
# and joined with the field suffix after the last "." (e.g. "PS" or "GT").
# Columns that do not start with "H" are kept unchanged.
cols = []
for column in df.columns:
    if column.startswith("H"):
        run_prefix = column.split("_S0")[0]
        field_suffix = column.split(".")[-1]
        cols.append(sample_map[run_prefix] + "." + field_suffix)
    else:
        cols.append(column)
df.columns = cols

In [8]:
# Preview again to check the renamed per-sample columns.
df.head(2)

Unnamed: 0,CHROM,POS,REF,ALT,ZT8.PS,ZT8.GT,ZT14.PS,ZT14.GT,ZT0.PS,ZT0.GT,...,ZT4.PS,ZT4.GT,ZT20.PS,ZT20.GT,ZT6.PS,ZT6.GT,ZT22.PS,ZT22.GT,ZT10.PS,ZT10.GT
0,NC_044976.1,194,C,T,,./.,,./.,,./.,...,194.0,T|C,,./.,,./.,,./.,,./.
1,NC_044976.1,204,T,A,204.0,T|A,204.0,A|T,,T/A,...,194.0,A|T,204.0,T|A,204.0,A|T,204.0,T|A,204.0,A|T


In [9]:
# (rows, columns) of the wide table.
df.shape

(16118088, 28)

In [10]:
# Genotype columns for the even timepoints ZT0, ZT2, ..., ZT22.
value_vars = [f"ZT{i}.GT" for i in range(0, 23, 2)]

# Wide -> long: one row per (variant, sample); the original column name goes into
# "GT" and the genotype string into "value".
gt = pd.melt(
    df,
    id_vars=['CHROM', 'POS', 'REF', "ALT"],
    value_vars=value_vars,
    var_name="GT",
)

# Variant key of the form "<CHROM>:<POS>".
gt['SNV'] = gt['CHROM'] + ":" + gt['POS'].astype(str)

In [11]:
# Phase-set columns for the even timepoints ZT0, ZT2, ..., ZT22.
value_vars = [f"ZT{i}.PS" for i in range(0, 23, 2)]

# Wide -> long: one row per (variant, sample); the original column name goes into
# "PS" and the phase-set value into "value".
ps = pd.melt(
    df,
    id_vars=['CHROM', 'POS', 'REF', "ALT"],
    value_vars=value_vars,
    var_name="PS",
)

# Variant key of the form "<CHROM>:<POS>".
ps['SNV'] = ps['CHROM'] + ":" + ps['POS'].astype(str)

In [12]:
# Preview the long-format genotype table.
gt.head(2)

Unnamed: 0,CHROM,POS,REF,ALT,GT,value,SNV
0,NC_044976.1,194,C,T,ZT0.GT,./.,NC_044976.1:194
1,NC_044976.1,204,T,A,ZT0.GT,T/A,NC_044976.1:204


In [13]:
# Preview the long-format phase-set table.
ps.head(2)

Unnamed: 0,CHROM,POS,REF,ALT,PS,value,SNV
0,NC_044976.1,194,C,T,ZT0.PS,,NC_044976.1:194
1,NC_044976.1,204,T,A,ZT0.PS,,NC_044976.1:204


In [15]:
# Derive the animal label (the text before the first ".") from the melted column
# name, e.g. "ZT0.PS" -> "ZT0".
# The label is computed once per distinct column name and broadcast with a
# vectorized map, instead of calling a Python lambda on every one of the ~193M rows.
ps['animal'] = ps['PS'].map({name: name.split(".")[0] for name in ps['PS'].unique()})
gt['animal'] = gt['GT'].map({name: name.split(".")[0] for name in gt['GT'].unique()})

  0%|          | 0/193417056 [00:00<?, ?it/s]

  0%|          | 0/193417056 [00:00<?, ?it/s]

In [16]:
# For every animal, build a {SNV: phase-set value} dictionary from the long table.
ps_dict = {}
for animal_id in tqdm(ps['animal'].unique()):
    is_this_animal = ps['animal'].eq(animal_id)
    phase_set_by_snv = ps.loc[is_this_animal, ['SNV', 'value']].set_index('SNV')['value']
    ps_dict[animal_id] = phase_set_by_snv.to_dict()

  0%|          | 0/12 [00:00<?, ?it/s]

In [17]:
# Attach the phase-set value to every genotype row by looking up the row's SNV in
# the dictionary of the row's animal.
# This does one vectorized Series.map per animal instead of a row-wise
# apply(axis=1), which builds a Python Series for each of the ~193M rows.
# Results are written by integer position, so the frame's index does not matter.
# NOTE: an SNV missing from an animal's dictionary yields NaN here, where the
# row-wise lookup raised KeyError; an animal missing from ps_dict still raises KeyError.
ps_values = np.full(len(gt), np.nan)  # assumes phase-set values are numeric (float)
for animal, row_positions in gt.groupby('animal', observed=True).indices.items():
    ps_values[row_positions] = gt['SNV'].iloc[row_positions].map(ps_dict[animal]).to_numpy()
gt['PS'] = ps_values

  0%|          | 0/193417056 [00:00<?, ?it/s]

In [19]:
# Preview the genotype table with the attached animal and PS columns.
gt.head(20)

Unnamed: 0,CHROM,POS,REF,ALT,GT,value,SNV,animal,PS
0,NC_044976.1,194,C,T,ZT0.GT,./.,NC_044976.1:194,ZT0,
1,NC_044976.1,204,T,A,ZT0.GT,T/A,NC_044976.1:204,ZT0,
2,NC_044976.1,222,G,A,ZT0.GT,./.,NC_044976.1:222,ZT0,
3,NC_044976.1,244,G,C,ZT0.GT,./.,NC_044976.1:244,ZT0,
4,NC_044976.1,287,A,C,ZT0.GT,./.,NC_044976.1:287,ZT0,
5,NC_044976.1,296,G,A,ZT0.GT,./.,NC_044976.1:296,ZT0,
6,NC_044976.1,300,T,A,ZT0.GT,./.,NC_044976.1:300,ZT0,
7,NC_044976.1,325,C,T,ZT0.GT,./.,NC_044976.1:325,ZT0,
8,NC_044976.1,328,T,C,ZT0.GT,./.,NC_044976.1:328,ZT0,
9,NC_044976.1,441,G,C,ZT0.GT,C|G,NC_044976.1:441,ZT0,204.0


In [23]:
# Persist the long-format genotype table (without the DataFrame index); the second
# part of the notebook reloads this file.
gt.to_parquet("./data_files/whatshap_phased_blocks.parquet", index=False)

In [24]:
# (rows, columns) of the long-format genotype table.
gt.shape

(193417056, 9)

#### Let's read the ASE data

In [4]:
# Reload the long-format WhatsHap genotype / phase-set table written by the first
# part of this notebook.
gt = pd.read_parquet("./data_files/whatshap_phased_blocks.parquet")

In [5]:
# Load the concatenated ASE table (one row per tissue / timepoint / locus with
# ref/alt read counts and annotations).
df = pd.read_parquet("./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_multiline_dupes_dropped_phase_added_with_TPM_and_genomic_data.parquet")

In [6]:
# Preview the ASE table.
df.head(2)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,...,homologBias,ai_type,ai_cat,chrom,pos,SNV,genomic_retained,gene_id,TPM,region
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577,T,...,,MA,MA,1,204,1:204,dropped,,,Missing
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0,G,...,0.0,no_ai,LT5_reads,1,4126,1:4126,dropped,,,Missing


In [7]:
# Preview the reloaded WhatsHap table.
gt.head(2)

Unnamed: 0,CHROM,POS,REF,ALT,GT,value,SNV,animal,PS
0,NC_044976.1,194,C,T,ZT0.GT,./.,NC_044976.1:194,ZT0,
1,NC_044976.1,204,T,A,ZT0.GT,T/A,NC_044976.1:204,ZT0,


In [8]:
# Distinct loci present in the ASE table.
snvs = set(df['loci'].unique())

# Displayed pair: (distinct SNVs in the WhatsHap table, how many of them also
# occur as a locus in the ASE table).
whatshap_snvs = gt['SNV']
whatshap_snvs.nunique(), whatshap_snvs[whatshap_snvs.isin(snvs)].nunique()

(16118088, 5949068)

In [9]:
# For every animal, build {'value': {SNV: genotype}, 'PS': {SNV: phase set}}.
whatshap_dict = {}
for animal_id in tqdm(gt['animal'].unique()):
    animal_rows = gt.loc[gt['animal'].eq(animal_id), ['SNV', 'value', 'PS']]
    whatshap_dict[animal_id] = animal_rows.set_index('SNV').to_dict()

  0%|          | 0/12 [00:00<?, ?it/s]

In [10]:
# Each per-animal entry holds two SNV-keyed lookups: 'value' and 'PS'.
whatshap_dict['ZT0'].keys()

dict_keys(['value', 'PS'])

In [10]:
# Drop the long-format frame (presumably to free memory); the cells below only
# use the lookups stored in whatshap_dict.
del gt

In [11]:
# Annotate every ASE row with the WhatsHap genotype of its locus, taken from the
# dictionary of the animal named by the row's timepoint (NaN when the locus is absent).
# This does one vectorized Series.map per timepoint instead of a row-wise
# apply(axis=1) over ~46.5M rows; Series.map with a dict returns NaN for keys that
# are not in the dict, matching the explicit `else np.nan` it replaces.
# Results are written by integer position, so a non-unique index is not a problem.
# NOTE: a timepoint missing from whatshap_dict still raises KeyError; rows whose
# timepoint is NaN are left as NaN.
whatshap_gt = np.full(len(df), np.nan, dtype=object)
for timepoint, row_positions in df.groupby('timepoint', observed=True).indices.items():
    genotype_by_snv = whatshap_dict[timepoint]['value']
    whatshap_gt[row_positions] = df['loci'].iloc[row_positions].map(genotype_by_snv).to_numpy()
df['whatshap_GT'] = whatshap_gt

  0%|          | 0/46555541 [00:00<?, ?it/s]

In [12]:
# Annotate every ASE row with the WhatsHap phase-set value of its locus, taken from
# the dictionary of the animal named by the row's timepoint (NaN when the locus is absent).
# This does one vectorized Series.map per timepoint instead of a row-wise
# apply(axis=1) over ~46.5M rows; Series.map with a dict returns NaN for keys that
# are not in the dict, matching the explicit `else np.nan` it replaces.
# Results are written by integer position, so a non-unique index is not a problem.
# NOTE: a timepoint missing from whatshap_dict still raises KeyError; rows whose
# timepoint is NaN are left as NaN.
whatshap_ps = np.full(len(df), np.nan)  # assumes phase-set values are numeric (float)
for timepoint, row_positions in df.groupby('timepoint', observed=True).indices.items():
    phase_set_by_snv = whatshap_dict[timepoint]['PS']
    whatshap_ps[row_positions] = df['loci'].iloc[row_positions].map(phase_set_by_snv).to_numpy()
df['whatshap_PS'] = whatshap_ps

  0%|          | 0/46555541 [00:00<?, ?it/s]

In [13]:
# Preview the ASE table with the new whatshap_GT / whatshap_PS columns.
df.head(2)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,...,ai_cat,chrom,pos,SNV,genomic_retained,gene_id,TPM,region,whatshap_GT,whatshap_PS
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577,T,...,MA,1,204,1:204,dropped,,,Missing,T/A,
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0,G,...,LT5_reads,1,4126,1:4126,dropped,,,Missing,A|G,204.0


In [14]:
# Save the ASE table together with the added whatshap_GT / whatshap_PS columns.
df.to_parquet("./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_multiline_dupes_dropped_phase_added_with_TPM_and_genomic_data_with_whatshap_phase_data.parquet")