## Matrix Processing with Polars
### 2023-10-12

This notebook details the steps to process the matrix of allele frequencies for each variant in each sample. The matrix is generated from the output of the `bcftools` pipeline.

In [104]:
import pandas as pd
import bioframe as bf
import matplotlib.pyplot as plt

In [125]:
# Read one example filtered footprint-score matrix (rows overlap the variant sites in the
# vcf files) and tidy its header in a single chain.
df_fpscore = (
    pd.read_csv(
        "/scratch/users/ntu/suffiazi/outputs/brca-vcf-filtered-fpscore-matrices/TFDP1_M08108_2.00_BRCA-subtype-vcf-filtered-matrix.txt",
        sep="\t",
    )
    # discard the "TFBS_strand" and "TFBS_score" columns
    .drop(columns=["TFBS_strand", "TFBS_score"])
    # switch to chrom/start/end naming and shorten the Normal-like sample label
    .rename(
        columns={
            "TFBS_chr": "chrom",
            "TFBS_start": "start",
            "TFBS_end": "end",
            "2GAMBDQ_Normal-like_score": "2GAMBDQ_Norm_fps",
        }
    )
    # every remaining header that ends with 'score' has 'score' swapped for 'fps'
    .rename(columns=lambda col: col.replace('score', 'fps') if col.endswith('score') else col)
)

# preview the first ten rows
df_fpscore.head(10)

Unnamed: 0,chrom,start,end,98JKPD8_LumA_fps,ANAB5F7_Basal_fps,S6R691V_Her2_fps,PU24GB8_LumB_fps,2GAMBDQ_Norm_fps
0,chr1,191257,191268,0.0,0.0,0.0,0.0,0.0
1,chr1,818021,818032,0.02966,0.04399,0.02131,0.0347,0.03443
2,chr1,818780,818791,0.02572,0.02376,0.02271,0.02814,0.03391
3,chr1,960570,960581,0.28046,0.335,0.28495,0.27618,0.22986
4,chr1,977015,977026,1.10047,0.50514,0.71035,1.21305,1.30155
5,chr1,977047,977058,0.88179,0.40974,0.53173,0.98628,1.07101
6,chr1,977110,977121,0.13137,0.08845,0.10541,0.16812,0.15493
7,chr1,977142,977153,0.14436,0.08225,0.09233,0.17809,0.14291
8,chr1,977174,977185,0.22616,0.10631,0.13808,0.24319,0.22306
9,chr1,977206,977217,0.2971,0.15487,0.21703,0.33427,0.31093


In [126]:
# Basal sample (ANAB5F7): per-site allele-frequency table, with indels and multiallelic
# sites already split into separate rows. Read it and give the header plain column names.
df_vcf_basal = pd.read_csv(
    "/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode-AF-query/ANAB5F7/ANAB5F7_TFDP1_M08108_2.00_AF-per-site-with-indels.txt",
    sep="\t",
).rename(
    columns={
        "#[1]CHROM": "chrom",
        "[2]POS": "start",
        "[3]REF": "ref_allele",
        "[4]ALT": "alt_allele",
        "[5]AF": "AF",
    }
)

# Place an "end" column at position 2 (right after "start"), copying the "start" values.
# NOTE(review): start == end makes every variant a zero-length interval; confirm this is the
# coordinate convention the later bf.overlap call is meant to receive.
df_vcf_basal.insert(loc=2, column="end", value=df_vcf_basal["start"])

df_vcf_basal.head(10)

Unnamed: 0,chrom,start,end,ref_allele,alt_allele,AF
0,chr1,818025,818025,C,A,0.8125
1,chr1,818783,818783,T,C,0.033333
2,chr1,977023,977023,G,A,0.941176
3,chr1,977055,977055,G,A,0.5
4,chr1,977118,977118,G,A,0.852941
5,chr1,977150,977150,G,A,0.53125
6,chr1,977214,977214,G,A,0.588235
7,chr1,977338,977338,G,A,1.0
8,chr1,977370,977370,G,A,1.0
9,chr1,986629,986629,G,A,0.15625


In [127]:
# LumA sample (98JKPD8): per-site allele-frequency table, with indels and multiallelic
# sites already split into separate rows. Read it and give the header plain column names.
df_vcf_lumA = pd.read_csv(
    "/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode-AF-query/98JKPD8/98JKPD8_TFDP1_M08108_2.00_AF-per-site-with-indels.txt",
    sep="\t",
).rename(
    columns={
        "#[1]CHROM": "chrom",
        "[2]POS": "start",
        "[3]REF": "ref_allele",
        "[4]ALT": "alt_allele",
        "[5]AF": "AF",
    }
)

# Place an "end" column at position 2 (right after "start"), copying the "start" values.
# NOTE(review): start == end makes every variant a zero-length interval; confirm this is the
# coordinate convention the later bf.overlap call is meant to receive.
df_vcf_lumA.insert(loc=2, column="end", value=df_vcf_lumA["start"])

df_vcf_lumA.head(10)

Unnamed: 0,chrom,start,end,ref_allele,alt_allele,AF
0,chr1,818025,818025,C,A,0.9
1,chr1,818783,818783,T,C,0.125
2,chr1,960574,960574,C,A,0.041667
3,chr1,977023,977023,G,A,1.0
4,chr1,977055,977055,G,A,1.0
5,chr1,977118,977118,G,A,0.909091
6,chr1,977150,977150,G,A,0.625
7,chr1,977214,977214,G,A,0.818182
8,chr1,977338,977338,G,A,1.0
9,chr1,977370,977370,G,A,1.0


In [128]:
# LumB sample (PU24GB8): per-site allele-frequency table, with indels and multiallelic
# sites already split into separate rows. Read it and give the header plain column names.
df_vcf_lumB = pd.read_csv(
    "/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode-AF-query/PU24GB8/PU24GB8_TFDP1_M08108_2.00_AF-per-site-with-indels.txt",
    sep="\t",
).rename(
    columns={
        "#[1]CHROM": "chrom",
        "[2]POS": "start",
        "[3]REF": "ref_allele",
        "[4]ALT": "alt_allele",
        "[5]AF": "AF",
    }
)

# Place an "end" column at position 2 (right after "start"), copying the "start" values.
# NOTE(review): start == end makes every variant a zero-length interval; confirm this is the
# coordinate convention the later bf.overlap call is meant to receive.
df_vcf_lumB.insert(loc=2, column="end", value=df_vcf_lumB["start"])

df_vcf_lumB.head(10)

Unnamed: 0,chrom,start,end,ref_allele,alt_allele,AF
0,chr1,191258,191258,G,C,0.026316
1,chr1,818025,818025,C,A,0.833333
2,chr1,818783,818783,T,C,0.055556
3,chr1,977023,977023,G,A,0.947368
4,chr1,977055,977055,G,A,0.789474
5,chr1,977118,977118,G,A,0.944444
6,chr1,977150,977150,G,A,1.0
7,chr1,977214,977214,G,A,0.868421
8,chr1,977338,977338,G,A,1.0
9,chr1,977370,977370,G,A,1.0


In [129]:
# Her2 sample (S6R691V): per-site allele-frequency table, with indels and multiallelic
# sites already split into separate rows. Read it and give the header plain column names.
df_vcf_her2 = pd.read_csv(
    "/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode-AF-query/S6R691V/S6R691V_TFDP1_M08108_2.00_AF-per-site-with-indels.txt",
    sep="\t",
).rename(
    columns={
        "#[1]CHROM": "chrom",
        "[2]POS": "start",
        "[3]REF": "ref_allele",
        "[4]ALT": "alt_allele",
        "[5]AF": "AF",
    }
)

# Place an "end" column at position 2 (right after "start"), copying the "start" values.
# NOTE(review): start == end makes every variant a zero-length interval; confirm this is the
# coordinate convention the later bf.overlap call is meant to receive.
df_vcf_her2.insert(loc=2, column="end", value=df_vcf_her2["start"])

# preview for sample S6R691V
df_vcf_her2.head(10)

Unnamed: 0,chrom,start,end,ref_allele,alt_allele,AF
0,chr1,818025,818025,C,A,0.9
1,chr1,818783,818783,T,C,0.0625
2,chr1,977023,977023,G,A,0.911765
3,chr1,977055,977055,G,A,0.588235
4,chr1,977118,977118,G,A,1.0
5,chr1,977150,977150,G,A,1.0
6,chr1,977338,977338,G,A,1.0
7,chr1,977370,977370,G,A,1.0
8,chr1,986629,986629,G,A,0.264706
9,chr1,996350,996350,A,G,1.0


In [130]:
# Normal-like sample (2GAMBDQ): per-site allele-frequency table, with indels and multiallelic
# sites already split into separate rows. Read it and give the header plain column names.
df_vcf_norm = pd.read_csv(
    "/scratch/users/ntu/suffiazi/outputs/brca-called-variants-diffmode-AF-query/2GAMBDQ/2GAMBDQ_TFDP1_M08108_2.00_AF-per-site-with-indels.txt",
    sep="\t",
).rename(
    columns={
        "#[1]CHROM": "chrom",
        "[2]POS": "start",
        "[3]REF": "ref_allele",
        "[4]ALT": "alt_allele",
        "[5]AF": "AF",
    }
)

# Place an "end" column at position 2 (right after "start"), copying the "start" values.
# NOTE(review): start == end makes every variant a zero-length interval; confirm this is the
# coordinate convention the later bf.overlap call is meant to receive.
df_vcf_norm.insert(loc=2, column="end", value=df_vcf_norm["start"])

# preview for sample 2GAMBDQ
df_vcf_norm.head(10)

Unnamed: 0,chrom,start,end,ref_allele,alt_allele,AF
0,chr1,818025,818025,C,A,0.875
1,chr1,977023,977023,G,A,1.0
2,chr1,977055,977055,G,A,0.75
3,chr1,977118,977118,G,A,1.0
4,chr1,977150,977150,G,A,1.0
5,chr1,977182,977182,G,A,1.0
6,chr1,977214,977214,G,A,1.0
7,chr1,977338,977338,G,A,1.0
8,chr1,977370,977370,G,A,1.0
9,chr1,977402,977402,G,A,1.0


In [131]:
# Left-overlap the footprint-score matrix with each subtype's variant table in turn, keeping
# only the variant position and allele frequency from every join, then save the combined table.

# prefixes of the VCF-side columns that are discarded after each overlap
drop_patterns = ["ref_allele_", "alt_allele_", "chrom_", "end_"]

# (variant table, column suffix) pairs, in the order their columns are appended
vcf_tables = [
    (df_vcf_basal, "_ANAB5F7_Basal_varsite"),
    (df_vcf_lumA, "_98JKPD8_LumA_varsite"),
    (df_vcf_lumB, "_PU24GB8_LumB_varsite"),
    (df_vcf_her2, "_S6R691V_Her2_varsite"),
    (df_vcf_norm, "_2GAMBDQ_Norm_varsite"),
]


def add_variant_columns(sites_df, vcf_df, suffix):
    """Left-overlap `sites_df` with `vcf_df` and keep only the variant position and AF.

    Parameters
    ----------
    sites_df : pandas.DataFrame
        Left-hand table passed to ``bf.overlap``; its column names are kept unchanged
        (empty suffix).
    vcf_df : pandas.DataFrame
        Right-hand variant table; its columns receive `suffix` in the overlap result.
    suffix : str
        Suffix appended to the columns coming from `vcf_df`.

    Returns
    -------
    pandas.DataFrame
        The overlap result without the columns whose names start with one of
        `drop_patterns`, and with a leading ``start_`` renamed to ``pos_``.
    """
    merged = bf.overlap(sites_df, vcf_df, suffixes=["", suffix], how='left')

    # Match on the prefix only: a substring match (DataFrame.filter(like=...)) would also
    # drop any unrelated column that merely contains one of the patterns somewhere in its name.
    unwanted = [col for col in merged.columns if col.startswith(tuple(drop_patterns))]
    merged = merged.drop(columns=unwanted)

    # Rewrite only the leading 'start_' so the rest of the column name is left untouched.
    return merged.rename(
        columns=lambda col: 'pos_' + col[len('start_'):] if col.startswith('start_') else col
    )


# start from the footprint-score matrix and fold in one variant table per iteration
overlap_df = df_fpscore
for vcf_df, suffix in vcf_tables:
    overlap_df = add_variant_columns(overlap_df, vcf_df, suffix)

# save the dataframe to a file
overlap_df.to_csv('/home/users/ntu/suffiazi/scripts/gatk-workflow-scripts/polars_processing/combined_fps-variant-freq.tsv', sep='\t', index=False)

overlap_df.head(n=15)

Unnamed: 0,chrom,start,end,98JKPD8_LumA_fps,ANAB5F7_Basal_fps,S6R691V_Her2_fps,PU24GB8_LumB_fps,2GAMBDQ_Norm_fps,pos_ANAB5F7_Basal_varsite,AF_ANAB5F7_Basal_varsite,pos_98JKPD8_LumA_varsite,AF_98JKPD8_LumA_varsite,pos_PU24GB8_LumB_varsite,AF_PU24GB8_LumB_varsite,pos_S6R691V_Her2_varsite,AF_S6R691V_Her2_varsite,pos_2GAMBDQ_Norm_varsite,AF_2GAMBDQ_Norm_varsite
0,chr1,191257,191268,0.0,0.0,0.0,0.0,0.0,,,,,191258.0,0.026316,,,,
1,chr1,818021,818032,0.02966,0.04399,0.02131,0.0347,0.03443,818025.0,0.8125,818025.0,0.9,818025.0,0.833333,818025.0,0.9,818025.0,0.875
2,chr1,818780,818791,0.02572,0.02376,0.02271,0.02814,0.03391,818783.0,0.033333,818783.0,0.125,818783.0,0.055556,818783.0,0.0625,,
3,chr1,960570,960581,0.28046,0.335,0.28495,0.27618,0.22986,,,960574.0,0.041667,,,,,,
4,chr1,977015,977026,1.10047,0.50514,0.71035,1.21305,1.30155,977023.0,0.941176,977023.0,1.0,977023.0,0.947368,977023.0,0.911765,977023.0,1.0
5,chr1,977047,977058,0.88179,0.40974,0.53173,0.98628,1.07101,977055.0,0.5,977055.0,1.0,977055.0,0.789474,977055.0,0.588235,977055.0,0.75
6,chr1,977110,977121,0.13137,0.08845,0.10541,0.16812,0.15493,977118.0,0.852941,977118.0,0.909091,977118.0,0.944444,977118.0,1.0,977118.0,1.0
7,chr1,977142,977153,0.14436,0.08225,0.09233,0.17809,0.14291,977150.0,0.53125,977150.0,0.625,977150.0,1.0,977150.0,1.0,977150.0,1.0
8,chr1,977174,977185,0.22616,0.10631,0.13808,0.24319,0.22306,,,,,,,,,977182.0,1.0
9,chr1,977206,977217,0.2971,0.15487,0.21703,0.33427,0.31093,977214.0,0.588235,977214.0,0.818182,977214.0,0.868421,,,977214.0,1.0


In [134]:
# show only row number 100 and 101
overlap_df.iloc[[99, 100]]

Unnamed: 0,chrom,start,end,98JKPD8_LumA_fps,ANAB5F7_Basal_fps,S6R691V_Her2_fps,PU24GB8_LumB_fps,2GAMBDQ_Norm_fps,pos_ANAB5F7_Basal_varsite,AF_ANAB5F7_Basal_varsite,pos_98JKPD8_LumA_varsite,AF_98JKPD8_LumA_varsite,pos_PU24GB8_LumB_varsite,AF_PU24GB8_LumB_varsite,pos_S6R691V_Her2_varsite,AF_S6R691V_Her2_varsite,pos_2GAMBDQ_Norm_varsite,AF_2GAMBDQ_Norm_varsite
99,chr1,3653047,3653058,0.07863,0.04912,0.05257,0.0775,0.05414,3653055,0.029412,,,,,,,,
100,chr1,3653047,3653058,0.07863,0.04912,0.05257,0.0775,0.05414,3653056,0.029412,,,,,,,,


In [90]:

# Toy frame: only the first column name contains the substring "col_"
df = pd.DataFrame(
    {
        'col_1': [1, 2, 3],
        'col2': [4, 5, 6],
        'col3': [7, 8, 9],
        'col4': [10, 11, 12],
    }
)

# Select the columns whose names contain "col_" and remove them from the frame
result = df.drop(df.filter(like="col_", axis="columns").columns, axis="columns")

# Show what is left
print(result)

   col2  col3  col4
0     4     7    10
1     5     8    11
2     6     9    12
