In [9]:
import pandas as pd

cols = [
    "#AlleleID",
    "VariationID",
    "Type",
    "Name",
    "GeneID",
    "GeneSymbol",
    "HGNC_ID",
    "ClinicalSignificance",
    "ClinSigSimple",
    "Chromosome",
    "Start",
    "RS# (dbSNP)",
    "PositionVCF",
    "ReferenceAlleleVCF",
    "AlternateAlleleVCF",
    "OtherIDs",
]

clinvar_df = pd.read_csv(
    "data/variant_summary.txt",
    sep="\t",
    dtype=str,  # prevents dtype inference issues
    usecols=cols,
    low_memory=False,  # avoids mixed-type warnings
)

clinvar_df['ClinicalSignificance'].unique()

array(['Pathogenic/Likely pathogenic', 'Pathogenic',
       'Uncertain significance', 'Likely pathogenic',
       'Conflicting classifications of pathogenicity',
       'Conflicting classifications of pathogenicity; other; risk factor',
       'Conflicting classifications of pathogenicity; other', 'Benign',
       'risk factor', 'Likely benign', 'association',
       'Likely pathogenic; risk factor', 'Benign/Likely benign',
       'no classification for the single variant',
       'Conflicting classifications of pathogenicity; risk factor',
       'drug response', 'no classifications from unflagged records',
       'Affects', 'Benign; drug response', 'Likely benign; drug response',
       'Conflicting classifications of pathogenicity; association; risk factor',
       'Pathogenic; risk factor', 'Benign; risk factor',
       'Benign/Likely benign; other',
       'Pathogenic/Likely pathogenic; risk factor', 'Benign; other',
       'not provided', 'protective; risk factor', 'Likely benign

In [11]:
snv = clinvar_df[clinvar_df["Type"] == "single nucleotide variant"]
snv_test = snv[:200]

snv_test['ClinicalSignificance'].unique()
snv_test

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,RS# (dbSNP),Chromosome,Start,OtherIDs,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF
4,15043,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,150829393,15,85342440,"ClinGen:CA210674,UniProtKB:Q92610#VAR_064583,O...",4,85342440,G,A
5,15043,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,150829393,15,84799209,"ClinGen:CA210674,UniProtKB:Q92610#VAR_064583,O...",4,84799209,G,A
6,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,267606829,11,126145284,"ClinGen:CA113792,OMIM:613622.0001",5,126145284,C,T
7,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,267606829,11,126275389,"ClinGen:CA113792,OMIM:613622.0001",5,126275389,C,T
8,15045,single nucleotide variant,NM_017547.4(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Likely pathogenic,1,267606830,11,126147412,"ClinGen:CA113794,UniProtKB:Q96CU9#VAR_064571,O...",6,126147412,A,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,15169,single nucleotide variant,NM_000071.3(CBS):c.1397C>T (p.Ser466Leu),875,CBS,HGNC:1550,Conflicting classifications of pathogenicity,1,121964971,21,43058215,"ClinGen:CA113900,UniProtKB:P35520#VAR_008091,O...",130,43058215,G,A
252,15170,single nucleotide variant,NM_000071.3(CBS):c.1058C>T (p.Thr353Met),875,CBS,HGNC:1550,Pathogenic,1,121964972,21,44480638,"ClinGen:CA113902,UniProtKB:P35520#VAR_008082,O...",131,44480638,G,A
253,15170,single nucleotide variant,NM_000071.3(CBS):c.1058C>T (p.Thr353Met),875,CBS,HGNC:1550,Pathogenic,1,121964972,21,43060528,"ClinGen:CA113902,UniProtKB:P35520#VAR_008082,O...",131,43060528,G,A
254,15171,single nucleotide variant,NM_000071.3(CBS):c.572C>T (p.Thr191Met),875,CBS,HGNC:1550,Pathogenic,1,121964973,21,44485591,"ClinGen:CA113904,UniProtKB:P35520#VAR_008068,O...",132,44485591,G,A


In [34]:
import pysam

fa = pysam.FastaFile("data/GRCh38.fa")  # path to your GRCh37 FASTA


def get_dna_window(row, flank=512):
    chrom = str(row["Chromosome"])  # e.g. '15'
    pos = int(row["PositionVCF"])  # 1-based
    ref = row["ReferenceAlleleVCF"]  # 'G'
    alt = row["AlternateAlleleVCF"]  # 'A'

    start = pos - flank
    end = pos + flank

    # pysam: 0-based, half-open
    wt_seq = fa.fetch(chrom, start - 1, end)

    # sanity check
    ref_base = fa.fetch(chrom, pos - 1, pos).upper()
    if ref_base != ref:
        raise ValueError(
            f"Ref mismatch at {chrom}:{pos}: fasta={ref_base}, ClinVar={ref}"
        )

    offset = flank  # position of variant within the window
    alt_seq = wt_seq[:offset] + alt + wt_seq[offset + 1 :]

    return wt_seq, alt_seq

In [41]:
row = snv_test.iloc[1]
wt_dna, alt_dna = get_dna_window(row)

wt_dna, alt_dna

('CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGGTAGGCCTGAAGCCCACCGCAGGGTGGAAGCCAGGCCGCGGCTGAGGAACACTGGCTGGACCTGCCAGGAGTGCCAGGAGTGGGTTCCAGATCGGGAGAGCTACGTGTCCCACATGAAAAAGAGCCACGGTCGGGTAAGTGCAGCCACACAGTCATAATGCAGAGCCCAGTCCTCTGGACTTCCTTCTGTGAAGCCAGAACCCCTAGGGTTCCTGGTGCTTAGGGCAGGGTGGGTACCACAGATCTTGAGGTCTTCGGGAGTATCCTCCTTTCTGCCCATGGCATCTGAGAAGAAAAATGCACCCAGAACTATCTTACAGTTCTGATGCTTTGTGTGTTGCCACCTCCTTCCTTAGACATTGAAGCGGTACCCATGCCGGCAGTGTGAACAGTCCTTCCACACCCCCAACAGCCTGCGCAAACACATCCGCAACAACCATGACACAGTAAAGAAGTTCTACACCTGCGGGTGAGTCCCTGGGGATAGTAGTGAGGAGGCCTGAGGTTCAAAAGACTCTGTCCGTGGCACCACTGGGGCTTTTCTGTGCTGCAAGATCAGGTGTCTAAGACAAGAGACAAGTGATTTCCAACTGGAAGAAATTGCGGCTAAGTCAGAAATCAGGGGCAGGTCAAAAATCAGCTTCCAGAACCTGGTAGCTCCTGAGCCCTCTCTCGTCACTCTCTAGCCCAGGACTGCACAGCCCATCAGTCACGAAGCATCCTGAGGTTTAAGAGGAGGGGTGGTGATGTGAGCATGCACCCCTGGGGCCGAGAGGCTTCTGCACCATCTGCCTGTGCCTTGGGGTGGCCCCAGGCCCTTTCACTGTGACTACTAGCCTAGCACTTGGGTCTCTGGGCGGTGACATCAGGTAGTGTTCAGTGAGATTGTGGCTGAGTGCCAGGTGTCAAGAGTGCAAGTATTCTGACGTGCTATTGTCTGCTACCTTGGCTGG

In [36]:
import re
import requests

three2one = {
    "Ala": "A",
    "Arg": "R",
    "Asn": "N",
    "Asp": "D",
    "Cys": "C",
    "Glu": "E",
    "Gln": "Q",
    "Gly": "G",
    "His": "H",
    "Ile": "I",
    "Leu": "L",
    "Lys": "K",
    "Met": "M",
    "Phe": "F",
    "Pro": "P",
    "Ser": "S",
    "Thr": "T",
    "Trp": "W",
    "Tyr": "Y",
    "Val": "V",
    "Ter": "*",
    "Sec": "U",
    "Pyl": "O",
}


def get_uniprot_acc(other_ids: str):
    m = re.search(r"UniProtKB:([A-Z0-9]+)", str(other_ids))
    return m.group(1) if m else None


def fetch_uniprot_seq(acc: str) -> str:
    url = f"https://rest.uniprot.org/uniprotkb/{acc}.fasta"
    r = requests.get(url)
    r.raise_for_status()
    lines = r.text.splitlines()
    return "".join(l.strip() for l in lines if not l.startswith(">"))


def parse_protein_hgvs(name: str):
    # handles p.Gly1046Arg style
    m = re.search(r"p\.([A-Za-z]{3})(\d+)([A-Za-z]{3})", str(name))
    if not m:
        raise ValueError(f"Cannot parse protein HGVS from {name}")
    ref3, pos, alt3 = m.groups()
    pos = int(pos)
    ref1 = three2one[ref3]
    alt1 = three2one[alt3]
    return pos, ref1, alt1


def get_protein_wt_mut(row):
    acc = get_uniprot_acc(row["OtherIDs"])
    if acc is None:
        raise ValueError("No UniProt accession found in OtherIDs")

    prot_wt = fetch_uniprot_seq(acc)
    pos, ref_aa, alt_aa = parse_protein_hgvs(row["Name"])

    if prot_wt[pos - 1] != ref_aa:
        raise ValueError(
            f"Ref AA mismatch: protein has {prot_wt[pos - 1]} at {pos}, HGVS says {ref_aa}"
        )

    prot_mut = prot_wt[: pos - 1] + alt_aa + prot_wt[pos:]
    return prot_wt, prot_mut

In [43]:
prot_wt, prot_mut = get_protein_wt_mut(row)
prot_wt, prot_mut

('MGDMKTPDFDDLLAAFDIPDPTSLDAKEAIQTPSEENESPLKPPGICMDESVSLSHSGSAPDVPAVSVIVKNTSRQESFEAEKDHITPSLLHNGFRGSDLPPDPHNCGKFDSTFMNGDSARSFPGKLEPPKSEPLPTFNQFSPISSPEPEDPIKDNGFGIKPKHSDSYFPPPLGCGAVGGPVLEALAKFPVPELHMFDHFCKKEPKPEPLPLGSQQEHEQSGQNTVEPHKDPDATRFFGEALEFNSHPSNSIGESKGLARELGTCSSVPPRQRLKPAHSKLSSCVAALVALQAKRVASVTKEDQPGHTKDLSGPTKESSKGSPKMPKSPKSPRSPLEATRKSIKPSDSPRSICSDSSSKGSPSVAASSPPAIPKVRIKTIKTSSGEIKRTVTRILPDPDDPSKSPVGSPLGSAIAEAPSEMPGDEVPVEEHFPEAGTNSGSPQGARKGDESMTKASDSSSPSCSSGPRVPKGAAPGSQTGKKQQSTALQASTLAPANLLPKAVHLANLNLVPHSVAASVTAKSSVQRRSQPQLTQMSVPLVHQVKKAAPLIVEVFNKVLHSSNPVPLYAPNLSPPADSRIHVPASGYCCLECGDAFALEKSLSQHYGRRSVHIEVLCTLCSKTLLFFNKCSLLRHARDHKSKGLVMQCSQLLVKPISADQMFVSAPVNSTAPAAPAPSSSPKHGLTSGSASPPPPALPLYPDPVRLIRYSIKCLECHKQMRDYMVLAAHFQRTTEETEGLTCQVCQMLLPNQCSFCAHQRIHAHKSPYCCPECGVLCRSAYFQTHVKENCLHYARKVGYRCIHCGVVHLTLALLKSHIQERHCQVFHKCAFCPMAFKTASSTADHSATQHPTQPHRPSQLIYKCSCEMVFNKKRHIQQHFYQNVSKTQVGVFKCPECPLLFVQKPELMQHVKSTHGVPRNVDELSSLQSSADTSSSRPGSRVPTEPPATSVAARSSSLPSGRWGRPEAHRRVEARPRLRNTGWTCQECQEWVPDRESY

In [None]:
import pandas as pd

# df = your full variant_summary dataframe


def process_variant_row(row, flank=512):
    """
    Try to build everything we need for one SNV row.
    Return a dict if successful, or None if anything fails.
    """
    try:
        # 1) DNA windows (wt + alt)
        wt_dna, alt_dna = get_dna_window(row, flank=flank)

        # 2) Protein sequences (wt + mutant)
        prot_wt, prot_mut = get_protein_wt_mut(row)

        # 3) Label from ClinSigSimple (already 0/1)
        label = int(row["ClinSigSimple"])

    except Exception:
        # Any failure → skip this variant
        return None

    return {
        "variant_id": int(row["VariationID"]),
        "chrom": str(row["Chromosome"]),
        "pos": int(row["PositionVCF"]),
        "ref": row["ReferenceAlleleVCF"],
        "alt": row["AlternateAlleleVCF"],
        "gene_symbol": row["GeneSymbol"],
        "wt_dna": wt_dna,
        "alt_dna": alt_dna,
        "prot_wt": prot_wt,
        "prot_mut": prot_mut,
        "label": label,
    }


def build_snv_dataset(df, flank=512):
    """
    Go over all rows, keep only SNVs where both DNA + protein processing work.
    Returns a clean DataFrame ready for embedding.
    """
    records = []

    for _, row in df.iterrows():
        # Only single nucleotide variants
        if row["Type"] != "single nucleotide variant":
            continue

        rec = process_variant_row(row, flank=flank)
        if rec is not None:
            records.append(rec)

    return pd.DataFrame.from_records(records)


# Example usage:
snv_df = build_snv_dataset(snv_test, flank=512)
print(snv_df.head())
print(len(snv_df), "SNVs kept after filtering")

   variant_id chrom        pos ref alt gene_symbol  \
0           4    15   84799209   G   A      ZNF592   
1           6    11  126277517   A   G     FOXRED1   
2      214885    14   32031331   G   A       NUBPL   
3      214885    14   31562125   G   A       NUBPL   
4           9     6   26092913   G   A         HFE   

                                              wt_dna  \
0  CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGG...   
1  TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...   
2  TATGTCTCTTCCACCATCCTGAAGGCTAGTACTCTGCATAAAACCA...   
3  CCTGGCATTGCTTCTTGCCAAAGACTCGCCTCAGTTCCTGAGACCC...   
4  TCATTTTCAATGCACATAAAGGGCAATTTTATCTATCAGAACAAAG...   

                                             alt_dna  \
0  CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGG...   
1  TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...   
2  TATGTCTCTTCCACCATCCTGAAGGCTAGTACTCTGCATAAAACCA...   
3  CCTGGCATTGCTTCTTGCCAAAGACTCGCCTCAGTTCCTGAGACCC...   
4  TCATTTTCAATGCACATAAAGGGCAATTTTATCTATCAGAACAAAG...   

 

In [49]:
snv_df.shape

(66, 11)

In [50]:
snv_df.head()

Unnamed: 0,variant_id,chrom,pos,ref,alt,gene_symbol,wt_dna,alt_dna,prot_wt,prot_mut,label
0,4,15,84799209,G,A,ZNF592,CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGG...,CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGG...,MGDMKTPDFDDLLAAFDIPDPTSLDAKEAIQTPSEENESPLKPPGI...,MGDMKTPDFDDLLAAFDIPDPTSLDAKEAIQTPSEENESPLKPPGI...,0
1,6,11,126277517,A,G,FOXRED1,TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...,TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...,MIRRVLPHGMGRGLLTRRPGTRRGGFSLDWDGKVSEIKKKIKSILP...,MIRRVLPHGMGRGLLTRRPGTRRGGFSLDWDGKVSEIKKKIKSILP...,1
2,214885,14,32031331,G,A,NUBPL,TATGTCTCTTCCACCATCCTGAAGGCTAGTACTCTGCATAAAACCA...,TATGTCTCTTCCACCATCCTGAAGGCTAGTACTCTGCATAAAACCA...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,1
3,214885,14,31562125,G,A,NUBPL,CCTGGCATTGCTTCTTGCCAAAGACTCGCCTCAGTTCCTGAGACCC...,CCTGGCATTGCTTCTTGCCAAAGACTCGCCTCAGTTCCTGAGACCC...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,1
4,9,6,26092913,G,A,HFE,TCATTTTCAATGCACATAAAGGGCAATTTTATCTATCAGAACAAAG...,TCATTTTCAATGCACATAAAGGGCAATTTTATCTATCAGAACAAAG...,MGPRARPALLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSL...,MGPRARPALLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSL...,1


In [51]:
snv_df.iloc[1]

variant_id                                                     6
chrom                                                         11
pos                                                    126277517
ref                                                            A
alt                                                            G
gene_symbol                                              FOXRED1
wt_dna         TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...
alt_dna        TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...
prot_wt        MIRRVLPHGMGRGLLTRRPGTRRGGFSLDWDGKVSEIKKKIKSILP...
prot_mut       MIRRVLPHGMGRGLLTRRPGTRRGGFSLDWDGKVSEIKKKIKSILP...
label                                                          1
Name: 1, dtype: object

In [None]:
import polars as pl

df_pl = pl.read_parquet("snvs.parquet")
df_pl.head()

variant_id,chrom,pos,ref,alt,gene_symbol,wt_dna,alt_dna,prot_wt,prot_mut,label
i64,str,i64,str,str,str,str,str,str,str,i64
4,"""15""",84799209,"""G""","""A""","""ZNF592""","""CTAGTGTGGCTGCTCGGAGCAGCTCCCTGC…","""CTAGTGTGGCTGCTCGGAGCAGCTCCCTGC…","""MGDMKTPDFDDLLAAFDIPDPTSLDAKEAI…","""MGDMKTPDFDDLLAAFDIPDPTSLDAKEAI…",0
6,"""11""",126277517,"""A""","""G""","""FOXRED1""","""TAAACAAGTCTGGGCCTGTCCTTGTGTCCC…","""TAAACAAGTCTGGGCCTGTCCTTGTGTCCC…","""MIRRVLPHGMGRGLLTRRPGTRRGGFSLDW…","""MIRRVLPHGMGRGLLTRRPGTRRGGFSLDW…",1
214885,"""14""",32031331,"""G""","""A""","""NUBPL""","""TATGTCTCTTCCACCATCCTGAAGGCTAGT…","""TATGTCTCTTCCACCATCCTGAAGGCTAGT…","""MGIWQRLLLFGGVSLRAGGGATAPLGGSRA…","""MGIWQRLLLFGGVSLRAGGGATAPLGGSRA…",1
214885,"""14""",31562125,"""G""","""A""","""NUBPL""","""CCTGGCATTGCTTCTTGCCAAAGACTCGCC…","""CCTGGCATTGCTTCTTGCCAAAGACTCGCC…","""MGIWQRLLLFGGVSLRAGGGATAPLGGSRA…","""MGIWQRLLLFGGVSLRAGGGATAPLGGSRA…",1
9,"""6""",26092913,"""G""","""A""","""HFE""","""TCATTTTCAATGCACATAAAGGGCAATTTT…","""TCATTTTCAATGCACATAAAGGGCAATTTT…","""MGPRARPALLLLMLLQTAVLQGRLLRSHSL…","""MGPRARPALLLLMLLQTAVLQGRLLRSHSL…",1


: 

In [2]:
import pandas as pd
test = pd.read_parquet("snvs.parquet")

test.head()

Unnamed: 0,variant_id,chrom,pos,ref,alt,gene_symbol,wt_dna,alt_dna,prot_wt,prot_mut,label
0,4,15,84799209,G,A,ZNF592,CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGG...,CTAGTGTGGCTGCTCGGAGCAGCTCCCTGCCTTCTGGCCGCTGGGG...,MGDMKTPDFDDLLAAFDIPDPTSLDAKEAIQTPSEENESPLKPPGI...,MGDMKTPDFDDLLAAFDIPDPTSLDAKEAIQTPSEENESPLKPPGI...,0
1,6,11,126277517,A,G,FOXRED1,TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...,TAAACAAGTCTGGGCCTGTCCTTGTGTCCCAGGCAATGTAAGCGTT...,MIRRVLPHGMGRGLLTRRPGTRRGGFSLDWDGKVSEIKKKIKSILP...,MIRRVLPHGMGRGLLTRRPGTRRGGFSLDWDGKVSEIKKKIKSILP...,1
2,214885,14,32031331,G,A,NUBPL,TATGTCTCTTCCACCATCCTGAAGGCTAGTACTCTGCATAAAACCA...,TATGTCTCTTCCACCATCCTGAAGGCTAGTACTCTGCATAAAACCA...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,1
3,214885,14,31562125,G,A,NUBPL,CCTGGCATTGCTTCTTGCCAAAGACTCGCCTCAGTTCCTGAGACCC...,CCTGGCATTGCTTCTTGCCAAAGACTCGCCTCAGTTCCTGAGACCC...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,MGIWQRLLLFGGVSLRAGGGATAPLGGSRAMVCGRQLSGAGSETLK...,1
4,9,6,26092913,G,A,HFE,TCATTTTCAATGCACATAAAGGGCAATTTTATCTATCAGAACAAAG...,TCATTTTCAATGCACATAAAGGGCAATTTTATCTATCAGAACAAAG...,MGPRARPALLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSL...,MGPRARPALLLLMLLQTAVLQGRLLRSHSLHYLFMGASEQDLGLSL...,1


In [7]:
len(test.prot_mut.max())

1710

In [8]:
import polars as pl

# Load your parquet file
df = pl.read_parquet("snvs.parquet")

# Add length columns and filter
df_clean = (
    df
    .with_columns([
        pl.col("prot_wt").str.len_chars().alias("len_wt"),
        pl.col("prot_mut").str.len_chars().alias("len_mut"),
    ])
    .filter(
        (pl.col("len_wt") < 1022) &
        (pl.col("len_mut") < 1022)
    )
    .drop(["len_wt", "len_mut"])  # optional cleanup
)

# Save cleaned file
df_clean.write_parquet("snvs_filtered.parquet", compression="snappy")

print("Before:", df.shape)
print("After:", df_clean.shape)

Before: (23077, 11)
After: (15773, 11)
