In [2]:
# Cell 1: setup
import os, sys, re
import pandas as pd
from pathlib import Path

PROJECT_DIR = r"C:\Users\MYG-D02\variant-effect-prediction"
RAW_DIR     = os.path.join(PROJECT_DIR, "data", "raw")
PROC_DIR    = os.path.join(PROJECT_DIR, "data", "processed")
os.makedirs(PROC_DIR, exist_ok=True)

CLINVAR_RAW = os.path.join(RAW_DIR, "variant_summary.txt")          # or .gz
DBSNP_PATH  = os.path.join(RAW_DIR, "GCF_000001405.40")             # directory or file
GNOMAD_PATH = os.path.join(RAW_DIR, "ExAC.r1.sites.vep.vcf")       # or .gz

print("Project paths:")
print(" PROJECT_DIR:", PROJECT_DIR)
print(" RAW_DIR:", RAW_DIR)
print(" PROC_DIR:", PROC_DIR)
print(" CLINVAR_RAW:", CLINVAR_RAW)
print(" DBSNP_PATH:", DBSNP_PATH)
print(" GNOMAD_PATH:", GNOMAD_PATH)

# Try to detect VCF helper libraries
HAS_PYSAM = False; HAS_CYVCF2 = False
try:
    import pysam
    HAS_PYSAM = True
except Exception:
    pass
try:
    import cyvcf2
    HAS_CYVCF2 = True
except Exception:
    pass

print("HAS_PYSAM:", HAS_PYSAM, "HAS_CYVCF2:", HAS_CYVCF2)

# small helper: safe read CSV with gzip detection
def read_table_auto(path, **kwargs):
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)
    if path.suffix == ".gz":
        return pd.read_csv(path, compression="gzip", **kwargs)
    return pd.read_csv(path, **kwargs)


Project paths:
 PROJECT_DIR: C:\Users\MYG-D02\variant-effect-prediction
 RAW_DIR: C:\Users\MYG-D02\variant-effect-prediction/data/raw
 PROC_DIR: C:\Users\MYG-D02\variant-effect-prediction/data/processed
 CLINVAR_RAW: C:\Users\MYG-D02\variant-effect-prediction/data/raw/variant_summary.txt
 DBSNP_PATH: C:\Users\MYG-D02\variant-effect-prediction/data/raw/GCF_000001405.40
 GNOMAD_PATH: C:\Users\MYG-D02\variant-effect-prediction/data/raw/ExAC.r1.sites.vep.vcf
HAS_PYSAM: True HAS_CYVCF2: True


In [3]:
# Cell A: detect ClinVar file in RAW_DIR and show helpful info
import os, sys
from pathlib import Path
RAW_DIR = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw")

print("RAW_DIR exists?", RAW_DIR.exists(), "  path:", RAW_DIR)
if not RAW_DIR.exists(): 
    raise FileNotFoundError(f"RAW_DIR not found: {RAW_DIR}")

# list files and directories
items = sorted(RAW_DIR.iterdir(), key=lambda p: p.name.lower())
print(f"\nFiles/dirs in {RAW_DIR} (first 200):")
for p in items[:200]:
    print(" ", p.name, "(dir)" if p.is_dir() else "")

# Candidate names / patterns to look for
candidates = []
patterns = ["variant_summary", "variant-summary", "variant_summary.txt", "variant_summary.txt.gz",
            "variant_summary.txt.gz", "variant_summary.txt", "variant_summary", "ExAC", "GCF_000001405.40"]
for p in items:
    name = p.name.lower()
    if any(pat in name for pat in ["variant_summary", "variant-summary", "variant_summary.txt", "variant_summary.txt.gz", "variant_summary.txt"]):
        candidates.append(p)
# if none found, try other large files
if not candidates:
    for p in items:
        if p.suffix.lower() in [".gz", ".txt", ".vcf", ".vcf.gz"] or p.is_dir():
            candidates.append(p)

print("\nCandidate files for ClinVar (top 10):")
for c in candidates[:10]:
    print("  -", c)

# Auto-pick the most likely ClinVar file if present
clinvar_path = None
for c in candidates:
    if "variant_summary" in c.name.lower():
        clinvar_path = c
        break
# otherwise prefer gz over txt
if clinvar_path is None:
    for c in candidates:
        if c.suffix.lower() == ".gz" and "variant" in c.name.lower():
            clinvar_path = c
            break

# Final fallback: first big file > 100MB that looks like the large ClinVar you downloaded
if clinvar_path is None:
    big_files = [p for p in items if p.is_file() and p.stat().st_size > 100*1024*1024]
    if big_files:
        clinvar_path = big_files[0]

print("\nSelected clinvar candidate:", clinvar_path)
if clinvar_path:
    print(" Size (MB):", round(clinvar_path.stat().st_size/1024/1024,2))
    print(" Is dir?:", clinvar_path.is_dir())
else:
    raise FileNotFoundError("No plausible ClinVar file found in RAW_DIR. Please download variant_summary and place it in RAW_DIR or update RAW_DIR path.")


RAW_DIR exists? True   path: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw

Files/dirs in /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw (first 200):
  ExAC.r1.sites.vep.vcf (dir)
  ExAC_test.vcf 
  GCF_000001405.40 (dir)
  GCF_000001405.40.gz 
  homo_sapiens_vep_115_GRCh38 (dir)
  homo_sapiens_vep_115_GRCh38.tar.gz 
  variant_summary.txt (dir)
  variant_summary.txt.gz 

Candidate files for ClinVar (top 10):
  - /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/variant_summary.txt
  - /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/variant_summary.txt.gz

Selected clinvar candidate: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/variant_summary.txt
 Size (MB): 0.0
 Is dir?: True


In [4]:
# Cell B1 — quick diagnostics
from pathlib import Path
import os

RAW_DIR = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw")   # adjust if different
print("RAW_DIR exists?", RAW_DIR.exists(), " ->", RAW_DIR)

# list the files & directories in RAW_DIR (show sizes)
for p in sorted(RAW_DIR.iterdir(), key=lambda p: (not p.is_dir(), p.name.lower())):
    t = "DIR " if p.is_dir() else "FILE"
    try:
        size = p.stat().st_size
    except Exception:
        size = None
    print(f"{t:4} {p.name:60}  size={size}")

# inspect the specific entry you used
target = RAW_DIR / "variant_summary.txt"
print("\nTarget path:", target)
print("Exists:", target.exists())
print("Is file:", target.is_file())
print("Is dir :", target.is_dir())

if target.exists() and target.is_dir():
    print("\nContents of the directory (first 50 items):")
    for i, c in enumerate(sorted(target.iterdir(), key=lambda x: x.name)[:50]):
        print("  ", c.name)
    print("If you see a .gz or .txt file inside, use that path (print it here):")
    for c in target.iterdir():
        if c.suffix in (".gz", ".txt", ".vcf"):
            print(" -> candidate file inside:", c)


RAW_DIR exists? True  -> /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw
DIR  ExAC.r1.sites.vep.vcf                                         size=4096
DIR  GCF_000001405.40                                              size=4096
DIR  homo_sapiens_vep_115_GRCh38                                   size=4096
DIR  variant_summary.txt                                           size=4096
FILE ExAC_test.vcf                                                 size=665914
FILE GCF_000001405.40.gz                                           size=29552227779
FILE homo_sapiens_vep_115_GRCh38.tar.gz                            size=25272957721
FILE variant_summary.txt.gz                                        size=397000747

Target path: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/variant_summary.txt
Exists: True
Is file: False
Is dir : True

Contents of the directory (first 50 items):
   variant_summary.txt
If you see a .gz or .txt file inside, use that path (print it here):
 -> candidate

In [5]:
ABCA4


Using clinvar_path: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/variant_summary.txt/variant_summary.txt
Exists: True Is file: True Is dir: False
Detected compression: None
Reading ClinVar in chunks from: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/variant_summary.txt/variant_summary.txt
Processed chunk 1: found 79525 rows of benign/pathogenic
Processed chunk 2: found 45513 rows of benign/pathogenic
Processed chunk 3: found 37537 rows of benign/pathogenic
Processed chunk 4: found 27752 rows of benign/pathogenic
Processed chunk 5: found 26513 rows of benign/pathogenic
Processed chunk 6: found 77166 rows of benign/pathogenic
Processed chunk 7: found 25280 rows of benign/pathogenic
Processed chunk 8: found 15816 rows of benign/pathogenic
Processed chunk 9: found 25759 rows of benign/pathogenic
Processed chunk 10: found 17811 rows of benign/pathogenic
Processed chunk 11: found 121255 rows of benign/pathogenic
Processed chunk 12: found 50456 rows of benign/pathogenic


Unnamed: 0,#alleleid,type,name,geneid,genesymbol,hgnc_id,clinicalsignificance,clinsigsimple,lastevaluated,rs# (dbsnp),...,somaticclinicalimpact,somaticclinicalimpactlastevaluated,reviewstatusclinicalimpact,oncogenicity,oncogenicitylastevaluated,reviewstatusoncogenicity,scvsforaggregategermlineclassification,scvsforaggregatesomaticclinicalimpact,scvsforaggregateoncogenicityclassification,clnsig_simple
0,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,-,-,-,-,-,-,SCV000020156,-,-,pathogenic
1,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,-,-,-,-,-,-,SCV000020156,-,-,pathogenic
2,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Aug 17, 2025",267606829,...,-,-,-,-,-,-,SCV000680696|SCV001363290|SCV002793147|SCV0029...,-,-,pathogenic
3,15044,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Aug 17, 2025",267606829,...,-,-,-,-,-,-,SCV000680696|SCV001363290|SCV002793147|SCV0029...,-,-,pathogenic


In [6]:
# Cell 1 — standardize & label
import pandas as pd
from pathlib import Path

try:
    df  # if exists
except NameError:
    proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
   
    candidate = proc_dir / "clinvar_filtered_labels.csv"
    if candidate.exists():
        df = pd.read_csv(candidate, low_memory=False)
    else:
        raise RuntimeError("No `df` found in memory and no clinvar_filtered_labels.csv detected. Load ClinVar first.")

# Normalize column names
df.columns = [c.strip().lower() for c in df.columns]

# Map common column names to canonical names we will use
col_map = {}
# try to detect fields
if "chromosome" in df.columns:
    col_map["chromosome"] = "chr"
if "start" in df.columns:
    col_map["start"] = "pos"
if "referenceallele" in df.columns:
    col_map["referenceallele"] = "ref"
if "alternateallele" in df.columns:
    col_map["alternateallele"] = "alt"
if "rs# (dbsnp)" in df.columns:
    df = df.rename(columns={"rs# (dbsnp)": "rsid"})

rename_map = {
    "genesymbol": "gene",
    "clinicalsignificance": "clinicalsignificance",
}
df = df.rename(columns={**col_map, **rename_map})

# build minimal df with canonical columns (some may be missing; keep what exists)
keep = []
for k in ["variationid","gene","chr","pos","ref","alt","name","clinicalsignificance","rsid"]:
    if k in df.columns:
        keep.append(k)

df = df[keep].copy()

# create simple label: 'pathogenic'->1, 'benign'->0
def simplify_label(x):
    if pd.isna(x): 
        return None
    s = str(x).lower()
    # clinicalSignificance might be 'Pathogenic/likely pathogenic' etc.
    # We take the first semicolon-split or comma-split token when needed
    s0 = s.split(";")[0].split(",")[0].strip()
    if "pathogenic" in s0:
        return 1
    if "benign" in s0:
        return 0
    return None

df["label"] = df["clinicalsignificance"].apply(simplify_label)

print("Kept columns:", keep)
print("Label counts (incl. None):")
print(df["label"].value_counts(dropna=False))
# Save a checkpoint
proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
proc_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(proc_dir / "clinvar_stage1_standardized.csv", index=False)
print("Saved: clinvar_stage1_standardized.csv")


Kept columns: ['variationid', 'gene', 'chr', 'pos', 'ref', 'alt', 'name', 'clinicalsignificance', 'rsid']
Label counts (incl. None):
label
0    425402
1    388161
Name: count, dtype: int64
Saved: clinvar_stage1_standardized.csv


In [7]:
# Cell 2 — drop duplicates, basic filtering, and try to fill ref/alt from name
import re
from pathlib import Path

# reload to be safe
proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
df = pd.read_csv(proc_dir / "clinvar_stage1_standardized.csv", low_memory=False)

before = len(df)
# drop exact duplicates by major identifying fields
id_cols = [c for c in ["variationid","gene","chr","pos","ref","alt","name"] if c in df.columns]
df = df.drop_duplicates(subset=id_cols)
print(f"Rows before dedup: {before} -> after dedup: {len(df)}")

# remove rows missing chromosome or position
mask_coords = df["chr"].notnull() & df["pos"].notnull()
df = df[mask_coords].copy()
print("Rows after removing missing coords:", len(df))

# helper to extract ref/alt from name when pattern like c.1144C>T exists
def extract_ref_alt_from_name(name):
    if pd.isna(name): 
        return (None, None)
    s = str(name)
    m = re.search(r"c\.[0-9_+-]*([ACGT])>([ACGT])", s, flags=re.IGNORECASE)
    if m:
        return (m.group(1).upper(), m.group(2).upper())
    # sometimes the canonical is in parentheses like (p.Gln382Ter) — we don't get ref/alt from protein-level
    return (None, None)

# only attempt fill where ref or alt is missing (NaN or 'na' or '')
def is_missing_val(x):
    if pd.isna(x): return True
    s = str(x).strip().lower()
    return s in ("", "na", "n/a", "none", ".")
    
fill_count = 0
for i, row in df.loc[df.apply(lambda r: is_missing_val(r.get("ref")) or is_missing_val(r.get("alt")), axis=1)].iterrows():
    ref_from_name, alt_from_name = extract_ref_alt_from_name(row.get("name", ""))
    if ref_from_name and alt_from_name:
        df.at[i, "ref"] = ref_from_name
        df.at[i, "alt"] = alt_from_name
        fill_count += 1

print("Filled ref/alt from name for", fill_count, "rows (simple SNV cases).")

# Save stage1 cleaned
df.to_csv(proc_dir / "clinvar_stage1_cleaned.csv", index=False)
print("Saved clinvar_stage1_cleaned.csv")


Rows before dedup: 813563 -> after dedup: 809642
Rows after removing missing coords: 809642
Filled ref/alt from name for 490411 rows (simple SNV cases).
Saved clinvar_stage1_cleaned.csv


In [2]:
# Cell 3 (fixed) — parse `name` for c. and p. tokens and create `cdna` + `protein_change`
# Why: many ClinVar entries store the nucleotide/protein change inside the "name" field
# (e.g. "NM_017547.4(...):c.694C>T (p.Gln232Ter)"). We extract those tokens so later cells
# can parse amino-acid changes (BLOSUM/hydropathy/etc).

import re
import pandas as pd
from pathlib import Path


proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
infile = proc_dir / "clinvar_stage1_cleaned.csv"
if not infile.exists():
    raise FileNotFoundError(f"Input file not found: {infile}\nMake sure you ran the previous cells and saved clinvar_stage1_cleaned.csv")

# load (low_memory False avoids some warnings for mixed dtypes)
df = pd.read_csv(infile, low_memory=False)
print("Loaded:", infile)
print("Initial shape:", df.shape)
print("Columns available:", df.columns.tolist()[:30])

# parsing function: returns dict with cdna (c.xxx) and protein_change (p.xxx) or None
def parse_name_for_cdna_protein(name):
    # return dict to make conversion to DataFrame simpler
    if pd.isna(name) or str(name).strip() == "":
        return {"cdna": None, "protein_change": None}
    s = str(name)
    # cdna: look for substring starting with 'c.' up to whitespace or punctuation
    cdna_m = re.search(r"(c\.[A-Za-z0-9_+\->\*]+)", s)
    # protein: look for 'p.' token like p.Gln382Ter or p.R72H; we allow parentheses and punctuation
    prot_m = re.search(r"(p\.[A-Za-z0-9\*\._]+)", s)
    return {"cdna": cdna_m.group(1) if cdna_m else None,
            "protein_change": prot_m.group(1) if prot_m else None}

# Apply parsing to the 'name' column and expand results into new columns
parsed = df["name"].apply(parse_name_for_cdna_protein).apply(pd.Series)
df["cdna"] = parsed["cdna"]
df["protein_change"] = parsed["protein_change"]

# Diagnostics
print("Total rows:", len(df))
print("Non-null counts:")
print(df[["cdna","protein_change"]].notnull().sum())

# Show examples for visual check
display(df[["name","cdna","protein_change"]].head(12))

# Save the result to avoid re-running parsing later; overwrite is ok
outpath = proc_dir / "clinvar_stage2_parsed_name.csv"
df.to_csv(outpath, index=False)
print("Saved parsed file to:", outpath)


Loaded: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed/clinvar_stage1_cleaned.csv
Initial shape: (809642, 10)
Columns available: ['variationid', 'gene', 'chr', 'pos', 'ref', 'alt', 'name', 'clinicalsignificance', 'rsid', 'label']
Total rows: 809642
Non-null counts:
cdna              758597
protein_change    466458
dtype: int64


Unnamed: 0,name,cdna,protein_change
0,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),c.1413_1426del,p.Leu473fs
1,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),c.1413_1426del,p.Leu473fs
2,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),c.694C>T,p.Gln232Ter
3,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),c.694C>T,p.Gln232Ter
4,NM_000410.4(HFE):c.892+48G>A,c.892+48G>A,
5,NM_000410.4(HFE):c.892+48G>A,c.892+48G>A,
6,NM_000410.4(HFE):c.989G>T (p.Arg330Met),c.989G>T,p.Arg330Met
7,NM_000410.4(HFE):c.989G>T (p.Arg330Met),c.989G>T,p.Arg330Met
8,NM_020779.4(WDR35):c.25-2A>G,c.25-2A>G,
9,NM_020779.4(WDR35):c.25-2A>G,c.25-2A>G,


Saved parsed file to: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed/clinvar_stage2_parsed_name.csv


In [4]:
# Cell 4 — protein token -> ref_aa, pos_aa, alt_aa
import pandas as pd
import re
from pathlib import Path

three_to_one = {
    'Ala':'A','Arg':'R','Asn':'N','Asp':'D','Cys':'C',
    'Gln':'Q','Glu':'E','Gly':'G','His':'H','Ile':'I',
    'Leu':'L','Lys':'K','Met':'M','Phe':'F','Pro':'P',
    'Ser':'S','Thr':'T','Trp':'W','Tyr':'Y','Val':'V',
    'Ter':'*', 'Stop':'*'
}

proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
df = pd.read_csv(proc_dir / "clinvar_stage2_parsed_name.csv", low_memory=False)

def parse_protein_token(p):
    if pd.isna(p): return pd.Series({"ref_aa": None, "pos_aa": None, "alt_aa": None})
    s = str(p).strip()
    # try 3-letter pattern: p.Gln382Ter or p.Gln382Arg
    m = re.match(r"p\.([A-Za-z]{3})(\d+)([A-Za-z]{3}|\*)", s)
    if m:
        ref3, pos, alt3 = m.groups()
        ref1 = three_to_one.get(ref3.capitalize(), None)
        alt1 = three_to_one.get(alt3.capitalize(), None) if alt3 != "*" else "*"
        return pd.Series({"ref_aa": ref1, "pos_aa": int(pos), "alt_aa": alt1})
    # fallback 1-letter like p.R72H
    m2 = re.match(r"p\.([A-Za-z])(\d+)([A-Za-z]|\*)", s)
    if m2:
        ref1, pos, alt1 = m2.groups()
        return pd.Series({"ref_aa": ref1.upper(), "pos_aa": int(pos), "alt_aa": alt1.upper()})
    return pd.Series({"ref_aa": None, "pos_aa": None, "alt_aa": None})

parsed_aa = df["protein_change"].apply(parse_protein_token)
df["ref_aa"] = parsed_aa["ref_aa"]
df["pos_aa"] = parsed_aa["pos_aa"]
df["alt_aa"] = parsed_aa["alt_aa"]

print("AA parse counts (non-null):", df[["ref_aa","alt_aa"]].notnull().sum().to_dict())
df[["name","protein_change","ref_aa","pos_aa","alt_aa"]].head(8)

df.to_csv(proc_dir / "clinvar_stage2_aa.csv", index=False)
print("Saved clinvar_stage2_aa.csv")


AA parse counts (non-null): {'ref_aa': 206933, 'alt_aa': 203377}
Saved clinvar_stage2_aa.csv


In [5]:
# Cell 5 — compute blosum62_raw, hydropathy_diff, is_stop
import math
from pathlib import Path
import pandas as pd

proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
df = pd.read_csv(proc_dir / "clinvar_stage2_aa.csv", low_memory=False)

# hydropathy scale (Kyte-Doolittle)
hydro = {'A':1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C':2.5,'Q':-3.5,'E':-3.5,'G':-0.4,
         'H':-3.2,'I':4.5,'L':3.8,'K':-3.9,'M':1.9,'F':2.8,'P':-1.6,'S':-0.8,
         'T':-0.7,'W':-0.9,'Y':-1.3,'V':4.2,'*':0}

# BLOSUM
try:
    from Bio.SubsMat.MatrixInfo import blosum62
    def blosum_score(a,b):
        if a is None or b is None: return None
        if a == '*' or b == '*':  # treat stop specially
            return -4
        pair = (a,b)
        if pair in blosum62:
            return blosum62[pair]
        if (b,a) in blosum62:
            return blosum62[(b,a)]
        return -1
    print("Using Biopython blosum62.")
except Exception:
    # fallback simple scoring
    def blosum_score(a,b):
        if a is None or b is None: return None
        if a == b: return 4
        if a == '*' or b == '*': return -4
        return -1
    print("Biopython BLOSUM not found — using fallback scoring.")

def safe_hydro(a):
    if pd.isna(a): return None
    return hydro.get(str(a).upper(), None)

# compute columns
df["blosum62_raw"] = df.apply(lambda r: blosum_score(r["ref_aa"], r["alt_aa"]), axis=1)
df["hydropathy_diff"] = df.apply(lambda r: 
                                 (safe_hydro(r["alt_aa"]) - safe_hydro(r["ref_aa"])) 
                                 if (safe_hydro(r["alt_aa"]) is not None and safe_hydro(r["ref_aa"]) is not None) else None, axis=1)
df["is_stop"] = df["alt_aa"].apply(lambda x: 1 if str(x) == "*" else 0)

print("Computed feature sample counts:")
print(df[["blosum62_raw","hydropathy_diff","is_stop"]].notnull().sum())

df.to_csv(proc_dir / "clinvar_stage3_features_basic.csv", index=False)
print("Saved clinvar_stage3_features_basic.csv")


Biopython BLOSUM not found — using fallback scoring.
Computed feature sample counts:
blosum62_raw       809642
hydropathy_diff    203377
is_stop            809642
dtype: int64
Saved clinvar_stage3_features_basic.csv


In [18]:
# Cell 6 (fixed, robust, memory-friendly) — merge ExAC AF into your clinvar stage3 file
import sys
from pathlib import Path
import pandas as pd

proc_dir = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed")
raw_exac_path = Path("/mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/ExAC.r1.sites.vep.vcf")  # file or directory

# ---- sanity checks ----
if not proc_dir.exists():
    raise FileNotFoundError(f"Processed directory not found: {proc_dir}")

# auto-detect real VCF file if a directory was given
if raw_exac_path.is_dir():
    # prefer .vcf.gz, then .vcf
    candidates = sorted(raw_exac_path.glob("**/*.vcf*"))
    if not candidates:
        raise FileNotFoundError(f"No .vcf or .vcf.gz found inside directory: {raw_exac_path}")
    exac_path = candidates[0]
else:
    exac_path = raw_exac_path

print("Using ExAC path:", exac_path, "| exists:", exac_path.exists())

# import pysam (helpful error if not installed)
try:
    import pysam
except Exception as e:
    raise RuntimeError("pysam is required but not importable in this environment. Install pysam and re-run. Error: " + str(e))

# load the dataframe we want to enrich
ml_input = proc_dir / "clinvar_stage3_features_basic.csv"
if not ml_input.exists():
    raise FileNotFoundError(f"Expected input CSV not found: {ml_input}\nMake sure previous stage saved this file.")
df = pd.read_csv(ml_input, low_memory=False)
print("Loaded ML-stage df shape:", df.shape)

# ensure the columns exist
for c in ("chr","pos","ref","alt"):
    if c not in df.columns:
        raise KeyError(f"Required column missing from dataframe: {c}")

# prepare target keys (normalized)
def key_from_row(r):
    return (str(r["chr"]).replace("chr","").strip(), int(r["pos"]), str(r["ref"]), str(r["alt"]))

want_keys = set()
for idx, row in df.iterrows():
    try:
        want_keys.add(key_from_row(row))
    except Exception:
        continue
print("Unique variant keys to match:", len(want_keys))

# initialize allele_freq column
if "allele_freq" not in df.columns:
    df["allele_freq"] = pd.NA
else:
    df["allele_freq"] = df["allele_freq"].fillna(pd.NA)

# iterate VCF and only record AFs for keys we care about
af_map = {}
count_scanned = 0
count_matched = 0

open_path = str(exac_path)
try:
    vcf = pysam.VariantFile(open_path)
except Exception as e:
    raise RuntimeError(f"Failed to open VCF with pysam: {open_path}\nError: {e}")

for rec in vcf:
    count_scanned += 1
    chrom = rec.chrom.replace("chr","").strip()
    pos = rec.pos
    ref = rec.ref
    alts = rec.alts or []
    # AF in VCF INFO can be: AF (list) or missing. Try canonical 'AF' first.
    info_af = rec.info.get("AF") or rec.info.get("af") or None

    # if INFO AF not present, try common alternate fields (some VCFs use different tags)
    # we will treat absence like None and skip storing AF for those entries
    if info_af is None:
        # optionally try other fields e.g. 'gnomAD_AF' etc. (customize if needed)
        # info_af = rec.info.get("gnomad_AF") or rec.info.get("GNOMAD_AF")
        info_af = None

    # normal handling: zip alts with provided AFs (if AF is a single number for multiallelic, treat as list)
    if info_af is not None:
        # make it list-like
        if isinstance(info_af, (float, int)):
            af_list = [float(info_af)] + [None]*(len(alts)-1)
        else:
            af_list = list(info_af)
    else:
        af_list = [None]*len(alts)

    for alt, af_val in zip(alts, af_list):
        key = (chrom, pos, ref, alt)
        if key in want_keys:
            # convert to float if possible
            try:
                af_float = float(af_val) if af_val is not None else None
            except Exception:
                af_float = None
            af_map[key] = af_float
            count_matched += 1

# report
print(f"Scanned {count_scanned} VCF records. Matched AF entries: {len(af_map)}")

# fill dataframe from map (fast single-pass)
filled = 0
for i, row in df.iterrows():
    key = key_from_row(row)
    af = af_map.get(key, None)
    if af is not None:
        df.at[i, "allele_freq"] = af
        filled += 1

print("Filled allele_freq for", filled, "rows out of", len(df))

# save output
outpath = proc_dir / "clinvar_stage3_with_af.csv"
df.to_csv(outpath, index=False)
print("Saved merged file to:", outpath)


Using ExAC path: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/raw/ExAC.r1.sites.vep.vcf/ExAC.r1.sites.vep.vcf | exists: True
Loaded ML-stage df shape: (809642, 18)
Unique variant keys to match: 777857


[W::vcf_parse_info] INFO 'HOM_CONSANGUINEOUS' is not defined in the header, assuming Type=String
[W::vcf_parse_filter] FILTER 'AC_Adj0_Filter' is not defined in the header


Scanned 9362318 VCF records. Matched AF entries: 53468
Filled allele_freq for 53474 rows out of 809642
Saved merged file to: /mnt/c/Users/MYG-D02/variant-effect-prediction/data/processed/clinvar_stage3_with_af.csv
