In [1]:
# Cell A: list files in your notebooks\data folder
import os
project_dir = r"C:\BLAST\variant-effect-prediction"
alt_data_dir = os.path.join(project_dir, "notebooks", "data")

print("Checking folder:", alt_data_dir)
# Fail with a readable message rather than letting os.listdir raise later.
if os.path.exists(alt_data_dir) is False:
    raise FileNotFoundError(f"Folder not found: {alt_data_dir}")

# Directory entries (files AND sub-folders), alphabetically ordered.
files = os.listdir(alt_data_dir)
files.sort()
print(f"Found {len(files)} files:")
for f in files:
    print(" -", f)

# Substrings that make an entry name look like a ClinVar / variant export.
# Matching is case-insensitive and anywhere in the name (not only the suffix).
name_hints = (
    "variant_summary",
    "clinvar",
    "variant",
    "clinvar_variant",
    "clinvar_summary",
    ".vcf",
    ".txt",
    ".csv",
    ".gz",
)
candidates = list(
    filter(lambda name: any(hint in name.lower() for hint in name_hints), files)
)
print("\nPossible ClinVar/variant-like files:")
for f in candidates:
    print(" *", f)


Checking folder: C:\BLAST\variant-effect-prediction\notebooks\data
Found 11 files:
 - .ipynb_checkpoints
 - Unconfirmed 243949.crdownload
 - clinvar.vcf
 - clinvar.vcf.gz
 - clinvar.vcf.gz.tbi
 - clinvar_features.csv
 - clinvar_labeled.csv
 - clinvar_ml_ready.csv
 - data
 - variant_summary.txt
 - variant_summary.txt.gz

Possible ClinVar/variant-like files:
 * clinvar.vcf
 * clinvar.vcf.gz
 * clinvar.vcf.gz.tbi
 * clinvar_features.csv
 * clinvar_labeled.csv
 * clinvar_ml_ready.csv
 * variant_summary.txt
 * variant_summary.txt.gz


In [None]:
# Debug + robust runner for week2_prepare_subset
import os, shutil, gzip, sys
import pandas as pd
from pathlib import Path
import traceback

project_dir = r"C:\BLAST\variant-effect-prediction"
alt_data_dir = os.path.join(project_dir, "notebooks", "data")
raw_dir = os.path.join(project_dir, "data", "raw")
proc_dir = os.path.join(project_dir, "data", "processed")

print("PROJECT DIR:", project_dir)
print("ALT DATA DIR:", alt_data_dir)
print("RAW DIR:", raw_dir)
print("PROCESSED DIR:", proc_dir)

# ensure canonical folders exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(proc_dir, exist_ok=True)

# 1) list alt data dir contents (the place you said the file is)
print("\n--- Listing ALT_DATA_DIR ---")
if os.path.isdir(alt_data_dir):
    alt_files = sorted(os.listdir(alt_data_dir))
    print(f"{len(alt_files)} files found in {alt_data_dir}:")
    for f in alt_files:
        p = os.path.join(alt_data_dir, f)
        size = os.path.getsize(p)
        print(f" - {f}   ({size} bytes)")
else:
    print(f"ALT data dir does NOT exist: {alt_data_dir}")
    alt_files = []

# 2) list canonical raw dir contents
print("\n--- Listing RAW_DIR ---")
raw_files = sorted(os.listdir(raw_dir)) if os.path.exists(raw_dir) else []
print(f"{len(raw_files)} files in {raw_dir}: {raw_files}")

# 3) list processed dir contents
print("\n--- Listing PROCESSED_DIR ---")
proc_files = sorted(os.listdir(proc_dir)) if os.path.exists(proc_dir) else []
print(f"{len(proc_files)} files in {proc_dir}: {proc_files}")


def _is_usable_data_file(name):
    """Return True if `name` (an entry of alt_data_dir) can actually be loaded.

    Rejects directories, zero-byte placeholders, tabix indexes (``.tbi``) and
    unfinished browser downloads (``.crdownload``); none of these can be parsed
    as a table, so they must never win the auto-detection below.
    """
    if name.lower().endswith((".tbi", ".crdownload")):
        return False
    path = os.path.join(alt_data_dir, name)
    return os.path.isfile(path) and os.path.getsize(path) > 0


# 4) try to auto-detect a clinvar-like file in alt_data_dir
priority = ["variant_summary.txt.gz", "variant_summary.txt", "clinvar.vcf.gz", "clinvar.vcf",
            "clinvar_features.csv", "clinvar_labeled.csv", "clinvar_ml_ready.csv"]
name_hints = ["variant_summary", "clinvar", "variant", ".vcf", ".txt", ".csv", ".gz"]
usable_files = [f for f in alt_files if _is_usable_data_file(f)]

# pick exact priority match first; otherwise the first usable entry whose name
# contains one of the hints (case-insensitive); None when nothing qualifies
chosen = next((name for name in priority if name in usable_files), None)
if chosen is None:
    chosen = next(
        (f for f in usable_files if any(k in f.lower() for k in name_hints)),
        None,
    )

print("\nAuto-detected candidate:", chosen)

if chosen is None:
    print("\nNo candidate found in alt_data_dir. Please check where you downloaded the ClinVar file.")
    print("If you downloaded via browser, it may be in Downloads. Move the file into:")
    print("  ", alt_data_dir)
else:
    src_path = os.path.join(alt_data_dir, chosen)
    print("Source path:", src_path)
    # 5) attempt to copy to raw_dir (so code uses canonical raw path)
    dst_path = os.path.join(raw_dir, chosen)
    try:
        # Re-copy when the existing raw copy differs in size: an interrupted
        # earlier copy or a 0-byte placeholder would otherwise be reused forever.
        needs_copy = (
            not os.path.exists(dst_path)
            or os.path.getsize(dst_path) != os.path.getsize(src_path)
        )
        if needs_copy:
            shutil.copy2(src_path, dst_path)
            print("Copied to raw dir:", dst_path)
        else:
            print("File already exists in raw dir:", dst_path)
    except Exception:
        # Best effort: a failed copy must not stop the run, read in place instead.
        print("Could not copy file to raw dir. Error:")
        traceback.print_exc()
        dst_path = src_path
        print("Will attempt to read from source path:", dst_path)

    # 6) try to load the file robustly and create subset
    def load_clinvar_table(path):
        """Load a ClinVar export from `path` into a pandas DataFrame.

        The parser is chosen from the case-insensitive file extension:

        * ``.vcf`` / ``.vcf.gz`` -- VCF: the leading ``#``-prefixed meta lines are
          skipped and the ``#CHROM`` line supplies the column names (leading
          ``#`` removed). Without a ``#CHROM`` line the columns are numbered.
        * any other ``.gz``      -- gzipped tab-delimited text.
        * ``.txt`` / ``.tsv``    -- plain tab-delimited text.
        * ``.csv``               -- comma-separated text.
        * anything else          -- try gzipped TSV first, then plain TSV.

        Parameters
        ----------
        path : str
            File to read.

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        Exception
            Whatever the underlying reader raised; the path and traceback are
            printed before the exception is re-raised.
        """
        import csv  # local import: only needed for the VCF branch

        low = path.lower()
        try:
            # VCF has to be tested BEFORE the generic ".gz" rule, otherwise
            # "clinvar.vcf.gz" is parsed as a headed TSV and chokes on "##" lines.
            if low.endswith((".vcf", ".vcf.gz")):
                print(" -> attempting to read VCF-like file with pandas (skip meta lines)")
                is_gz = low.endswith(".gz")
                opener = gzip.open if is_gz else open
                lines_before_header = 0
                has_header = False
                with opener(path, "rt", encoding="utf-8", errors="replace") as fh:
                    for line in fh:
                        if line.startswith("#CHROM"):
                            has_header = True
                            break
                        if not line.startswith("#"):
                            break  # reached data without ever seeing #CHROM
                        lines_before_header += 1
                # skiprows (not comment="#") is used on purpose: comment="#"
                # would also cut every data line at the first '#' it contains.
                # QUOTE_NONE keeps quotes inside meta/INFO text from being
                # interpreted as field delimiters.
                vcf = pd.read_csv(
                    path,
                    sep="\t",
                    compression="gzip" if is_gz else None,
                    skiprows=lines_before_header,
                    header=0 if has_header else None,
                    quoting=csv.QUOTE_NONE,
                    low_memory=False,
                )
                if has_header:
                    vcf.columns = [str(c).lstrip("#") for c in vcf.columns]
                return vcf
            if low.endswith(".gz"):
                print(" -> reading gzipped tab-delimited via pandas")
                return pd.read_csv(path, sep="\t", compression="gzip", low_memory=False)
            if low.endswith((".txt", ".tsv")):
                print(" -> reading plain TSV")
                return pd.read_csv(path, sep="\t", low_memory=False)
            if low.endswith(".csv"):
                print(" -> reading CSV")
                return pd.read_csv(path, low_memory=False)
            # fallback attempts for unknown extensions
            try:
                print(" -> fallback: try gzip read")
                return pd.read_csv(path, sep="\t", compression="gzip", low_memory=False)
            except Exception:
                print(" -> fallback: try plain tab-delimited read")
                return pd.read_csv(path, sep="\t", low_memory=False)
        except Exception:
            print("Error when attempting to load file:", path)
            traceback.print_exc()
            raise

    try:
        print("\nAttempting to load the candidate file (this may take some seconds)...")
        df = load_clinvar_table(dst_path)
        print("Loaded dataframe shape:", df.shape)
        print("First columns (up to 40):", df.columns.tolist()[:40])

        # Normalize column names: trimmed, lower-case, and without a leading '#'.
        # A header line that starts with '#' (e.g. "#AlleleID") would otherwise
        # never match the lower-case names in `preferred` below.
        df.columns = [str(c).strip().lstrip("#").lower() for c in df.columns]

        # try to find clinical significance column
        cln_candidates = [c for c in df.columns if "clinsig" in c or "clinical" in c or "significance" in c]
        print("Possible clinical significance columns:", cln_candidates[:10])
        if not cln_candidates:
            print("No clinical significance-like column found. Here are the column names:")
            print(df.columns.tolist())
            raise ValueError("Cannot detect clinical significance column. Inspect columns above.")
        cln_col = cln_candidates[0]
        print("Using clinical significance column:", cln_col)

        # create simple CLNSIG: first ';'-separated token of the raw value
        df["clnsig_simple"] = df[cln_col].astype(str).str.split(";").str[0].str.strip()
        mask = df["clnsig_simple"].str.lower().isin(["pathogenic", "benign"])
        df_filtered = df[mask].copy()
        print("Filtered to Pathogenic/Benign rows:", len(df_filtered))
        if df_filtered.empty:
            # Same failure the chunked cell reports; avoids writing clinvar_subset_0.csv.
            raise ValueError("No Pathogenic/Benign rows found. Check clinical significance column and file contents.")

        # try SNV filter via ref/alt-like columns
        ref_col = next((c for c in df_filtered.columns if c in ["ref", "referenceallele", "reference_allele"]), None)
        alt_col = next((c for c in df_filtered.columns if c in ["alt", "alternateallele", "alternate_allele"]), None)
        print("Detected REF col:", ref_col, "ALT col:", alt_col)
        if ref_col and alt_col:
            # NOTE(review): this keeps rows whose allele strings are exactly one
            # character long. If the file stores placeholders such as "na" in
            # these columns, every row is dropped -- confirm against the data.
            before = len(df_filtered)
            single_base = (
                (df_filtered[ref_col].astype(str).str.len() == 1)
                & (df_filtered[alt_col].astype(str).str.len() == 1)
            )
            df_filtered = df_filtered[single_base]
            print(f"After simple SNV filter: {before} -> {len(df_filtered)}")
        else:
            print("Skipping SNV length filter (REF/ALT not both found).")

        # select preferred columns
        preferred = ["alleleid", "genesymbol", "gene", "clnsig", "clnsig_simple", "clinicalsignificance",
                     "type", "name", "chr", "chrom", "start", "position", "ref", "alt",
                     "referenceallele", "alternateallele", "rsid", "variationid", "reviewstatus"]
        keep = [c for c in preferred if c in df_filtered.columns]
        if not keep:
            keep = df_filtered.columns.tolist()[:12]
            print("Preferred columns not found; defaulting to first 12 columns:", keep)
        else:
            print("Keeping columns:", keep)

        subset = df_filtered[keep].reset_index(drop=True)

        # cap the output size; fixed seed keeps the sample reproducible
        n = 10000
        if len(subset) > n:
            subset = subset.sample(n, random_state=42).reset_index(drop=True)
            print(f"Sampled down to {n}")

        out_name = f"clinvar_subset_{len(subset)}.csv"
        out_path = os.path.join(proc_dir, out_name)
        subset.to_csv(out_path, index=False)
        print("Saved processed subset to:", out_path)

        print("\nPreview:")
        display(subset.head())

    except Exception:
        # Debug cell: report the failure in full but do not abort the notebook.
        print("\nFAILED to create processed subset. Error details:")
        traceback.print_exc()
        print("\nPlease paste the entire output of this cell here so I can diagnose.")



PROJECT DIR: C:\BLAST\variant-effect-prediction
ALT DATA DIR: C:\BLAST\variant-effect-prediction\notebooks\data
RAW DIR: C:\BLAST\variant-effect-prediction\data\raw
PROCESSED DIR: C:\BLAST\variant-effect-prediction\data\processed

--- Listing ALT_DATA_DIR ---
11 files found in C:\BLAST\variant-effect-prediction\notebooks\data:
 - .ipynb_checkpoints   (0 bytes)
 - Unconfirmed 243949.crdownload   (395037542 bytes)
 - clinvar.vcf   (0 bytes)
 - clinvar.vcf.gz   (174846686 bytes)
 - clinvar.vcf.gz.tbi   (571720 bytes)
 - clinvar_features.csv   (69043392 bytes)
 - clinvar_labeled.csv   (373504980 bytes)
 - clinvar_ml_ready.csv   (126205240 bytes)
 - data   (0 bytes)
 - variant_summary.txt   (0 bytes)
 - variant_summary.txt.gz   (395037542 bytes)

--- Listing RAW_DIR ---
2 files in C:\BLAST\variant-effect-prediction\data\raw: ['variant_summary.txt', 'variant_summary.txt.gz']

--- Listing PROCESSED_DIR ---
0 files in C:\BLAST\variant-effect-prediction\data\processed: []

Auto-detected candida

In [None]:
# Chunked processing: extract Pathogenic / Benign rows and create a balanced subset
import os, gzip, shutil, math
import pandas as pd

project_dir = r"C:\BLAST\variant-effect-prediction"
alt_data_dir = os.path.join(project_dir, "notebooks", "data")
raw_dir = os.path.join(project_dir, "data", "raw")
proc_dir = os.path.join(project_dir, "data", "processed")
os.makedirs(proc_dir, exist_ok=True)

gz_path = os.path.join(raw_dir, "variant_summary.txt.gz")
if not os.path.exists(gz_path):
    # fallback to alt location
    alt_candidate = os.path.join(alt_data_dir, "variant_summary.txt.gz")
    if os.path.exists(alt_candidate):
        gz_path = alt_candidate
    else:
        raise FileNotFoundError(f"Could not find variant_summary.txt.gz in {raw_dir} or {alt_data_dir}.")

print("Reading from:", gz_path)
chunksize = 100_000  # adjust if your memory is limited
temp_out = os.path.join(proc_dir, "clinvar_filtered_temp.csv")

# remove temp if exists (rows are appended below, so stale content must go)
if os.path.exists(temp_out):
    os.remove(temp_out)

# columns to keep when present (names are compared AFTER normalization)
preferred = ["alleleid", "genesymbol", "gene", "clnsig", "clnsig_simple", "clinicalsignificance", "type", "name",
             "chr", "chrom", "start", "position", "ref", "alt", "referenceallele", "alternateallele",
             "rsid", "variationid", "reviewstatus"]

cln_col = None  # detected from the first chunk, then reused
rows_written = 0
total_candidates = 0

for chunk_i, chunk in enumerate(pd.read_csv(gz_path, sep="\t", compression="gzip", chunksize=chunksize, low_memory=False)):
    # Normalize headers (trim, drop leading '#', lower-case). Without this the
    # lower-case `preferred` names match nothing but the added "clnsig_simple",
    # and the output would contain the label column only.
    chunk.columns = [str(c).strip().lstrip("#").lower() for c in chunk.columns]

    if cln_col is None:
        cln_col = next(
            (c for c in chunk.columns
             if ("clinsig" in c) or ("clinical" in c and "sign" in c) or ("significance" in c)),
            None,
        )
        if cln_col is None:
            # fallback: print columns and raise error
            print("Could not detect clinical significance column from first chunk. Columns were:")
            print(chunk.columns.tolist())
            raise ValueError("No clinical significance column detected.")
        print("Detected clinical-significance column:", cln_col)

    # create normalized clnsig_simple: first ';'-separated token of the raw value
    chunk["clnsig_simple"] = chunk[cln_col].astype(str).str.split(";").str[0].str.strip()
    mask = chunk["clnsig_simple"].str.lower().isin(["pathogenic", "benign"])
    df_sel = chunk.loc[mask]
    total_candidates += len(df_sel)

    if len(df_sel) == 0:
        if (chunk_i % 5) == 0:
            print(f"Chunk {chunk_i}: no matching rows")
        continue

    keep = [c for c in preferred if c in df_sel.columns]
    if not keep:
        keep = df_sel.columns.tolist()  # keep all if none matched
    df_out = df_sel[keep]

    # append to temp CSV; only the very first write carries the header row
    df_out.to_csv(temp_out, mode="a", index=False, header=(rows_written == 0))
    rows_written += len(df_out)

    if (chunk_i % 5) == 0:
        print(f"Processed chunk {chunk_i}, matched rows added: {len(df_out)}, total so far: {rows_written}")

print(f"\nFinished chunked scan. Total matching rows found: {rows_written} (saved to temp: {temp_out})")

if rows_written == 0:
    raise ValueError("No Pathogenic/Benign rows found. Check clinical significance column and file contents.")

# Now load the temp file (it should be much smaller) and create a balanced sample up to N_total
df_all = pd.read_csv(temp_out, low_memory=False)
print("Filtered table shape:", df_all.shape)
print("Label counts (raw):")
print(df_all["clnsig_simple"].value_counts())

# split once by label instead of re-filtering for every count
label_lower = df_all["clnsig_simple"].str.lower()
df_path = df_all[label_lower == "pathogenic"]
df_ben = df_all[label_lower == "benign"]

# target total rows
N_total = 10000
# Aim for a 50/50 split; when one class is short, the other fills the gap:
#   1) pathogenic gets at most half,
#   2) benign gets whatever is left of the budget,
#   3) if benign could not use its share, pathogenic is topped up.
n_path = min(len(df_path), N_total // 2)
n_ben = min(len(df_ben), N_total - n_path)
n_path = min(len(df_path), N_total - n_ben)

print(f"Sampling targets -> Pathogenic: {n_path}, Benign: {n_ben} (total target {n_path+n_ben})")

# sample without replacement (if available), else take all
df_path_s = df_path.sample(n=n_path, random_state=42) if len(df_path) > n_path else df_path
df_ben_s = df_ben.sample(n=n_ben, random_state=42) if len(df_ben) > n_ben else df_ben

subset = pd.concat([df_path_s, df_ben_s]).reset_index(drop=True)
print("Final subset shape:", subset.shape)
out_name = f"clinvar_subset_{len(subset)}.csv"
out_path = os.path.join(proc_dir, out_name)
subset.to_csv(out_path, index=False)
print("Saved final subset to:", out_path)

# cleanup temp if you want (uncomment to remove)
# os.remove(temp_out)


In [None]:
import os, pandas as pd

project_dir = r"C:\BLAST\variant-effect-prediction"
raw_dir = os.path.join(project_dir, "data", "raw")
proc_dir = os.path.join(project_dir, "data", "processed")
os.makedirs(proc_dir, exist_ok=True)

raw_path = os.path.join(raw_dir, "variant_summary.txt.gz")
if not os.path.exists(raw_path):
    raise FileNotFoundError(f"Could not find variant_summary.txt.gz in {raw_dir}.")
print("Reading from:", raw_path)

chunksize = 500_000
matched_rows = []
cln_col = None  # detected on the first chunk, then reused

for i, chunk in enumerate(pd.read_csv(raw_path, sep="\t", compression="gzip", low_memory=False, chunksize=chunksize)):
    if i == 0:
        print("Columns:", list(chunk.columns)[:20])
    # Normalize column names: trimmed, lower-case, no leading '#', so that a
    # header such as "#AlleleID" can match "alleleid" in `preferred`.
    chunk.columns = [str(c).strip().lstrip("#").lower() for c in chunk.columns]

    if cln_col is None:
        # detect clin sig column; fail loudly instead of skipping every chunk
        cln_candidates = [c for c in chunk.columns if "significance" in c or "clinsig" in c]
        if not cln_candidates:
            print("Could not detect clinical significance column. Columns were:")
            print(chunk.columns.tolist())
            raise ValueError("No clinical significance column detected.")
        cln_col = cln_candidates[0]

    # Simplify clin sig (first ';'-separated token) and keep only rows that are
    # exactly benign / pathogenic. Filtering here, rather than after concat,
    # keeps the accumulated rows small.
    chunk["clnsig_simple"] = chunk[cln_col].astype(str).str.split(";").str[0].str.strip()
    sub = chunk[chunk["clnsig_simple"].str.lower().isin(["benign", "pathogenic"])]
    if not sub.empty:
        matched_rows.append(sub)
    print(f"Chunk {i}: matched {len(sub)}")

if not matched_rows:
    # pd.concat([]) would raise an unhelpful "No objects to concatenate"
    raise ValueError("No Pathogenic/Benign rows found. Check clinical significance column and file contents.")

df_filtered = pd.concat(matched_rows, ignore_index=True)
print("Filtered:", len(df_filtered))

# Pick essential columns
preferred = ["alleleid", "genesymbol", "gene", "clnsig_simple", "clinicalsignificance",
             "name", "ref", "alt", "rsid", "variationid", "reviewstatus"]
keep = [c for c in preferred if c in df_filtered.columns]
if not keep:
    keep = df_filtered.columns.tolist()  # keep all if none matched
subset = df_filtered[keep].reset_index(drop=True)

# Downsample (fixed seed keeps the sample reproducible)
N_total = 10_000  # adjust here (e.g., 50000 if you want even more)
if len(subset) > N_total:
    subset = subset.sample(N_total, random_state=42).reset_index(drop=True)
print("Subset shape:", subset.shape)

out_path = os.path.join(proc_dir, f"clinvar_subset_{len(subset)}.csv")
subset.to_csv(out_path, index=False)
print("Saved:", out_path)
display(subset.head())

