In [2]:
import pandas as pd
from pathlib import Path

# Resolve project root (repo root)
PROJECT_ROOT = Path.cwd().resolve().parent

# Processed data directory
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed" / "clinvar_clean"

# Input CSV path
DF_PATH = PROCESSED_DIR / "clinvar_balanced_benign_pathogenic_vus.csv"

# Sanity check
assert DF_PATH.exists(), f"Processed ClinVar CSV not found at {DF_PATH}"

# Load dataframe
df_balanced = pd.read_csv(DF_PATH)

print("Loaded df_balanced with shape:", df_balanced.shape)
df_balanced.head()


(313748, 11)


Unnamed: 0,chrom,pos,ref,alt,clnsig,review_status,molecular_consequence,geneinfo,disease_name,clinical_group,functional_class
0,chr1,1509223,G,A,Likely_benign,"criteria_provided,_single_submitter",SO:0002152|genic_downstream_transcript_variant,ATAD3A:55210|ATAD3B:83858,not_provided,benign,downstream
1,chr14,64782881,G,A,Benign,"criteria_provided,_single_submitter",SO:0001627|intron_variant,SPTB:6710,not_provided,benign,intronic
2,chr9,110695387,T,G,Likely_benign,"criteria_provided,_single_submitter",SO:0001627|intron_variant,MUSK:4593,Congenital_myasthenic_syndrome_9|Fetal_akinesi...,benign,intronic
3,chr10,104041361,G,C,Likely_benign,"criteria_provided,_single_submitter",SO:0001627|intron_variant,COL17A1:1308,not_provided,benign,intronic
4,chr19,50409251,C,T,Benign/Likely_benign,"criteria_provided,_multiple_submitters,_no_con...",SO:0001627|intron_variant,POLD1:5424,"not_specified|Colorectal_cancer,_susceptibilit...",benign,intronic


In [3]:
import pandas as pd
from pathlib import Path

# Resolve project root (repo root)
PROJECT_ROOT = Path.cwd().resolve().parent

# Genome FASTA path
GENOME_DIR = PROJECT_ROOT / "data" / "raw" / "genome"
FASTA_PATH = GENOME_DIR / "hg38.fa"

# Sanity check
assert FASTA_PATH.exists(), f"Genome FASTA not found at {FASTA_PATH}"

# Define flank sizes (flank=50 -> 101bp total, flank=500 -> 1001bp total)
FLANK_SIZES = {
    "small": 50,
    "medium": 250,
    "large": 500,  # requested
}

# Load genome
try:
    import pysam
    fasta = pysam.FastaFile(str(FASTA_PATH))
except ImportError as e:
    raise ImportError(
        "pysam is not installed. Install with: pip install pysam"
    ) from e

print("Loaded genome FASTA:", FASTA_PATH)

def fetch_ref_alt_seq(chrom, pos, ref, alt, flank):
    # Normalize inputs
    chrom = str(chrom)
    pos = int(pos)               # VCF-style 1-based
    ref = str(ref).upper()
    alt = str(alt).upper()

    # SNVs only
    if len(ref) != 1 or len(alt) != 1:
        return (None, None, "not_snv")

    # Convert to 0-based coordinates for fasta.fetch (end is exclusive)
    start = pos - 1 - flank
    end = pos - 1 + flank + 1

    # Prevent negative indexing
    if start < 0:
        return (None, None, "out_of_bounds")

    # Fetch sequence window
    try:
        seq = fasta.fetch(chrom, start, end).upper()
    except Exception:
        return (None, None, "bad_chrom")

    # Validate length
    if len(seq) != (2 * flank + 1):
        return (None, None, "out_of_bounds")

    # Validate reference allele matches genome
    center = flank
    if seq[center] != ref:
        return (None, None, "ref_mismatch")

    # Create ref/alt sequences
    seq_ref = seq
    seq_alt = seq[:center] + alt + seq[center + 1:]

    return (seq_ref, seq_alt, "ok")


# Build 3 dataframes with different sequence lengths
dfs_by_size = {}

for label, flank in FLANK_SIZES.items():
    # Build sequences for this flank size
    results = [
        fetch_ref_alt_seq(row.chrom, row.pos, row.ref, row.alt, flank=flank)
        for row in df_balanced.itertuples(index=False)
    ]

    # Copy and attach sequence columns
    df_tmp = df_balanced.copy()
    df_tmp["seq_ref"] = [r[0] for r in results]
    df_tmp["seq_alt"] = [r[1] for r in results]
    df_tmp["seq_status"] = [r[2] for r in results]

    # Print status counts
    print(f"\n{label.upper()} (flank={flank}, total_bp={2*flank+1}) status:")
    print(df_tmp["seq_status"].value_counts())

    # Keep only valid sequences
    df_ok = df_tmp[df_tmp["seq_status"] == "ok"].copy()
    print(f"Kept variants with valid sequences: {df_ok.shape[0]}")

    # Store
    dfs_by_size[label] = df_ok


# Final named outputs (3 distinct dataframes)
df_small = dfs_by_size["small"]
df_medium = dfs_by_size["medium"]
df_large = dfs_by_size["large"]

df_small.head()



SMALL (flank=50, total_bp=101) status:
seq_status
ok           311007
bad_chrom      2741
Name: count, dtype: int64
Kept variants with valid sequences: 311007

MEDIUM (flank=250, total_bp=501) status:
seq_status
ok               311007
bad_chrom          2740
out_of_bounds         1
Name: count, dtype: int64
Kept variants with valid sequences: 311007

LARGE (flank=500, total_bp=1001) status:
seq_status
ok               311007
bad_chrom          2740
out_of_bounds         1
Name: count, dtype: int64
Kept variants with valid sequences: 311007


Unnamed: 0,chrom,pos,ref,alt,clnsig,review_status,molecular_consequence,geneinfo,disease_name,clinical_group,functional_class,seq_ref,seq_alt,seq_status
0,chr1,1509223,G,A,Likely_benign,"criteria_provided,_single_submitter",SO:0002152|genic_downstream_transcript_variant,ATAD3A:55210|ATAD3B:83858,not_provided,benign,downstream,CGGCCTCCCTCAGCTCCCTCTCTCCCCACTAGGCCACGGCGTATGC...,CGGCCTCCCTCAGCTCCCTCTCTCCCCACTAGGCCACGGCGTATGC...,ok
1,chr14,64782881,G,A,Benign,"criteria_provided,_single_submitter",SO:0001627|intron_variant,SPTB:6710,not_provided,benign,intronic,AAAAGCCTGTGCATAATTTTCATGATATGCACCACCACAGATAAGG...,AAAAGCCTGTGCATAATTTTCATGATATGCACCACCACAGATAAGG...,ok
2,chr9,110695387,T,G,Likely_benign,"criteria_provided,_single_submitter",SO:0001627|intron_variant,MUSK:4593,Congenital_myasthenic_syndrome_9|Fetal_akinesi...,benign,intronic,GTTAGAAACTCTAGGTTTAATAAAGCCATATTGCCTTATTTATTTT...,GTTAGAAACTCTAGGTTTAATAAAGCCATATTGCCTTATTTATTTT...,ok
3,chr10,104041361,G,C,Likely_benign,"criteria_provided,_single_submitter",SO:0001627|intron_variant,COL17A1:1308,not_provided,benign,intronic,CCTCGGGGTCCTGGTGGGCCTGGAATGGAAGGCCCTGCAGAAGAGA...,CCTCGGGGTCCTGGTGGGCCTGGAATGGAAGGCCCTGCAGAAGAGA...,ok
4,chr19,50409251,C,T,Benign/Likely_benign,"criteria_provided,_multiple_submitters,_no_con...",SO:0001627|intron_variant,POLD1:5424,"not_specified|Colorectal_cancer,_susceptibilit...",benign,intronic,CAGATCCTGGAGAACCTGCTCAGTGCCCGGAAGAGGTGAGCCCTGG...,CAGATCCTGGAGAACCTGCTCAGTGCCCGGAAGAGGTGAGCCCTGG...,ok


In [5]:
from pathlib import Path

# Resolve project root (repo root)
PROJECT_ROOT = Path.cwd().resolve().parent

# Output directory for model inputs
OUT_DIR = PROJECT_ROOT / "data" / "processed" / "model_input"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Filenames
df_small_path = OUT_DIR / "clinvar_balanced_seq_101bp.csv"
df_medium_path = OUT_DIR / "clinvar_balanced_seq_501bp.csv"
df_large_path = OUT_DIR / "clinvar_balanced_seq_1001bp.csv"

# Save CSVs
df_small.to_csv(df_small_path, index=False)
df_medium.to_csv(df_medium_path, index=False)
df_large.to_csv(df_large_path, index=False)

# Confirmation
print("Saved files:")
print(df_small_path)
print(df_medium_path)
print(df_large_path)


Saved files:
/Users/rm1406/Desktop/resume/2026/PMCC/tech_test/notebooks/clinvar_balanced_seq_101bp.csv
/Users/rm1406/Desktop/resume/2026/PMCC/tech_test/notebooks/clinvar_balanced_seq_501bp.csv
/Users/rm1406/Desktop/resume/2026/PMCC/tech_test/notebooks/clinvar_balanced_seq_1001bp.csv
