In [2]:
# Cell 1 (fixed): parse name -> cdna, ref_parsed, alt_parsed, protein_change
import pandas as pd, re, os

proc_dir = r"C:\BLAST\variant-effect-prediction\data\processed"
in_path = os.path.join(proc_dir, "clinvar_subset_20000.csv")
out_path = os.path.join(proc_dir, "clinvar_subset_parsed.csv")

df = pd.read_csv(in_path, low_memory=False)
print("Loaded rows:", len(df))
print("Columns:", df.columns.tolist()[:30])

# safer parse function returning a dict so we can build a dataframe
def parse_name_row(name):
    if pd.isna(name):
        return {"cdna": None, "ref_parsed": None, "alt_parsed": None, "protein_change": None}
    # try to find c. notation with single-letter ref>alt (e.g. c.1144C>T)
    cdna_m = re.search(r"c\.([0-9_+\-]+)?([ACGT])>([ACGT])", str(name))
    # sometimes del/ins: c.1234del, c.123_125del, c.123_124ins... we ignore those for ref/alt extraction
    prot_m = re.search(r"p\.([A-Za-z]{3})(\d+)([A-Za-z]{3}|\*)", str(name))
    cdna = cdna_m.group(0) if cdna_m else None
    refp = cdna_m.group(2) if cdna_m else None
    altp = cdna_m.group(3) if cdna_m else None
    prot = prot_m.group(0) if prot_m else None
    return {"cdna": cdna, "ref_parsed": refp, "alt_parsed": altp, "protein_change": prot}

# apply parsing to all rows (fast-ish)
parsed_df = df["name"].apply(parse_name_row).apply(pd.Series)

# attach parsed columns to original df (do not use join to avoid column-name collision)
for c in parsed_df.columns:
    df[c] = parsed_df[c]

# quick diagnostics
print("\nParsed columns added. Sample counts (non-null):")
print(df[["cdna","ref_parsed","alt_parsed","protein_change"]].notnull().sum())

print("\nFirst 10 rows (name, cdna, ref_parsed, alt_parsed, protein_change, label):")
print(df[["name","cdna","ref_parsed","alt_parsed","protein_change","label"]].head(10).to_string(index=False))

# save parsed file
df.to_csv(out_path, index=False)
print("\nSaved parsed dataset to:", out_path)


Loaded rows: 20000
Columns: ['variationid', 'gene', 'chr', 'pos', 'stop', 'ref', 'alt', 'name', 'label']

Parsed columns added. Sample counts (non-null):
cdna              11973
ref_parsed        11973
alt_parsed        11973
protein_change     5175
dtype: int64

First 10 rows (name, cdna, ref_parsed, alt_parsed, protein_change, label):
                                          name       cdna ref_parsed alt_parsed protein_change      label
  NM_012144.4(DNAI1):c.484_485del (p.Asp162fs)       None       None       None           None pathogenic
     NM_000059.4(BRCA2):c.2870del (p.Asn957fs)       None       None       None           None pathogenic
  NM_177438.3(DICER1):c.4443G>A (p.Trp1481Ter)  c.4443G>A          G          A   p.Trp1481Ter pathogenic
    NM_018297.4(NGLY1):c.1512T>A (p.Tyr504Ter)  c.1512T>A          T          A    p.Tyr504Ter pathogenic
NM_053025.4(MYLK):c.3876_3937del (p.Ser1293fs)       None       None       None           None pathogenic
 NM_004447.6(EPS8):c.1802

In [4]:
# Cell 2: extract ref_aa, pos_aa, alt_aa from protein_change
three_to_one = {
    'Ala':'A','Arg':'R','Asn':'N','Asp':'D','Cys':'C',
    'Gln':'Q','Glu':'E','Gly':'G','His':'H','Ile':'I',
    'Leu':'L','Lys':'K','Met':'M','Phe':'F','Pro':'P',
    'Ser':'S','Thr':'T','Trp':'W','Tyr':'Y','Val':'V',
    'Ter':'*'
}
import re
def parse_protein_token(p):
    if pd.isna(p):
        return pd.Series({"ref_aa": None, "pos_aa": None, "alt_aa": None})
    s = str(p).strip()
    # 3-letter to 1-letter pattern: p.Gln382Ter or p.Gln382Arg
    m = re.match(r"p\.([A-Za-z]{3})(\d+)([A-Za-z]{3}|\*)", s)
    if m:
        ref3, pos, alt3 = m.groups()
        ref1 = three_to_one.get(ref3.capitalize(), None)
        alt1 = three_to_one.get(alt3.capitalize(), None) if alt3 != "*" else "*"
        return pd.Series({"ref_aa": ref1, "pos_aa": int(pos), "alt_aa": alt1})
    # fallback: 1-letter pattern like p.R72H
    m2 = re.match(r"p\.([A-Za-z])(\d+)([A-Za-z]|\*)", s)
    if m2:
        ref1, pos, alt1 = m2.groups()
        return pd.Series({"ref_aa": ref1.upper(), "pos_aa": int(pos), "alt_aa": alt1.upper()})
    return pd.Series({"ref_aa": None, "pos_aa": None, "alt_aa": None})

# apply to the parsed file (reload if needed)
parsed_path = out_path  # from previous cell
dfp = pd.read_csv(parsed_path, low_memory=False)
df_parsed_prot = dfp["protein_change"].apply(parse_protein_token)
dfp["ref_aa"] = df_parsed_prot["ref_aa"]
dfp["pos_aa"] = df_parsed_prot["pos_aa"]
dfp["alt_aa"] = df_parsed_prot["alt_aa"]

print("AA parse counts (non-null):", dfp[["ref_aa","alt_aa"]].notnull().sum().to_dict())
display(dfp[["name","protein_change","ref_aa","pos_aa","alt_aa","label"]].head(10))

# save
out2 = os.path.join(proc_dir, "clinvar_subset_parsed_aa.csv")
dfp.to_csv(out2, index=False)
print("Saved with AA columns to:", out2)


AA parse counts (non-null): {'ref_aa': 5175, 'alt_aa': 5098}


Unnamed: 0,name,protein_change,ref_aa,pos_aa,alt_aa,label
0,NM_012144.4(DNAI1):c.484_485del (p.Asp162fs),,,,,pathogenic
1,NM_000059.4(BRCA2):c.2870del (p.Asn957fs),,,,,pathogenic
2,NM_177438.3(DICER1):c.4443G>A (p.Trp1481Ter),p.Trp1481Ter,W,1481.0,*,pathogenic
3,NM_018297.4(NGLY1):c.1512T>A (p.Tyr504Ter),p.Tyr504Ter,Y,504.0,*,pathogenic
4,NM_053025.4(MYLK):c.3876_3937del (p.Ser1293fs),,,,,pathogenic
5,NM_004447.6(EPS8):c.1802_1817del (p.Pro601fs),,,,,pathogenic
6,NM_000019.4(ACAT1):c.789del (p.Val264fs),,,,,pathogenic
7,NM_022132.5(MCCC2):c.512-1G>A,,,,,pathogenic
8,NM_020975.6(RET):c.1144C>T (p.Gln382Ter),p.Gln382Ter,Q,382.0,*,pathogenic
9,NM_000448.3(RAG1):c.2882_2891del (p.Ser961fs),,,,,pathogenic


Saved with AA columns to: C:\BLAST\variant-effect-prediction\data\processed\clinvar_subset_parsed_aa.csv
