# Convert ClinVar TSV/VCF → CSV


**Output columns:** `chrom,pos,ref,alt,gene_symbol,clinical_significance,hgvs_p`

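- Keeps only variants labelled **Pathogenic** or **Benign**. With `COLLAPSE_LIKELY = True`, `Likely_pathogenic` / `Likely_benign` are folded into those two classes; otherwise they are dropped.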
- For VCF reading, install `cyvcf2`.


In [1]:

from pathlib import Path
import pandas as pd, numpy as np
import re

# Source file to convert: a tab-separated export (.tsv/.txt) or a VCF,
# e.g. ../data/clinvar_input.vcf.gz
INPUT_PATH = Path("../data/clinvar_input.tsv")
# Where the cleaned CSV is written.
OUTPUT_CSV = Path("../data/clinvar_clean.csv")

# True  -> "likely_*" labels are folded into Pathogenic / Benign.
# False -> only the strict "pathogenic" / "benign" labels are kept.
COLLAPSE_LIKELY = True

# Normalized label (lower-case, spaces replaced by underscores) -> output class.
KEEP_MAP = {
    "pathogenic": "Pathogenic",
    "likely_pathogenic": "Pathogenic",
    "benign": "Benign",
    "likely_benign": "Benign",
}

def normalize_clinsig(s):
    """Reduce a raw clinical-significance value to "Pathogenic", "Benign" or None.

    The value is stringified and split on ``|``, ``/``, ``,`` and ``;``. Each
    piece is stripped, lower-cased and has spaces replaced by underscores
    before being looked up in ``KEEP_MAP``. The first piece that yields a
    label wins.

    With ``COLLAPSE_LIKELY`` enabled every ``KEEP_MAP`` key maps to its class;
    with it disabled only the strict ``pathogenic`` / ``benign`` pieces count
    and ``likely_*`` pieces are skipped while scanning continues.

    Returns None for ``None``, a float NaN, or when no piece is recognised.
    """
    is_nan = isinstance(s, float) and np.isnan(s)
    if s is None or is_nan:
        return None

    strict_labels = ["pathogenic", "benign"]
    for token in re.split(r"[|/,;]", str(s)):
        label = token.strip().lower().replace(" ", "_")
        if label not in KEEP_MAP:
            continue
        if COLLAPSE_LIKELY:
            return KEEP_MAP[label]
        # Collapsing is off: accept only the strict labels, keep scanning otherwise.
        if label in strict_labels:
            return label.capitalize()
    return None


In [2]:

def tsv_to_csv(path: Path, out_csv: Path) -> Path:
    """Convert a tab-separated ClinVar export into the cleaned CSV.

    Output columns, in order:
    ``chrom, pos, ref, alt, gene_symbol, clinical_significance, hgvs_p``.

    Args:
        path: Tab-separated input file. Every column is read as a string.
        out_csv: Destination CSV path.

    Returns:
        ``out_csv``, after the file has been written.

    Raises:
        ValueError: If any of the chromosome, position, ref, alt, gene or
            clinical-significance columns cannot be found under one of its
            accepted names (matched case-insensitively).

    Rows are kept only when ``normalize_clinsig`` maps their significance to
    "Pathogenic" or "Benign", and rows whose position is not numeric are
    dropped. The protein-HGVS column is optional; when absent, ``hgvs_p`` is
    written as an empty string.
    """
    df = pd.read_csv(path, sep="\t", dtype=str, low_memory=False)
    # Lower-cased header -> actual header, for case-insensitive matching.
    cols = {c.lower(): c for c in df.columns}

    def pick(*cands):
        """Return the real name of the first candidate column present, else None."""
        for c in cands:
            cl = c.lower()
            if cl in cols:
                return cols[cl]
        return None

    col_chrom = pick("chromosome", "chrom", "chr")
    col_pos = pick("start", "pos", "position")
    col_ref = pick("referenceallele", "ref")
    col_alt = pick("alternateallele", "alt", "altallele")
    col_gene = pick("genesymbol", "gene_symbol", "gene", "symbol")
    col_sig = pick("clinicalsignificance", "clinsig", "clinical_significance")
    col_hgvs_p = pick("hgvsp", "hgvs_p", "protein_change", "protein_hgvs")

    if not all([col_chrom, col_pos, col_ref, col_alt, col_gene, col_sig]):
        raise ValueError("Missing required columns in TSV.")

    df["clinical_significance"] = df[col_sig].apply(normalize_clinsig)
    df = df[df["clinical_significance"].isin(["Pathogenic", "Benign"])].copy()

    # Missing gene / protein values become empty strings rather than the text
    # "nan" that astype(str) would produce for NaN cells.
    if col_hgvs_p:
        hgvs_p = df[col_hgvs_p].fillna("").astype(str)
    else:
        hgvs_p = ""

    out = pd.DataFrame({
        "chrom": df[col_chrom].astype(str),
        # Non-numeric positions turn into <NA> and are dropped below.
        "pos": pd.to_numeric(df[col_pos], errors="coerce").astype("Int64"),
        "ref": df[col_ref].astype(str),
        "alt": df[col_alt].astype(str),
        "gene_symbol": df[col_gene].fillna("").astype(str),
        # Previously left out, so the written CSV lacked the label column.
        "clinical_significance": df["clinical_significance"],
        "hgvs_p": hgvs_p,
    }).dropna(subset=["pos"])

    out.to_csv(out_csv, index=False)
    return out_csv


In [3]:

def vcf_to_csv(path: Path, out_csv: Path) -> Path:
    """Convert a ClinVar VCF into the cleaned CSV (requires ``cyvcf2``).

    Output columns, in order:
    ``chrom, pos, ref, alt, gene_symbol, clinical_significance, hgvs_p``.
    One row is emitted per ALT allele of every record whose ``CLNSIG`` (or,
    failing that, ``CLNSIGCONF``) INFO value is mapped by
    ``normalize_clinsig`` to "Pathogenic" or "Benign".

    Args:
        path: VCF file to read; passed to ``cyvcf2.VCF`` as a string.
        out_csv: Destination CSV path.

    Returns:
        ``out_csv``. The file is always written; when no record qualifies it
        contains only the header row, so a stale file from an earlier run is
        never mistaken for the current result.
    """
    from cyvcf2 import VCF

    columns = [
        "chrom", "pos", "ref", "alt",
        "gene_symbol", "clinical_significance", "hgvs_p",
    ]
    rows = []
    vcf = VCF(str(path))
    try:
        for rec in vcf:
            info = rec.INFO or {}
            # CLNSIG often present; sometimes combined
            label = normalize_clinsig(
                info.get("CLNSIG") or info.get("CLNSIGCONF") or ""
            )
            if label not in ("Pathogenic", "Benign"):
                continue

            # Take the text before the first "|" and then before the first ":".
            gene = ""
            gi = info.get("GENEINFO")
            if gi:
                gene = str(gi).split("|")[0].split(":")[0]

            # Use .get() rather than `key in info` / `info[key]`: membership
            # tests on the INFO object are not a reliable key lookup, while
            # .get() is the accessor already used above.
            hgvs_p = ""
            for key in ("HGVSP", "HGVSp", "HGVS_P", "HGVS"):
                value = info.get(key)
                if value:
                    hgvs_p = str(value)
                    break

            chrom = str(rec.CHROM)
            pos = int(rec.POS)
            ref = rec.REF
            for alt in rec.ALT or []:
                rows.append({
                    "chrom": chrom, "pos": pos, "ref": ref, "alt": alt,
                    "gene_symbol": gene, "clinical_significance": label,
                    "hgvs_p": hgvs_p,
                })
    finally:
        # Release the underlying file handle even if iteration fails.
        vcf.close()

    # Explicit columns keep the header stable even when `rows` is empty.
    out = pd.DataFrame(rows, columns=columns)
    out.to_csv(out_csv, index=False)
    return out_csv


In [4]:

# Dispatch on the input file's (possibly compound) extension and convert it.
inp = Path(INPUT_PATH)
if not inp.exists():
    print("Upload your ClinVar file to:", inp.resolve())
else:
    # All suffixes joined, e.g. ".vcf.gz", matched by substring below.
    suffix_chain = "".join(inp.suffixes).lower()
    looks_like_tsv = any(ext in suffix_chain for ext in [".tsv", ".txt"])
    looks_like_vcf = any(ext in suffix_chain for ext in [".vcf", ".vcf.gz", ".gz"])

    # The TSV check takes precedence over the VCF check.
    if looks_like_tsv:
        out = tsv_to_csv(inp, OUTPUT_CSV)
    elif looks_like_vcf:
        out = vcf_to_csv(inp, OUTPUT_CSV)
    else:
        raise ValueError("Unknown input type. Use TSV/TXT or VCF/VCF.GZ")
    print("Wrote CSV:", out)


Upload your ClinVar file to: /Users/rahuls/Desktop/science/data/clinvar_input.tsv


In [5]:

# Sanity check: summarise the cleaned CSV if it has been produced.
out = Path("../data/clinvar_clean.csv")
if not out.exists():
    print("No output yet. Run the conversion cell above after setting INPUT_PATH.")
else:
    df = pd.read_csv(out)
    print("Rows:", len(df))
    # Per-class row counts, then a preview of the first rows.
    print(df["clinical_significance"].value_counts())
    print(df.head())


No output yet. Run the conversion cell above after setting INPUT_PATH.
