# In [None]:  (notebook cell marker left over from export)
#!/usr/bin/env python3
"""
Complete HBB Variant Processing and Validation Pipeline (Fully Offline)

This script:
1. Fetches gnomAD data for HBB
2. Processes and parses protein-level information using LOCAL reference files
3. Validates internally (consistency checks)
4. Validates externally (local UniProt, local Ensembl, local MANE)
5. Adds UniProt and AlphaFold IDs
6. Saves validated data

ALL validation is done using local reference files - NO online queries.

Usage:
    python process_hbb_fully_offline.py
"""

import pandas as pd
import json
import re
import time
import subprocess
import os
import gzip
from pathlib import Path
from typing import Dict, Optional, Tuple
from collections import defaultdict
from variant_utils.get_gene_info import get_gene_info
from variant_utils.gnomad_utils import queryGnomAD

# ============================================================================
# CONFIGURATION
# ============================================================================

# Set Java environment.
# NOTE: these assignments mutate the process environment at import time, so
# every subprocess spawned afterwards inherits them. The JDK location is a
# hard-coded, user-specific absolute path.
# NOTE(review): presumably Java is needed by the external gnomAD query tooling
# configured in EXTERNAL_TOOLS_CONFIG - confirm.
os.environ['JAVA_HOME'] = '/jet/home/barazand/NEWOCEAN/java/jdk-21.0.9'
os.environ['PATH'] = f"/jet/home/barazand/NEWOCEAN/java/jdk-21.0.9/bin:{os.environ.get('PATH', '')}"

# Paths
# CACHE_DIR is relative, so it resolves against the current working directory;
# it is created as a side effect of importing/running this module.
CACHE_DIR = Path(".cache")
CACHE_DIR.mkdir(exist_ok=True)
# Passed through unchanged to queryGnomAD() in fetch_gnomad_data().
EXTERNAL_TOOLS_CONFIG = "external_tools.json"

# Local reference data paths (all gzip-compressed; opened with gzip.open()).
# initialize_reference_data() raises FileNotFoundError if any is missing.
REFERENCE_DIR = Path("/jet/home/barazand/NEWOCEAN/ref_data")  # adjust as needed
ENSEMBL_GTF = REFERENCE_DIR / "ensembl" / "Homo_sapiens.GRCh38.112.gtf.gz"
ENSEMBL_PEP = REFERENCE_DIR / "ensembl" / "Homo_sapiens.GRCh38.pep.all.fa.gz"
MANE_SUMMARY = REFERENCE_DIR / "mane" / "MANE.GRCh38.v1.3.summary.txt.gz"
UNIPROT_FASTA = REFERENCE_DIR / "uniprot" / "uniprot_sprot.fasta.gz"
UNIPROT_IDMAPPING = REFERENCE_DIR / "uniprot" / "HUMAN_9606_idmapping.dat.gz"

# Gene to process
GENE_SYMBOL = "HBB"

# Output files (all written under CACHE_DIR, named after GENE_SYMBOL)
OUTPUT_CSV = CACHE_DIR / f"{GENE_SYMBOL}_protein_level_validated.csv"
OUTPUT_JSON = CACHE_DIR / f"{GENE_SYMBOL}_protein_level_validated.json"
VALIDATION_REPORT = CACHE_DIR / f"{GENE_SYMBOL}_validation_report.txt"

# Global caches (loaded once at startup).
# They stay None until initialize_reference_data() assigns them.
ENST_TO_SEQ = None        # transcript ID (ENST) -> protein sequence
GENE_TO_MANE_ENST = None  # gene symbol -> versionless MANE Select ENST
ENST_TO_NM = None         # versionless MANE Select ENST -> RefSeq transcript
GENE_TO_UNIPROT = None    # gene symbol -> UniProt accession
UNIPROT_TO_SEQ = None     # UniProt accession -> protein sequence


# ============================================================================
# LOCAL REFERENCE LOADERS - ENSEMBL
# ============================================================================

def load_enst_to_ensp(gtf_path: Path) -> Dict[str, str]:
    """
    Parse a gzipped GTF to build a transcript_id (ENST) -> protein_id (ENSP) map.

    Only ``CDS`` records are inspected, because those are the records that
    carry both ``transcript_id`` and ``protein_id``.

    Each transcript is registered under two keys: its versioned ID and its
    versionless ID. Ensembl-style GTFs keep the version in separate
    ``transcript_version`` / ``protein_version`` attributes, while other
    flavours embed it in the ID itself ("ENST... .4"); both layouts are
    handled. The mapped value is the versioned ENSP when a version is known.

    Args:
        gtf_path: Path to a gzip-compressed GTF file.

    Returns:
        Dict mapping ENST (versioned and versionless) to ENSP. The first CDS
        record seen for a key wins.
    """
    print(f"   Parsing GTF: {gtf_path.name}")
    enst_to_ensp = {}

    with gzip.open(gtf_path, "rt") as f:
        for line in f:
            if line.startswith("#"):
                continue
            parts = line.strip().split("\t")
            if len(parts) < 9 or parts[2] != "CDS":
                continue

            # Column 9 holds `key "value";` pairs separated by semicolons.
            attr_dict = {}
            for field in parts[8].split(";"):
                field = field.strip()
                if not field or " " not in field:
                    continue
                key, value = field.split(" ", 1)
                attr_dict[key] = value.strip('"')

            tid = attr_dict.get("transcript_id")
            pid = attr_dict.get("protein_id")
            if not tid or not pid:
                continue

            # Re-attach the version when it is stored as a separate attribute
            # and the ID does not already contain one.
            tver = attr_dict.get("transcript_version")
            pver = attr_dict.get("protein_version")
            enst_full = f"{tid}.{tver}" if tver and "." not in tid else tid
            ensp_full = f"{pid}.{pver}" if pver and "." not in pid else pid

            # Store versioned and versionless transcript keys
            enst_to_ensp.setdefault(enst_full, ensp_full)
            enst_to_ensp.setdefault(enst_full.split(".")[0], ensp_full)

    print(f"      Found {len(enst_to_ensp)} ENST‚ÜíENSP mappings")
    return enst_to_ensp


def load_ensp_to_seq(pep_path: Path) -> Dict[str, str]:
    """
    Read a gzipped protein FASTA into an ID -> amino-acid sequence map.

    The record ID is the first whitespace-delimited token of each header
    line. Every record is registered under that ID and under the ID with
    everything after the first "." removed; the first record seen for a key
    is kept.
    """
    print(f"   Parsing peptide FASTA: {pep_path.name}")
    sequences = {}

    def _commit(record_id, pieces):
        # Store the finished record (if any) under both key variants.
        if record_id is None:
            return
        joined = "".join(pieces)
        sequences.setdefault(record_id, joined)
        sequences.setdefault(record_id.split(".")[0], joined)

    active_id = None
    pieces = []
    with gzip.open(pep_path, "rt") as handle:
        for raw in handle:
            if not raw.startswith(">"):
                pieces.append(raw.strip())
                continue
            # A new header closes the previous record.
            _commit(active_id, pieces)
            active_id = raw[1:].strip().split()[0]
            pieces = []

        # The final record has no following header to trigger a commit.
        _commit(active_id, pieces)

    print(f"      Found {len(sequences)} ENSP‚Üísequence mappings")
    return sequences


def build_enst_to_seq(gtf_path: Path, pep_path: Path) -> Dict[str, str]:
    """
    Join the GTF transcript->protein map with the peptide FASTA.

    Returns a dict from transcript ID to protein sequence. Transcripts whose
    protein has no (non-empty) sequence in the FASTA are left out.
    """
    transcript_to_protein = load_enst_to_ensp(gtf_path)
    protein_to_seq = load_ensp_to_seq(pep_path)

    combined = {}
    for transcript_id, protein_id in transcript_to_protein.items():
        # Prefer the exact protein ID, fall back to its versionless form.
        sequence = protein_to_seq.get(protein_id)
        if not sequence:
            sequence = protein_to_seq.get(protein_id.split(".")[0])
        if not sequence:
            continue

        combined[transcript_id] = sequence
        # Also expose the versionless transcript key (first writer wins).
        combined.setdefault(transcript_id.split(".")[0], sequence)

    print(f"   ‚úÖ Built {len(combined)} ENST‚Üísequence mappings")
    return combined


# ============================================================================
# LOCAL REFERENCE LOADERS - MANE
# ============================================================================

def load_mane_summary(summary_path: Path):
    """
    Load the MANE summary table.

    Builds two mappings from rows whose status is "MANE Select":
      - gene_symbol -> MANE Select ENST (versionless)
      - ENST (versionless) -> RefSeq transcript (as written in the file)

    Column positions are resolved once from the header. Alternative column
    names are accepted (e.g. ``MANE_status`` / ``MANE_Status``). If the
    status, transcript or symbol column cannot be found, both mappings are
    returned empty rather than reading an unrelated column.

    Args:
        summary_path: Path to the gzip-compressed, tab-separated summary.

    Returns:
        Tuple ``(gene_to_mane_enst, enst_to_nm)``.
    """
    print(f"   Parsing MANE summary: {summary_path.name}")
    gene_to_mane_enst = {}
    enst_to_nm = {}

    def _first_present(col_idx, names):
        # Explicit membership test: index 0 is a valid column position, so
        # truthiness (`a or b`) must not be used to pick between candidates.
        for name in names:
            if name in col_idx:
                return col_idx[name]
        return None

    with gzip.open(summary_path, "rt") as f:
        # Strip only the line terminator so trailing empty fields survive.
        header = f.readline().rstrip("\r\n").split("\t")
        col_idx = {name: i for i, name in enumerate(header)}

        status_col = _first_present(col_idx, ("MANE_status", "MANE_Status"))
        enst_col = _first_present(col_idx, ("Ensembl_nuc", "Ensembl_transcript"))
        nm_col = _first_present(col_idx, ("RefSeq_nuc", "RefSeq_transcript"))
        symbol_col = _first_present(col_idx, ("symbol", "HGNC_symbol"))

        if status_col is None or enst_col is None or symbol_col is None:
            print("      WARNING: required MANE columns missing from header")
        else:
            for line in f:
                parts = line.rstrip("\r\n").split("\t")
                if len(parts) != len(header):
                    continue

                if parts[status_col] not in ("MANE Select", "MANE_Select"):
                    continue

                enst = parts[enst_col]
                symbol = parts[symbol_col]
                if not enst or not symbol:
                    continue

                # Store mapping (versionless)
                enst_base = enst.split(".")[0]
                gene_to_mane_enst.setdefault(symbol, enst_base)

                nm = parts[nm_col] if nm_col is not None else ""
                if nm:
                    enst_to_nm.setdefault(enst_base, nm)

    print(f"      Found {len(gene_to_mane_enst)} MANE Select transcripts")
    return gene_to_mane_enst, enst_to_nm


# ============================================================================
# LOCAL REFERENCE LOADERS - UNIPROT
# ============================================================================

def load_uniprot_id_mapping(idmapping_path: Path) -> Dict[str, str]:
    """
    Build a gene_symbol -> UniProt accession map from an ID-mapping file.

    The file is read as three tab-separated fields per line
    (accession, ID type, value); lines with any other field count are
    ignored. Only ``Gene_Name`` rows are used, and when a symbol appears more
    than once the first accession encountered is kept.
    """
    print(f"   Parsing UniProt ID mapping: {idmapping_path.name}")
    symbol_to_accession = {}

    with gzip.open(idmapping_path, "rt") as handle:
        for record in handle:
            fields = record.strip().split("\t")
            if len(fields) != 3:
                continue

            accession, kind, symbol = fields
            if kind != "Gene_Name":
                continue

            # First occurrence wins; later duplicates are ignored.
            symbol_to_accession.setdefault(symbol, accession)

    print(f"      Found {len(symbol_to_accession)} gene‚ÜíUniProt mappings")
    return symbol_to_accession


def load_uniprot_sequences(fasta_path: Path) -> Dict[str, str]:
    """
    Read a gzipped UniProt FASTA into an accession -> sequence map.

    For headers containing "|" (e.g. ``>sp|P68871|HBB_HUMAN ...``) the key is
    the second "|"-separated field. Otherwise the key is the first
    whitespace-delimited token of the header. If an accession occurs more
    than once, the last record overwrites earlier ones.
    """
    print(f"   Parsing UniProt FASTA: {fasta_path.name}")
    records = {}
    accession = None
    buffer = []

    with gzip.open(fasta_path, "rt") as handle:
        for raw in handle:
            if not raw.startswith(">"):
                buffer.append(raw.strip())
                continue

            # Header line: close the record collected so far.
            if accession is not None:
                records[accession] = "".join(buffer)

            title = raw[1:].strip()
            pieces = title.split("|")
            accession = pieces[1] if len(pieces) >= 2 else title.split()[0]
            buffer = []

        # Close the trailing record.
        if accession is not None:
            records[accession] = "".join(buffer)

    print(f"      Found {len(records)} UniProt‚Üísequence mappings")
    return records


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_local_protein_seq(transcript_id: str) -> Optional[str]:
    """
    Get a protein sequence from the local Ensembl cache (ENST_TO_SEQ).

    Args:
        transcript_id: Transcript ID, with or without a ".version" suffix.

    Returns:
        The cached sequence, or None when the ID is empty, is not a string
        (e.g. None or a pandas NaN), the cache has not been loaded, or no
        entry exists.
    """
    # Reject non-strings up front: NaN is truthy and has no .split().
    if not isinstance(transcript_id, str) or not transcript_id:
        return None
    if ENST_TO_SEQ is None:
        return None
    # Try versioned, then versionless
    return ENST_TO_SEQ.get(transcript_id) or ENST_TO_SEQ.get(transcript_id.split(".")[0])


def get_uniprot_info(gene_symbol: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Look up a gene's UniProt accession and sequence in the local caches.

    Returns:
        ``(uniprot_id, sequence)``. Both are None when the caches are not
        loaded or the gene has no accession; ``sequence`` alone is None when
        the accession has no entry in the sequence cache.
    """
    caches_ready = GENE_TO_UNIPROT is not None and UNIPROT_TO_SEQ is not None
    if not caches_ready:
        return None, None

    accession = GENE_TO_UNIPROT.get(gene_symbol)
    if accession:
        return accession, UNIPROT_TO_SEQ.get(accession)
    return None, None


def initialize_reference_data():
    """
    Populate the module-level reference caches from the local files.

    Checks that every configured reference file exists (raising
    FileNotFoundError on the first one that does not), then loads the
    Ensembl, MANE and UniProt data into the global cache variables.
    """
    global ENST_TO_SEQ, GENE_TO_MANE_ENST, ENST_TO_NM, GENE_TO_UNIPROT, UNIPROT_TO_SEQ

    rule = "=" * 80
    print("\n" + rule)
    print("LOADING LOCAL REFERENCE DATA")
    print(rule)

    # Human-readable label -> file that must be present before loading.
    labelled_paths = {
        "Ensembl GTF": ENSEMBL_GTF,
        "Ensembl peptides": ENSEMBL_PEP,
        "MANE summary": MANE_SUMMARY,
        "UniProt FASTA": UNIPROT_FASTA,
        "UniProt ID mapping": UNIPROT_IDMAPPING,
    }
    for label, location in labelled_paths.items():
        if location.exists():
            print(f"‚úÖ Found: {location}")
            continue
        raise FileNotFoundError(f"{label} not found: {location}")

    print("\nüìö Loading Ensembl reference data (GTF + peptides)...")
    ENST_TO_SEQ = build_enst_to_seq(ENSEMBL_GTF, ENSEMBL_PEP)

    print("\nüìö Loading MANE summary...")
    mane_maps = load_mane_summary(MANE_SUMMARY)
    GENE_TO_MANE_ENST, ENST_TO_NM = mane_maps

    print("\nüìö Loading UniProt data...")
    GENE_TO_UNIPROT = load_uniprot_id_mapping(UNIPROT_IDMAPPING)
    UNIPROT_TO_SEQ = load_uniprot_sequences(UNIPROT_FASTA)

    print("\n‚úÖ All reference data loaded successfully")
    print(rule)


# ============================================================================
# STEP 1: FETCH GNOMAD DATA
# ============================================================================

def fetch_gnomad_data(gene_symbol: str) -> pd.DataFrame:
    """
    Fetch gnomAD data for a gene and save a JSON copy in CACHE_DIR.

    Looks up the gene's coordinates with get_gene_info(), queries gnomAD
    over that region via queryGnomAD(), writes the resulting frame to
    ``CACHE_DIR / gnomAD_<gene_symbol>.json`` and returns it.

    Args:
        gene_symbol: Gene symbol to query (e.g. "HBB").

    Returns:
        The DataFrame returned by queryGnomAD().
    """
    print("\n" + "="*80)
    print(f"STEP 1: FETCHING GNOMAD DATA FOR {gene_symbol}")
    print("="*80)

    # Get gene info
    print(f"\nüìä Getting gene information...")
    gene_info = get_gene_info(gene_symbol)
    region_start = int(gene_info['chr_start'])
    region_end = int(gene_info['chr_end'])

    print(f"   Gene: {gene_symbol}")
    print(f"   Chromosome: {gene_info['CHROM']}")
    print(f"   Position: {region_start:,} - {region_end:,}")
    print(f"   HGNC ID: {gene_info['HGNC_ID']}")

    # Query gnomAD
    print(f"\nüîç Querying gnomAD v4...")
    gnomad_file = CACHE_DIR / f"gnomAD_{gene_symbol}.json"

    gnomad_df = queryGnomAD(
        "GRCh38",
        gene_info['CHROM'],
        region_start,
        region_end,
        gene_info['HGNC_ID'],
        EXTERNAL_TOOLS_CONFIG,
        write_dir=str(CACHE_DIR),
        use_cache=True,           # cache results for faster re-runs
        gene_symbol=gene_symbol,  # use the argument, not the module-level GENE_SYMBOL
        parallel=True,            # run exomes/genomes in parallel
        cleanup=True              # delete intermediate files
    )

    gnomad_df.to_json(gnomad_file)
    print(f"   ‚úÖ Fetched {len(gnomad_df)} variant-transcript pairs")
    print(f"   ‚úÖ Saved to: {gnomad_file}")

    return gnomad_df


# ============================================================================
# STEP 2: PARSE PROTEIN-LEVEL INFORMATION
# ============================================================================

# Three-letter (and "Stop") amino-acid codes -> one-letter codes.
_AA_3TO1 = {
    'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C',
    'Gln': 'Q', 'Glu': 'E', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
    'Leu': 'L', 'Lys': 'K', 'Met': 'M', 'Phe': 'F', 'Pro': 'P',
    'Ser': 'S', 'Thr': 'T', 'Trp': 'W', 'Tyr': 'Y', 'Val': 'V',
    'Ter': '*', 'Stop': '*', 'Sec': 'U', 'Pyl': 'O'
}

# A single-residue substitution: p.<ref><position><alt>, where ref/alt are a
# three-letter code, a one-letter code or "*", and alt may also be "=".
_HGVSP_SUBSTITUTION = re.compile(
    r'p\.([A-Z][a-z]{2}|[A-Z\*])(\d+)([A-Z][a-z]{2}|[A-Z\*\=])'
)


def parse_hgvsp_notation(hgvsp: str) -> Optional[Dict]:
    """
    Parse HGVSp notation to extract ref_aa, position, alt_aa.

    Accepts strings with or without a leading "<protein_id>:" prefix, e.g.
    "ENSP00000333994.3:p.Glu7Val" or "p.E7V". The URL-escaped equals sign
    ("%3D") is treated as "=". The whole protein-level part must be a single
    substitution; longer notations such as frameshifts ("p.Glu7ValfsTer5")
    are rejected instead of being truncated to their leading substitution.

    Args:
        hgvsp: HGVSp string; None, NaN and "" are treated as missing.

    Returns:
        ``{'ref_aa', 'pos', 'alt_aa'}`` with one-letter residues and an
        integer position, or None if the input is missing or not a simple
        substitution. For "=" the alt residue equals the ref residue.
    """
    if pd.isna(hgvsp) or hgvsp == '':
        return None

    hgvsp_str = str(hgvsp).strip().replace('%3D', '=')
    if ':p.' in hgvsp_str:
        hgvsp_str = 'p.' + hgvsp_str.split(':p.')[1]

    # fullmatch: trailing text means this is not a plain substitution.
    match = _HGVSP_SUBSTITUTION.fullmatch(hgvsp_str)
    if match is None:
        return None

    ref_aa, pos, alt_aa = match.groups()
    ref_1letter = _AA_3TO1.get(ref_aa, ref_aa)
    # "=" means "unchanged": reuse the already converted one-letter ref.
    alt_1letter = ref_1letter if alt_aa == '=' else _AA_3TO1.get(alt_aa, alt_aa)

    return {'ref_aa': ref_1letter, 'pos': int(pos), 'alt_aa': alt_1letter}


def process_protein_level(gnomad_df: pd.DataFrame) -> pd.DataFrame:
    """
    Process gnomAD data to extract protein-level information using local sequences.

    Steps:
      1. Keep rows whose ``Consequence`` contains "missense".
      2. Parse ``HGVSp`` into ref_aa / protein_pos / alt_aa / mutation.
      3. Keep rows that parsed and whose ``CANONICAL`` is "YES".
      4. Attach the protein sequence for ``Feature`` from the local cache.
      5. Flag whether the reference residue matches the sequence.
      6. Drop duplicate (CHROM, POS, REF, ALT) rows.

    Args:
        gnomad_df: Variant-transcript table; must contain ``Consequence``,
            ``HGVSp``, ``CANONICAL``, ``Feature`` and the genomic key columns.

    Returns:
        De-duplicated DataFrame with the selected columns plus
        ``protein_seq`` and ``seq_verified`` (True / False / None when the
        check could not be performed).
    """
    print("\n" + "="*80)
    print("STEP 2: PROCESSING PROTEIN-LEVEL INFORMATION")
    print("="*80)

    # Filter missense
    print(f"\nüîç Filtering for missense variants...")
    missense = gnomad_df[
        gnomad_df['Consequence'].str.contains('missense', case=False, na=False)
    ].copy()
    print(f"   Total missense: {len(missense)}")

    # Parse HGVSp
    print(f"\nüß¨ Parsing HGVSp notation...")
    missense['protein_change'] = missense['HGVSp'].apply(parse_hgvsp_notation)

    def _extract(key):
        return missense['protein_change'].apply(lambda change: change[key] if change else None)

    missense['ref_aa'] = _extract('ref_aa')
    missense['protein_pos'] = _extract('pos')
    missense['alt_aa'] = _extract('alt_aa')
    # Build the label from the parsed dict, not from the frame: when some rows
    # fail to parse, the protein_pos column becomes float and would render
    # as "E7.0V".
    missense['mutation'] = missense['protein_change'].apply(
        lambda change: f"{change['ref_aa']}{change['pos']}{change['alt_aa']}" if change else None
    )

    parsed = missense[missense['ref_aa'].notna()].copy()
    # No missing positions remain, so restore an integer dtype.
    parsed['protein_pos'] = parsed['protein_pos'].astype(int)
    print(f"   Parsed: {len(parsed)}/{len(missense)}")

    # Filter canonical
    print(f"\nüìå Filtering for canonical transcripts...")
    canonical = parsed[parsed['CANONICAL'] == 'YES'].copy()
    print(f"   Canonical: {len(canonical)}")

    # Select columns
    key_columns = [
        'CHROM', 'POS', 'REF', 'ALT',
        'SYMBOL', 'Gene', 'Feature', 'Feature_type',
        'CANONICAL', 'BIOTYPE',
        'ref_aa', 'protein_pos', 'alt_aa', 'mutation',
        'HGVSp', 'HGVSc', 'Amino_acids', 'Codons',
        'AF', 'AC', 'AN', 'IMPACT', 'Consequence',
        'MANE_SELECT', 'EXON'
    ]

    available_columns = [col for col in key_columns if col in canonical.columns]
    result = canonical[available_columns].copy()

    # Attach sequences from local cache
    print(f"\nüî¨ Attaching protein sequences from local Ensembl cache...")
    result['protein_seq'] = result['Feature'].apply(get_local_protein_seq)

    seqs_fetched = result['protein_seq'].notna().sum()
    print(f"üìä Sequences attached: {seqs_fetched}/{len(result)}")

    if seqs_fetched == 0:
        print(f"\n‚ö†Ô∏è  WARNING: No sequences found in local cache!")

    # Verify sequences
    print(f"\n‚úÖ Verifying sequences...")

    def _residue_matches(seq, pos, ref):
        # None = "could not check" (no sequence, no position, out of range).
        if not isinstance(seq, str) or pd.isna(pos):
            return None
        index = int(pos) - 1
        # Guard explicitly: a negative index would wrap to the sequence end.
        if index < 0 or index >= len(seq):
            return None
        return seq[index] == ref

    # Column-wise zip instead of DataFrame.apply(axis=1): also safe when the
    # frame is empty.
    result['seq_verified'] = [
        _residue_matches(seq, pos, ref)
        for seq, pos, ref in zip(result['protein_seq'], result['protein_pos'], result['ref_aa'])
    ]
    verified = int(result['seq_verified'].eq(True).sum())
    print(f"   Verified: {verified}/{len(result)}")

    # Remove duplicates
    print(f"\nüîÑ Removing genomic duplicates...")
    result_unique = result.drop_duplicates(subset=['CHROM', 'POS', 'REF', 'ALT']).copy()
    print(f"   Before: {len(result)}")
    print(f"   After: {len(result_unique)}")

    return result_unique


# ============================================================================
# STEP 3: INTERNAL VALIDATION
# ============================================================================

def validate_internal(df: pd.DataFrame) -> Dict:
    """
    Comprehensive internal validation.

    Runs six consistency checks on the processed variant table and prints
    the outcome of each:
      1. required columns are present
      2. no nulls in ref_aa / protein_pos / alt_aa (protein_seq nulls are
         only a warning)
      3. ref_aa / alt_aa are valid one-letter residues or "*"
      4. how many rows passed sequence verification
      5. genomic duplicates (error) and protein-level duplicates (warning)
      6. allele frequencies lie in [0, 1]

    A check whose columns are absent is skipped; the absence itself is
    reported once by check 1, so a missing column never raises KeyError.

    Args:
        df: Processed variant table.

    Returns:
        ``{'errors': [...], 'warnings': [...]}`` with human-readable messages.
    """
    print("\n" + "="*80)
    print("STEP 3: INTERNAL VALIDATION")
    print("="*80)

    errors = []
    warnings = []

    def _has(*cols):
        return all(col in df.columns for col in cols)

    # Check required columns
    print(f"\n1Ô∏è‚É£  Required columns...")
    required = ['CHROM', 'POS', 'REF', 'ALT', 'ref_aa', 'protein_pos', 'alt_aa', 'mutation']
    missing = [col for col in required if col not in df.columns]
    if missing:
        errors.append(f"Missing columns: {missing}")
        print(f"   ‚ùå Missing: {missing}")
    else:
        print(f"   ‚úÖ All present")

    # Check nulls
    print(f"\n2Ô∏è‚É£  Null values...")
    for col in ['ref_aa', 'protein_pos', 'alt_aa']:
        if col in df.columns:
            null_count = df[col].isna().sum()
            if null_count > 0:
                errors.append(f"{col} has {null_count} nulls")
                print(f"   ‚ùå {col}: {null_count} nulls")
            else:
                print(f"   ‚úÖ {col}: no nulls")

    # Check protein_seq separately
    if 'protein_seq' in df.columns:
        null_count = df['protein_seq'].isna().sum()
        if null_count > 0:
            warnings.append(f"protein_seq has {null_count} nulls (sequence not in local cache)")
            print(f"   ‚ö†Ô∏è  protein_seq: {null_count} nulls (not in cache)")
        else:
            print(f"   ‚úÖ protein_seq: no nulls")

    # Check amino acids
    print(f"\n3Ô∏è‚É£  Amino acid validity...")
    valid_aas = set('ACDEFGHIKLMNPQRSTVWY*')
    for col in ('ref_aa', 'alt_aa'):
        if col not in df.columns:
            continue  # already reported as a missing column
        invalid_count = int((~df[col].isin(valid_aas)).sum())
        if invalid_count > 0:
            errors.append(f"{invalid_count} invalid {col}")
            print(f"   ‚ùå Invalid {col}: {invalid_count}")
        else:
            print(f"   ‚úÖ All {col} valid")

    # Verify sequences
    print(f"\n4Ô∏è‚É£  Sequence verification...")
    if 'seq_verified' in df.columns:
        # eq(True) counts only definite matches; None/False are excluded.
        verified = int(df['seq_verified'].eq(True).sum())
        total_with_seq = int(df['protein_seq'].notna().sum()) if _has('protein_seq') else 0
        print(f"   Verified: {verified}/{total_with_seq} (with sequences)")

        if total_with_seq == 0:
            warnings.append("No sequences available for verification")

    # Check duplicates
    print(f"\n5Ô∏è‚É£  Duplicates...")
    if _has('CHROM', 'POS', 'REF', 'ALT'):
        genomic_dups = df[df.duplicated(subset=['CHROM', 'POS', 'REF', 'ALT'], keep=False)]
        if len(genomic_dups) > 0:
            errors.append(f"{len(genomic_dups)} genomic duplicates")
            print(f"   ‚ùå Genomic: {len(genomic_dups)}")
        else:
            print(f"   ‚úÖ No genomic duplicates")

    if _has('Feature', 'mutation'):
        protein_dups = df[df.duplicated(subset=['Feature', 'mutation'], keep=False)]
        if len(protein_dups) > 0:
            warnings.append(f"{len(protein_dups)} protein duplicates (different nucleotides‚Üísame AA)")
            print(f"   ‚ö†Ô∏è  Protein: {len(protein_dups)} (OK - degeneracy)")
        else:
            print(f"   ‚úÖ No protein duplicates")

    # Check AF range
    print(f"\n6Ô∏è‚É£  Allele frequencies...")
    if 'AF' in df.columns:
        # Non-numeric AF values become NaN and are not counted as out of range.
        af_series = pd.to_numeric(df['AF'], errors='coerce')
        invalid_af = df[(af_series < 0) | (af_series > 1)]
        if len(invalid_af) > 0:
            errors.append(f"{len(invalid_af)} AF out of range")
            print(f"   ‚ùå Invalid AF: {len(invalid_af)}")
        else:
            print(f"   ‚úÖ All AF in [0,1]")

    return {'errors': errors, 'warnings': warnings}


# ============================================================================
# STEP 4: EXTERNAL VALIDATION (ALL OFFLINE)
# ============================================================================

def validate_external(df: pd.DataFrame, gene_symbol: str) -> Dict:
    """
    Validate against local UniProt and MANE references (NO online queries).

    Uses the transcript and protein sequence of the first row of ``df`` as
    "your data" and compares them with:
      - the UniProt sequence for ``gene_symbol`` (local cache)
      - the MANE Select transcript for ``gene_symbol`` and its sequence
    It then checks that ``Amino_acids`` agrees with the parsed ref/alt
    residues and that every ``Consequence`` contains "missense".

    An empty ``df`` is reported as an issue rather than raising, and a
    missing (None/NaN) sequence is treated as "not available".

    Args:
        df: Processed variant table (needs ``Feature`` and ``protein_seq``).
        gene_symbol: Gene symbol used for the UniProt and MANE lookups.

    Returns:
        ``{'issues': [...], 'uniprot_id': str | None, 'all_sequences': {...}}``
        where ``all_sequences`` maps a source label to its sequence.
    """
    print("\n" + "="*80)
    print("STEP 4: EXTERNAL VALIDATION (ALL OFFLINE)")
    print("="*80)

    if len(df) > 0:
        your_transcript = df['Feature'].iloc[0]
        your_sequence = df['protein_seq'].iloc[0]
    else:
        your_transcript = None
        your_sequence = None

    # NaN is truthy, so normalise anything that is not a real sequence.
    if not isinstance(your_sequence, str):
        your_sequence = None

    print(f"\nüìÇ Your data:")
    print(f"   Transcript: {your_transcript}")

    if your_sequence is None:
        print(f"   ‚ö†Ô∏è  Sequence: NOT AVAILABLE (not in cache)")
        print(f"   Variants: {len(df)}")
    else:
        print(f"   Sequence: {len(your_sequence)} aa")
        print(f"   Variants: {len(df)}")

    issues = []
    all_sequences = {}

    if len(df) == 0:
        issues.append("No variants available for external validation")

    if your_sequence:
        all_sequences['Your_Data'] = your_sequence

    # Query UniProt LOCALLY
    print(f"\nüî¨ Querying UniProt (LOCAL)...")
    uniprot_id, uniprot_seq = get_uniprot_info(gene_symbol)

    if uniprot_id and uniprot_seq:
        print(f"   ‚úÖ UniProt: {uniprot_id}")
        print(f"   Length: {len(uniprot_seq)} aa")

        all_sequences['UniProt'] = uniprot_seq

        if your_sequence:
            if uniprot_seq == your_sequence:
                print(f"   ‚úÖ Sequence MATCHES UniProt (100%)")
            else:
                issues.append(f"Sequence differs from UniProt {uniprot_id}")
                print(f"   ‚ùå Sequence DIFFERS from UniProt")
                print(f"      Your length: {len(your_sequence)}")
                print(f"      UniProt length: {len(uniprot_seq)}")
        else:
            print(f"   ‚ö†Ô∏è  Cannot compare - your sequence not available")
    else:
        # An accession without a cached sequence is treated as "not found".
        issues.append(f"No UniProt entry found for {gene_symbol}")
        print(f"   ‚ùå No UniProt entry found")
        uniprot_id = None

    # Check MANE Select locally
    print(f"\nüìã Checking MANE Select (LOCAL)...")
    your_base = your_transcript.split(".")[0] if isinstance(your_transcript, str) else None

    # The MANE caches are None until initialize_reference_data() has run.
    mane_enst = GENE_TO_MANE_ENST.get(gene_symbol) if GENE_TO_MANE_ENST is not None else None
    if mane_enst:
        print(f"   ‚úÖ MANE Select ENST for {gene_symbol}: {mane_enst}")

        if your_base == mane_enst:
            print(f"   ‚úÖ Using MANE Select transcript")
        else:
            issues.append(f"Transcript {your_base} != MANE Select {mane_enst}")
            print(f"   ‚ö†Ô∏è  Different from MANE Select transcript")

        # Get corresponding RefSeq
        refseq_id = ENST_TO_NM.get(mane_enst) if ENST_TO_NM is not None else None
        if refseq_id:
            print(f"   ‚úÖ Corresponding RefSeq (NM): {refseq_id}")
        else:
            print(f"   ‚ö†Ô∏è  No NM mapping found for {mane_enst}")

        # Use MANE ENST sequence for validation
        mane_seq = get_local_protein_seq(mane_enst)
        if mane_seq:
            all_sequences["MANE_ENST"] = mane_seq
            print(f"   ‚úÖ MANE Select sequence: {len(mane_seq)} aa")

            if your_sequence:
                if mane_seq == your_sequence:
                    print(f"   ‚úÖ Your sequence MATCHES MANE Select")
                else:
                    print(f"   ‚ùå Your sequence DIFFERS from MANE Select")
                    issues.append("Sequence differs from MANE Select transcript")
            else:
                print(f"   üí° Using MANE Select sequence as reference")
        else:
            print(f"   ‚ö†Ô∏è  No sequence found for MANE ENST {mane_enst} in cache")
    else:
        print(f"   ‚ö†Ô∏è  No MANE Select entry for {gene_symbol}")
        issues.append("No MANE Select entry in MANE summary")

    # Compare all sequences
    if len(all_sequences) > 1:
        print(f"\nüîç Cross-database sequence comparison:")
        unique_seqs = set(all_sequences.values())
        if len(unique_seqs) == 1:
            print(f"   ‚úÖ ALL SEQUENCES IDENTICAL across {len(all_sequences)} databases")
        else:
            print(f"   ‚ö†Ô∏è  Found {len(unique_seqs)} different sequences:")
            for db_name, seq in all_sequences.items():
                print(f"      {db_name}: {len(seq)} aa")
    elif len(all_sequences) == 0:
        print(f"\n‚ö†Ô∏è  No sequences available for comparison")
        issues.append("No sequences available for validation")

    # Validate gnomAD annotations
    print(f"\nüîç Validating gnomAD annotations...")

    # Check amino acid consistency ("ref/alt" must match the parsed residues)
    mismatches = 0
    if 'Amino_acids' in df.columns:
        for idx, row in df.iterrows():
            if pd.notna(row['Amino_acids']):
                parts = row['Amino_acids'].split('/')
                if len(parts) == 2:
                    if parts[0] != row['ref_aa'] or parts[1] != row['alt_aa']:
                        mismatches += 1

    if mismatches == 0:
        print(f"   ‚úÖ Amino acids consistent ({len(df)} variants)")
    else:
        issues.append(f"{mismatches} amino acid mismatches")
        print(f"   ‚ùå {mismatches} mismatches")

    # Check consequences
    if 'Consequence' in df.columns:
        all_missense = df['Consequence'].str.contains('missense', case=False, na=False).all()
        if all_missense:
            print(f"   ‚úÖ All variants are missense")
        else:
            issues.append("Non-missense variants found")
            print(f"   ‚ùå Non-missense found")

    return {
        'issues': issues,
        'uniprot_id': uniprot_id,
        'all_sequences': all_sequences
    }


# ============================================================================
# STEP 5: ADD STRUCTURE IDS
# ============================================================================

def add_structure_ids(df: pd.DataFrame, uniprot_id: Optional[str]) -> pd.DataFrame:
    """
    Annotate the table with UniProt and AlphaFold identifiers.

    When ``uniprot_id`` is truthy, the columns ``uniprot_id``,
    ``alphafold_id`` and ``alphafold_pdb_url`` are added to ``df`` in place
    (same value on every row). Otherwise ``df`` is left untouched. The same
    ``df`` object is returned in both cases.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("STEP 5: ADDING STRUCTURE IDS")
    print(rule)

    if not uniprot_id:
        print("\n‚ö†Ô∏è  No UniProt ID available")
        return df

    model_id = f"AF-{uniprot_id}-F1"
    df['uniprot_id'] = uniprot_id
    df['alphafold_id'] = model_id
    df['alphafold_pdb_url'] = f"https://alphafold.ebi.ac.uk/files/{model_id}-model_v4.pdb"

    print("\n‚úÖ Added structure IDs:")
    print(f"   UniProt: {uniprot_id}")
    print(f"   AlphaFold: {model_id}")

    return df


# ============================================================================
# STEP 6: GENERATE REPORT
# ============================================================================

def generate_report(df: pd.DataFrame, internal_results: Dict, external_results: Dict, report_path: Path):
    """
    Generate the plain-text validation report.

    Sections written: header/summary, internal validation (errors and
    warnings), external validation (IDs, sequence cross-check, issues),
    variant statistics (including an allele-frequency histogram when ``AF``
    is present) and a final verdict. The verdict is "validated" only when
    there are no external issues and no internal errors other than those
    whose message mentions "null".

    The file is written as UTF-8 because the report contains non-ASCII
    symbols. An empty ``df`` or a missing first sequence is reported in the
    text instead of raising.

    Args:
        df: Final variant table (needs ``Feature``, ``protein_seq``,
            ``protein_pos`` and ``mutation``).
        internal_results: Dict with ``errors`` and ``warnings`` lists.
        external_results: Dict with ``issues``, ``uniprot_id`` and optionally
            ``all_sequences``.
        report_path: Destination file; overwritten if it exists.
    """
    print("\n" + "="*80)
    print("STEP 6: GENERATING VALIDATION REPORT")
    print("="*80)

    has_rows = len(df) > 0
    first_transcript = df['Feature'].iloc[0] if has_rows else "N/A"
    first_seq = df['protein_seq'].iloc[0] if has_rows else None
    # NaN is truthy; only a real string counts as an available sequence.
    if not isinstance(first_seq, str):
        first_seq = None

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("HBB VARIANT VALIDATION REPORT (Fully Offline)\n")
        f.write("="*80 + "\n\n")

        f.write(f"Gene: HBB (Hemoglobin subunit beta)\n")
        f.write(f"Total variants: {len(df)}\n")
        f.write(f"Transcript: {first_transcript}\n")

        if first_seq:
            f.write(f"Protein sequence length: {len(first_seq)} aa\n\n")
        else:
            f.write(f"Protein sequence: NOT AVAILABLE (not in local cache)\n\n")

        f.write("INTERNAL VALIDATION\n")
        f.write("-"*80 + "\n")
        errors = internal_results['errors']
        warnings = internal_results['warnings']

        if errors:
            f.write(f"‚ùå ERRORS ({len(errors)}):\n")
            for err in errors:
                f.write(f"   - {err}\n")
        else:
            f.write("‚úÖ No errors\n")

        if warnings:
            f.write(f"\n‚ö†Ô∏è  WARNINGS ({len(warnings)}):\n")
            for warn in warnings:
                f.write(f"   - {warn}\n")
        else:
            f.write("\n‚úÖ No warnings\n")

        f.write("\n\nEXTERNAL VALIDATION (ALL OFFLINE)\n")
        f.write("-"*80 + "\n")
        issues = external_results['issues']
        uniprot_id = external_results['uniprot_id']
        all_sequences = external_results.get('all_sequences', {})

        if uniprot_id:
            f.write(f"‚úÖ UniProt ID: {uniprot_id} (from local cache)\n")
            f.write(f"‚úÖ AlphaFold ID: AF-{uniprot_id}-F1\n")

        if all_sequences:
            f.write(f"\nSequence Cross-Validation (all sources local):\n")
            for db_name, seq in all_sequences.items():
                f.write(f"   {db_name}: {len(seq)} aa\n")

            unique_seqs = set(all_sequences.values())
            if len(unique_seqs) == 1:
                f.write(f"   ‚úÖ ALL IDENTICAL\n")
            else:
                f.write(f"   ‚ö†Ô∏è  {len(unique_seqs)} different sequences detected\n")

        if issues:
            f.write(f"\n‚ö†Ô∏è  ISSUES ({len(issues)}):\n")
            for issue in issues:
                f.write(f"   - {issue}\n")
        else:
            f.write("\n‚úÖ No issues\n")

        f.write("\n\nVARIANT STATISTICS\n")
        f.write("-"*80 + "\n")
        f.write(f"Position range: {df['protein_pos'].min()} - {df['protein_pos'].max()}\n")
        f.write(f"Unique positions: {df['protein_pos'].nunique()}\n")
        f.write(f"Unique mutations: {df['mutation'].nunique()}\n")

        if 'AF' in df.columns:
            af_series = pd.to_numeric(df['AF'], errors='coerce')
            f.write(f"\nAllele Frequency Distribution:\n")
            # Half-open buckets [lo, hi) so that every numeric AF lands in
            # exactly one of them.
            f.write(f"   Ultra-rare (< 0.00001): {(af_series < 0.00001).sum()}\n")
            f.write(f"   Rare (0.00001-0.001): {((af_series >= 0.00001) & (af_series < 0.001)).sum()}\n")
            f.write(f"   Low-frequency (0.001-0.01): {((af_series >= 0.001) & (af_series < 0.01)).sum()}\n")
            f.write(f"   Common (> 0.01): {(af_series >= 0.01).sum()}\n")

        f.write("\n\nFINAL VERDICT\n")
        f.write("="*80 + "\n")

        # Null-count errors are reported above but do not block the verdict.
        critical_errors = [e for e in errors if 'null' not in e.lower()]

        if not critical_errors and not issues:
            f.write("üéâ ‚úÖ DATA IS VALIDATED (FULLY OFFLINE)\n\n")
            f.write("Your data is:\n")
            f.write("  ‚úì Internally consistent\n")
            if all_sequences:
                f.write("  ‚úì Externally validated (")
                f.write(", ".join(all_sequences.keys()))
                f.write(" - all from local caches)\n")
            f.write("  ‚úì Ready for production use\n")
            f.write("  ‚úì Safe for downstream analysis\n")
            if warnings:
                f.write(f"\nNote: {len(warnings)} warnings (see above)\n")
        else:
            f.write("‚ö†Ô∏è  VALIDATION ISSUES DETECTED\n\n")
            f.write("Please review errors and issues above.\n")

    print(f"‚úÖ Report saved: {report_path}")


# ============================================================================
# MAIN PIPELINE
# ============================================================================

def main():
    """Run the offline pipeline end to end and report the outcome.

    Loads the local reference data, fetches and processes the gnomAD
    variants for ``GENE_SYMBOL``, runs the internal and external validation
    passes, adds structure IDs, writes the validation report, and saves the
    resulting table as CSV and JSON. A console summary and a short sample of
    the table are printed at the end.

    Returns:
        bool: True when every step ran without raising; False when an
        exception was caught (its traceback is printed). Validation findings
        by themselves do not change the return value.
    """
    rule = "=" * 80

    print(f"\n{rule}")
    print("HBB COMPLETE VALIDATION PIPELINE (FULLY OFFLINE)")
    print(rule)
    print(f"\nGene: {GENE_SYMBOL}")
    print(f"Output: {OUTPUT_CSV}")
    print("\nüåê Network requirements: NONE (all validation is offline)")
    print(f"\n{rule}")

    try:
        # Reference files must be loaded before any validation step runs.
        initialize_reference_data()

        # Steps 1-2: fetch the raw variants, then derive protein-level rows.
        raw_variants = fetch_gnomad_data(GENE_SYMBOL)
        variants = process_protein_level(raw_variants)

        # Steps 3-4: internal consistency checks, then checks against the
        # local reference databases.
        internal = validate_internal(variants)
        external = validate_external(variants, GENE_SYMBOL)

        # Step 5: annotate rows using the UniProt ID found during external
        # validation.
        variants = add_structure_ids(variants, external['uniprot_id'])

        # Step 6: write the text report.
        generate_report(variants, internal, external, VALIDATION_REPORT)

        # Persist the final table in both formats.
        print(f"\n{rule}")
        print("SAVING FINAL DATA")
        print(rule)

        variants.to_csv(OUTPUT_CSV, index=False)
        print(f"‚úÖ CSV: {OUTPUT_CSV}")

        variants.to_json(OUTPUT_JSON, orient='records', indent=2)
        print(f"‚úÖ JSON: {OUTPUT_JSON}")

        # Console summary.
        print(f"\n{rule}")
        print("PIPELINE COMPLETE (FULLY OFFLINE)")
        print(rule)

        error_list = internal['errors']
        warning_list = internal['warnings']
        issue_list = external['issues']
        sequences = external.get('all_sequences', {})

        # Errors whose text mentions "null" are not counted as critical.
        blocking = [msg for msg in error_list if 'null' not in msg.lower()]

        if blocking or issue_list:
            print("\n‚ö†Ô∏è  VALIDATION COMPLETED WITH ISSUES")
            print(f"\n   Critical errors: {len(blocking)}")
            print(f"   Warnings: {len(warning_list)}")
            print(f"   Issues: {len(issue_list)}")
            print(f"\n   See report: {VALIDATION_REPORT}")
        else:
            print("\nüéâ ‚úÖ SUCCESS - DATA IS VALIDATED!")
            print(f"\n   Total variants: {len(variants)}")
            print(f"   UniProt ID: {external['uniprot_id']} (from local cache)")

            # Cross-database agreement is only meaningful with 2+ sources.
            if len(sequences) > 1:
                distinct = set(sequences.values())
                if len(distinct) == 1:
                    print(f"   ‚úÖ Sequences validated across {len(sequences)} databases (all local)")
                else:
                    print(f"   ‚ö†Ô∏è  {len(distinct)} different sequences found")

            if warning_list:
                print(f"\n   ‚ö†Ô∏è  {len(warning_list)} warnings (non-critical)")

            print("\n   ‚úÖ SAFE TO PROCEED WITH ANALYSIS")

        print(f"\n{rule}\n")

        # Show the first rows, limited to the columns that actually exist.
        print("üìä Sample output:")
        wanted = ['CHROM', 'POS', 'mutation', 'AF', 'uniprot_id', 'alphafold_id']
        present = [col for col in wanted if col in variants.columns]
        print(variants[present].head(10).to_string(index=False))

    except Exception as exc:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"\n‚ùå PIPELINE FAILED: {exc}")
        import traceback
        traceback.print_exc()
        return False
    else:
        return True


if __name__ == "__main__":
    # Turn the pipeline result into the process exit status
    # (0 = pipeline ran to completion, 1 = an exception was caught).
    success = main()
    # Raise SystemExit directly: the bare ``exit`` helper is installed by the
    # ``site`` module for interactive sessions and is not guaranteed to exist
    # (e.g. under ``python -S`` or in frozen executables).
    raise SystemExit(0 if success else 1)


HBB COMPLETE VALIDATION PIPELINE (FULLY OFFLINE)

Gene: HBB
Output: .cache/HBB_protein_level_validated.csv

🌐 Network requirements: NONE (all validation is offline)


LOADING LOCAL REFERENCE DATA
✅ Found: /jet/home/barazand/NEWOCEAN/ref_data/ensembl/Homo_sapiens.GRCh38.112.gtf.gz
✅ Found: /jet/home/barazand/NEWOCEAN/ref_data/ensembl/Homo_sapiens.GRCh38.pep.all.fa.gz
✅ Found: /jet/home/barazand/NEWOCEAN/ref_data/mane/MANE.GRCh38.v1.3.summary.txt.gz
✅ Found: /jet/home/barazand/NEWOCEAN/ref_data/uniprot/uniprot_sprot.fasta.gz
✅ Found: /jet/home/barazand/NEWOCEAN/ref_data/uniprot/HUMAN_9606_idmapping.dat.gz

📚 Loading Ensembl reference data (GTF + peptides)...
   Parsing GTF: Homo_sapiens.GRCh38.112.gtf.gz
      Found 111983 ENST→ENSP mappings
   Parsing peptide FASTA: Homo_sapiens.GRCh38.pep.all.fa.gz
      Found 246990 ENSP→sequence mappings
   ✅ Built 111983 ENST→sequence mappings

📚 Loading MANE summary...
   Parsing MANE summary: MANE.GRCh38.v1.3.summary.tx

: 