In [None]:
import pandas as pd
import os
import glob

def load_transcript_mapping(mapping_file):
    """Build a transcript -> (gene ID, gene name) lookup from a BioMart CSV.

    The CSV must contain the columns 'Transcript stable ID',
    'Gene stable ID' and 'Gene name'. The column names found in the file
    are printed so a mismatching export is easy to spot.

    Args:
        mapping_file: Path (or anything ``pandas.read_csv`` accepts) of the
            BioMart export.

    Returns:
        dict mapping each transcript ID to a ``(gene_id, gene_name)`` tuple.
        When a transcript ID occurs more than once, the last row wins.
    """
    table = pd.read_csv(mapping_file)
    print("Mapping file columns:", table.columns.tolist())

    transcript_ids = table['Transcript stable ID']
    gene_ids = table['Gene stable ID']
    gene_names = table['Gene name']

    return {
        transcript_id: (gene_id, gene_name)
        for transcript_id, gene_id, gene_name
        in zip(transcript_ids, gene_ids, gene_names)
    }

def _resolve_genes(transcripts, original_gene_ids, transcript_to_gene):
    """Look up each transcript and return per-row gene IDs, names and misses.

    Rows whose transcript is not a key of ``transcript_to_gene`` keep their
    original gene ID and receive the placeholder gene name 'N/A'.

    Returns:
        ``(gene_ids, gene_names, unmapped)`` where the first two are lists
        aligned with ``transcripts`` and ``unmapped`` is the set of
        non-missing transcript IDs that had no mapping.
    """
    gene_ids = []
    gene_names = []
    unmapped = set()
    for transcript_id, original_gene_id in zip(transcripts, original_gene_ids):
        hit = transcript_to_gene.get(transcript_id)
        if hit is not None:
            gene_id, gene_name = hit
        else:
            gene_id, gene_name = original_gene_id, 'N/A'
            # Blank Transcript cells are read as NaN. They are not IDs, and
            # NaN cannot be ordered against strings when the set is sorted,
            # so they are left out of the unmapped report.
            if not pd.isna(transcript_id):
                unmapped.add(transcript_id)
        gene_ids.append(gene_id)
        gene_names.append(gene_name)
    return gene_ids, gene_names, unmapped


def process_vep_file(input_file, transcript_to_gene, output_dir):
    """Annotate one VEP CSV with gene IDs and names and write the results.

    Adds two columns to the table read from ``input_file``:

    * ``Gene_ID``   - the mapped gene ID, or the original ``Gene`` value
      when the transcript has no mapping;
    * ``Gene_Name`` - the mapped gene name, or 'N/A' when unmapped.

    Two files are written into ``output_dir`` (created if missing):
    ``annotated_<input name>`` with the annotated table, and
    ``unmapped_transcripts_<input name>.txt`` listing the unique unmapped
    transcript IDs, one per line. Summary statistics are printed.

    Args:
        input_file: Path of the VEP CSV; must have 'Transcript' and 'Gene'
            columns.
        transcript_to_gene: dict of transcript ID -> (gene ID, gene name).
        output_dir: Directory receiving the output files.

    Returns:
        The annotated DataFrame, or None if the input could not be read.

    Raises:
        KeyError: If the 'Gene' or 'Transcript' column is absent.
    """
    print(f"Processing {input_file}...")

    # Reading is best-effort: an unreadable file is reported and skipped.
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Error reading file {input_file}: {e}")
        return None

    # Work on plain lists instead of iterrows()/df.at: one pass, no per-row
    # Series construction, and no dtype upcasting on cell assignment.
    original_gene_ids = df['Gene'].tolist()
    transcripts = df['Transcript'].tolist()
    gene_ids, gene_names, unmapped_transcripts = _resolve_genes(
        transcripts, original_gene_ids, transcript_to_gene
    )
    df['Gene_ID'] = gene_ids
    df['Gene_Name'] = gene_names

    # Make sure the destination exists so the function works standalone.
    # An empty output_dir means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Create output filename
    base_name = os.path.basename(input_file)
    output_file = os.path.join(output_dir, f"annotated_{base_name}")

    # Save annotated file
    df.to_csv(output_file, index=False)
    print(f"Saved annotated file to {output_file}")

    # Print summary statistics (counts are per row of the input table)
    total_transcripts = len(df)
    mapped_transcripts = int((df['Gene_Name'] != 'N/A').sum())
    # A header-only file has zero rows; report 0% instead of dividing by 0.
    if total_transcripts:
        mapping_rate = mapped_transcripts / total_transcripts * 100
    else:
        mapping_rate = 0.0
    print(f"Total transcripts: {total_transcripts}")
    print(f"Mapped transcripts: {mapped_transcripts}")
    print(f"Mapping rate: {mapping_rate:.2f}%")
    print(f"Number of unique unmapped transcripts: {len(unmapped_transcripts)}")

    # Save unmapped transcripts to a file
    unmapped_file = os.path.join(output_dir, f"unmapped_transcripts_{base_name}.txt")
    with open(unmapped_file, 'w') as f:
        f.writelines(f"{transcript}\n" for transcript in sorted(unmapped_transcripts))
    print(f"Saved unmapped transcripts to {unmapped_file}\n")

    # Print first few rows of output for verification
    print("\nFirst few rows of annotated data:")
    print(df[['Transcript', 'Gene_ID', 'Gene_Name']].head())

    return df

def main(input_dir=".", output_dir="annotated_vep",
         mapping_file="transcript_to_gene_mapping.csv",
         file_pattern="d[A-E]_W[1-3]_filtered_vep_results.csv"):
    """Annotate every matching VEP file in a directory with gene information.

    Loads the transcript-to-gene mapping once, then runs
    ``process_vep_file`` on each file in ``input_dir`` whose name matches
    ``file_pattern``. Called with no arguments it behaves as the original
    script did.

    Args:
        input_dir: Directory searched for VEP result files.
        output_dir: Directory for annotated output; created if missing.
        mapping_file: BioMart CSV passed to ``load_transcript_mapping``.
        file_pattern: glob pattern (relative to ``input_dir``) selecting the
            VEP files to process.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load transcript to gene mapping
    print("Loading transcript to gene mapping...")
    transcript_to_gene = load_transcript_mapping(mapping_file)
    print(f"Loaded {len(transcript_to_gene)} transcript mappings")

    # glob returns matches in arbitrary order; sort so that runs (and their
    # logs) are reproducible.
    vep_files = sorted(glob.glob(os.path.join(input_dir, file_pattern)))
    print(f"Found {len(vep_files)} VEP files to process")

    if not vep_files:
        print("No VEP files found matching the pattern!")
        return

    # Process each file
    for vep_file in vep_files:
        process_vep_file(vep_file, transcript_to_gene, output_dir)

    print("Processing complete!")

# Run the annotation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()