In [None]:
import requests
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import os
import glob
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- Helper Functions ---

def parse_vcf_to_hgvs(vcf_file):
    """
    Parse a VCF file and build one variant record per data line.

    Every record is rendered with the substitution template
    ``{chrom}:g.{pos}{ref}>{alt}``, whatever the allele lengths are, so
    indels and multi-allelic ALT values are emitted verbatim in that form.

    :param vcf_file: Path to a tab-separated VCF file with a '#CHROM' header line
    :return: List of dicts with the keys 'hgvs', 'chromosome', 'position',
             'ref_allele', 'alt_allele', 'variant_type' and
             'variant_classification'. The last two come from the INFO keys
             'VT' and 'VC' and default to 'Unknown'. The list is empty when
             the file has a header but no variant lines.
    :raises pandas.errors.EmptyDataError: if no '#CHROM' header line is found
    """
    with open(vcf_file, 'r') as f:
        # Skip the '##' meta lines; the column names live on the '#CHROM' line
        header = None
        for line in f:
            if line.startswith('#CHROM'):
                header = line.strip().split('\t')
                break

        if header is None:
            raise pd.errors.EmptyDataError(
                f"No '#CHROM' header line found in {vcf_file}"
            )

        # Everything after the header line is the tab-separated variant table
        try:
            df = pd.read_csv(f, sep='\t', names=header)
        except pd.errors.EmptyDataError:
            # Header only: a valid VCF that simply contains no variants
            df = pd.DataFrame(columns=header)

    # A VCF without an INFO column still yields records, just without VT/VC
    info_column = df['INFO'] if 'INFO' in df.columns else [None] * len(df)

    variants_info = []
    for chrom, pos, ref, alt, info in zip(
        df['#CHROM'], df['POS'], df['REF'], df['ALT'], info_column
    ):
        # INFO is 'key=value;key=value;FLAG'. Split on the first '=' only so a
        # value that itself contains '=' does not invalidate the whole field.
        info_dict = {}
        if isinstance(info, str):
            for item in info.split(';'):
                key, separator, value = item.partition('=')
                if separator:
                    info_dict[key] = value

        variants_info.append({
            'hgvs': f"{chrom}:g.{pos}{ref}>{alt}",
            'chromosome': chrom,
            'position': pos,
            'ref_allele': ref,
            'alt_allele': alt,
            'variant_type': info_dict.get('VT', 'Unknown'),
            'variant_classification': info_dict.get('VC', 'Unknown'),
        })

    print(f"Processed {len(variants_info)} variants from {vcf_file}")
    return variants_info

def query_ensembl_vep_batch(variants, timeout=60):
    """
    Query the Ensembl VEP REST API for a batch of variants (GRCh38).

    :param variants: List of HGVS notation strings sent as 'hgvs_notations'
    :param timeout: Seconds to wait for the server before giving up. Without
                    a timeout a stalled connection would block the calling
                    thread indefinitely.
    :return: The decoded JSON body of the response
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if the server does not answer within ``timeout``
    """
    server = "https://rest.ensembl.org"
    ext = "/vep/human/hgvs"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # NOTE(review): confirm the endpoint honours the 'fields' key; it is sent
    # unchanged from the original request payload.
    data = {
        "hgvs_notations": variants,
        "fields": ["canonical", "biotype", "consequence_terms", "transcript_id", "gene_symbol", "protein_id"]
    }

    response = requests.post(
        f"{server}{ext}",
        headers=headers,
        data=json.dumps(data),
        timeout=timeout,
    )
    if not response.ok:
        # Surface the server's explanation before raising; the exception
        # message alone does not include the response body.
        print(f"Error: {response.status_code}, {response.text}")
        response.raise_for_status()

    return response.json()

def query_with_retries(batch, retries=3, backoff=1.0):
    """
    Query Ensembl VEP for a single batch with retry logic.

    Failed attempts are retried after an exponentially growing pause
    (``backoff``, ``2 * backoff``, ``4 * backoff``, ...) so that a struggling
    or rate-limiting server is not hit again immediately.

    :param batch: List of HGVS notation strings passed to query_ensembl_vep_batch
    :param retries: Total number of attempts; must be at least 1
    :param backoff: Pause in seconds before the first retry; 0 disables waiting
    :return: Whatever query_ensembl_vep_batch returns for the batch
    :raises ValueError: if ``retries`` is smaller than 1
    :raises Exception: the error of the last attempt, once all attempts failed
    """
    import time  # local import: only this helper needs it

    if retries < 1:
        # Zero attempts would fall through and return None without querying
        raise ValueError(f"retries must be at least 1, got {retries}")

    for attempt in range(retries):
        try:
            return query_ensembl_vep_batch(batch)
        except Exception as e:
            if attempt == retries - 1:
                print(f"Failed after {retries} attempts: {e}")
                raise
            print(f"Retrying batch... Attempt {attempt + 2}")
            time.sleep(backoff * (2 ** attempt))

def parallel_batch_query_ensembl_vep(variants, batch_size=200, max_workers=4):
    """
    Parallelize Ensembl VEP batch queries using ThreadPoolExecutor.

    Batches run concurrently, but their results are concatenated in
    submission order, so the output follows the order of ``variants``.
    A batch that still fails after its retries is reported and skipped;
    its variants are then absent from the returned list.

    :param variants: List of HGVS notation strings
    :param batch_size: Number of variants per request; must be at least 1
    :param max_workers: Number of worker threads
    :return: List with the results of all successful batches
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        # A negative step would yield no batches and silently drop everything
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    batches = [variants[i:i + batch_size] for i in range(0, len(variants), batch_size)]
    all_results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [(executor.submit(query_with_retries, batch), batch) for batch in batches]

        # Walk the futures in submission order rather than completion order
        # so that results stay aligned with the input variants.
        for future, batch in submitted:
            try:
                all_results.extend(future.result())
            except Exception as e:
                print(f"Error processing batch: {batch[:5]}... -> {e}")

    return all_results

def parse_vep_results(results, sample_name, variant_info):
    """
    Parse the VEP API results and return a DataFrame.

    One row is produced per transcript consequence. Each result is paired
    with its VCF record through the HGVS string in the result's 'input'
    field, so reordered or missing results cannot be attributed to the wrong
    variant. A result whose 'input' matches no record falls back to the
    record at the same position; it is skipped if there is none.

    :param results: List of VEP result dicts
    :param sample_name: Value written to the 'Sample' column
    :param variant_info: Variant dicts as returned by parse_vcf_to_hgvs
    :return: DataFrame with a fixed set of columns, present even when there
             are no rows
    """
    columns = [
        "Sample", "Variant", "Chromosome", "Position", "Reference_Allele",
        "Alternative_Allele", "Variant_Type", "Variant_Classification",
        "Gene", "Transcript", "Biotype", "VEP_Consequence", "Protein",
    ]

    variant_info = list(variant_info)
    info_by_hgvs = {}
    for info in variant_info:
        if 'hgvs' in info:
            # Keep the first record when the same variant is listed twice
            info_by_hgvs.setdefault(info['hgvs'], info)

    parsed_data = []
    for index, result in enumerate(results):
        variant = result.get("input")
        var_info = info_by_hgvs.get(variant)
        if var_info is None:
            if index >= len(variant_info):
                continue
            var_info = variant_info[index]

        # Record-level gene symbol, used only if a transcript carries none
        gene = result.get("gene_symbol", "N/A")

        for transcript in result.get("transcript_consequences", []):
            # Guard against a missing or empty list of consequence terms
            consequence_terms = transcript.get("consequence_terms") or ["N/A"]
            parsed_data.append({
                "Sample": sample_name,
                "Variant": variant,
                "Chromosome": var_info['chromosome'],
                "Position": var_info['position'],
                "Reference_Allele": var_info['ref_allele'],
                "Alternative_Allele": var_info['alt_allele'],
                "Variant_Type": var_info['variant_type'],
                "Variant_Classification": var_info['variant_classification'],
                # VEP reports the gene symbol on each transcript consequence
                "Gene": transcript.get("gene_symbol", gene),
                "Transcript": transcript.get("transcript_id"),
                "Biotype": transcript.get("biotype"),
                "VEP_Consequence": consequence_terms[0],
                "Protein": transcript.get("protein_id", "N/A"),
            })

    return pd.DataFrame(parsed_data, columns=columns)

def plot_consequence_distribution(df, title, output_file):
    """
    Plot the distribution of variant consequences using seaborn.

    Bars are ordered from the most to the least frequent consequence. The
    figure is saved to ``output_file``, shown, and then closed so that
    calling this once per sample does not accumulate open figures.

    :param df: Pandas DataFrame with a 'VEP_Consequence' column
    :param title: Title drawn above the plot
    :param output_file: Path to save the plot image
    :return: None. Nothing is plotted or written when ``df`` has no rows or
             lacks the 'VEP_Consequence' column.
    """
    if "VEP_Consequence" not in df.columns or df.empty:
        print(f"No VEP consequences to plot for '{title}'; skipping {output_file}")
        return

    fig = plt.figure(figsize=(10, 6))
    try:
        sns.countplot(data=df, y="VEP_Consequence",
                      order=df["VEP_Consequence"].value_counts().index,
                      palette="coolwarm")
        plt.title(title, fontsize=16)
        plt.xlabel("Count", fontsize=14)
        plt.ylabel("Consequence", fontsize=14)
        plt.tight_layout()
        plt.savefig(output_file)
        plt.show()
    finally:
        # Release the figure even if drawing or saving failed
        plt.close(fig)

# --- Main Script ---

from tqdm import tqdm

# Input directory containing VCF files
input_directory = "./"  # Replace with the VCF files directory

# Get the list of all VCF files in the directory; sorted so that the
# processing order (and the row order of the combined CSV) is reproducible
vcf_files = sorted(glob.glob(os.path.join(input_directory, "*.vcf")))

# Check if VCF files are found
if not vcf_files:
    raise ValueError(f"No VCF files found in the directory: {input_directory}")

print(f"VCF files to process: {len(vcf_files)} files")

combined_results = []

# Initialize the progress bar for files
file_progress = tqdm(vcf_files, desc="Processing VCF Files", unit="file")

for vcf_file in file_progress:
    sample_name = os.path.splitext(os.path.basename(vcf_file))[0]

    # Parse VCF file and get all variant information
    variants_info = parse_vcf_to_hgvs(vcf_file)

    # Extract just HGVS notations for VEP query
    hgvs_variants = [v['hgvs'] for v in variants_info]

    # Query Ensembl VEP API with parallel processing
    print(f"Processing {len(hgvs_variants)} variants for sample: {sample_name}")
    batch_progress = tqdm(
        total=len(hgvs_variants),
        desc=f"Annotating {sample_name}",
        unit="variant",
    )

    def progress_tracking_batch(batch):
        # Advance the per-sample bar once the batch has been annotated
        result = query_with_retries(batch)
        batch_progress.update(len(batch))
        return result

    batches = [hgvs_variants[i:i + 200] for i in range(0, len(hgvs_variants), 200)]
    vep_results = []
    try:
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(progress_tracking_batch, batch) for batch in batches]
            # Collect in submission order, not completion order:
            # parse_vep_results is handed vep_results and variants_info
            # side by side, so both must follow the same variant order.
            for future in futures:
                vep_results.extend(future.result())
    finally:
        # Close the bar even when a batch ultimately fails
        batch_progress.close()

    # Parse VEP results with additional information
    df_sample = parse_vep_results(vep_results, sample_name, variants_info)

    # Save individual sample results to CSV
    sample_results_file = f"{sample_name}_vep_results.csv"
    df_sample.to_csv(sample_results_file, index=False)
    print(f"Saved sample results to {sample_results_file}")

    # Plot consequences for the sample
    plot_consequence_distribution(df_sample, f"Variant Consequences for {sample_name}", f"{sample_name}_consequence_plot.png")

    # Add the sample results to the combined DataFrame
    combined_results.append(df_sample)

file_progress.close()

# Combine results from all samples into one DataFrame
df_combined = pd.concat(combined_results, ignore_index=True)

# Save combined results to a CSV file
df_combined_file = "./combined_vep_results.csv"
df_combined.to_csv(df_combined_file, index=False)
print(f"Saved combined results to {df_combined_file}")

# Plot combined consequences
plot_consequence_distribution(df_combined, "Combined Variant Consequences for All Samples", "./combined_consequence_plot.png")


In [None]:
def classify_nucleotide_change(variant):
    """
    Extract the specific nucleotide change (e.g., T>C, C>T) from the Variant column.

    The reference allele is the run of nucleotide letters directly before the
    first '>', so multi-base references are kept whole. The alternate allele
    is everything after that first '>', which keeps symbolic alleles such as
    '<DEL>' intact.

    :param variant: Variant string in the form '1:g.809687G>C'
    :return: Nucleotide change (e.g., G>C, C>T), or 'Unknown' when the value
             is not a string or does not contain a REF>ALT pair
    """
    if not isinstance(variant, str):
        # Missing values read back from CSV arrive as float NaN
        return "Unknown"

    prefix, separator, alt = variant.partition('>')
    # Whatever rstrip removes from the end of the prefix is the REF allele
    ref = prefix[len(prefix.rstrip("ACGTNacgtn")):]

    if not separator or not ref or not alt:
        return "Unknown"
    return f"{ref}>{alt}"

def plot_nucleotide_change_distribution(df, output_file):
    """
    Draw a horizontal count plot of the values in 'Nucleotide_Change'.

    Bars are ordered from the most to the least frequent change. The figure
    is written to ``output_file`` and then displayed.

    :param df: Pandas DataFrame with a 'Nucleotide_Change' column
    :param output_file: Path to save the plot image
    """
    plt.figure(figsize=(8, 6))

    # value_counts() sorts descending, so its index doubles as the bar order
    change_order = df["Nucleotide_Change"].value_counts().index
    sns.countplot(
        data=df,
        y="Nucleotide_Change",
        palette="coolwarm",
        order=change_order,
    )

    plt.title("Nucleotide Change Distribution", fontsize=16)
    for set_axis_label, text in ((plt.xlabel, "Count"), (plt.ylabel, "Nucleotide Change")):
        set_axis_label(text, fontsize=14)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()
    
# Read the combined VEP results back from disk
combined_vep_file = "combined_vep_results.csv"  # Replace with the path to your combined VEP CSV file
df_combined = pd.read_csv(filepath_or_buffer=combined_vep_file)

# Derive the REF>ALT label (e.g., T>C, C>T) of every row from its Variant string
df_combined['Nucleotide_Change'] = df_combined['Variant'].apply(
    classify_nucleotide_change
)

# Persist the extended table, then plot how often each change occurs
df_combined.to_csv(path_or_buf="combined_vep_with_nucleotide_changes.csv", index=False)
plot_nucleotide_change_distribution(
    df=df_combined,
    output_file="combined_nucleotide_change_plot.png",
)

# Preview the first rows of the new column next to its source column
print(df_combined.loc[:, ['Variant', 'Nucleotide_Change']].head())

In [None]:
def plot_transcript_mutation_distribution(df, output_file):
    """
    Draw a horizontal bar chart of the 20 transcripts with the most rows.

    The figure is written to ``output_file`` and then displayed.

    :param df: Pandas DataFrame containing VEP results with a 'Transcript' column
    :param output_file: Path to save the plot image
    """
    plt.figure(figsize=(10, 6))

    # Rows per transcript, reduced to the 20 largest counts
    transcript_counts = df['Transcript'].value_counts()
    top_transcripts = transcript_counts.nlargest(20)

    sns.barplot(
        x=top_transcripts.values,
        y=top_transcripts.index,
        palette="coolwarm",
    )

    plt.title("Top 20 Transcripts Affected by Mutations", fontsize=16)
    for set_axis_label, text in ((plt.xlabel, "Mutation Count"), (plt.ylabel, "Transcript")):
        set_axis_label(text, fontsize=14)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

# Render the per-transcript mutation counts for the combined results
plot_transcript_mutation_distribution(
    df=df_combined,
    output_file="transcript_mutation_distribution.png",
)


In [None]:
def plot_shared_mutations(df, output_file):
    """
    Plot the distribution of mutations shared across samples.

    For every variant the number of distinct samples carrying it is counted,
    and the histogram shows how many variants fall on each sample count.
    The VEP table holds one row per variant *and* transcript, so counting
    rows would inflate the numbers for variants that hit many transcripts.

    :param df: Pandas DataFrame containing VEP results with a 'Variant'
               column and, ideally, a 'Sample' column. Without 'Sample' the
               plain row count per variant is used instead.
    :param output_file: Path to save the plot image
    """
    if "Sample" in df.columns:
        shared_mutations = df.groupby("Variant")["Sample"].nunique()
    else:
        # No sample information available: fall back to rows per variant
        shared_mutations = df['Variant'].value_counts()

    plt.figure(figsize=(10, 6))
    sns.histplot(shared_mutations, bins=30, kde=False, color='skyblue')
    plt.title("Distribution of Shared Mutations Across Samples", fontsize=16)
    plt.xlabel("Number of Samples Sharing a Mutation", fontsize=14)
    plt.ylabel("Count of Mutations", fontsize=14)
    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

# Render the shared-mutation histogram for the combined results
plot_shared_mutations(
    df=df_combined,
    output_file="shared_mutations_distribution.png",
)


In [None]:
def plot_biotype_distribution(df, output_file):
    """
    Draw a horizontal count plot of the 'Biotype' values in the VEP results.

    Bars are ordered from the most to the least frequent biotype. The figure
    is written to ``output_file`` and then displayed.

    :param df: Pandas DataFrame containing VEP results with a 'Biotype' column
    :param output_file: Path to save the plot image
    """
    plt.figure(figsize=(10, 6))

    # value_counts() sorts descending, so its index doubles as the bar order
    biotype_order = df["Biotype"].value_counts().index
    sns.countplot(
        data=df,
        y="Biotype",
        palette="coolwarm",
        order=biotype_order,
    )

    plt.title("Biotype Distribution of Variants", fontsize=16)
    for set_axis_label, text in ((plt.xlabel, "Count"), (plt.ylabel, "Biotype")):
        set_axis_label(text, fontsize=14)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

# Optional: uncomment to reload the combined VEP results from disk first
#combined_vep_file = "combined_vep_results.csv"  # Replace with your file path
#df_combined = pd.read_csv(combined_vep_file)

# Render the biotype distribution for the combined results
plot_biotype_distribution(
    df=df_combined,
    output_file="biotype_distribution_plot.png",
)