# VCF File Parser

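This notebook implements a parser for Variant Call Format (VCF) files, the tab-delimited text format used in bioinformatics to store genetic sequence variants (SNPs, insertions, deletions, and other variation) together with optional per-sample genotype data.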

## Features
- Parse standard VCF files
- Handle gzip-compressed VCF files (.vcf.gz)
- Exception handling for robust error management
- Unit tests for validation
- Visualizations of variant data

In [None]:
import gzip
import os
import unittest
from io import StringIO
from typing import Dict, List, Optional, Tuple, Union
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's white-grid style and a
# 10x6 inch default figure size for every figure created afterwards.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

## Custom Exceptions

In [None]:
class VCFError(Exception):
    """Root of the parser's exception hierarchy; catch it to handle any VCF failure."""


class VCFFileNotFoundError(VCFError):
    """Signals that the path handed to the parser does not exist."""


class VCFFormatError(VCFError):
    """Signals that the content of a VCF file violates the expected format."""


class VCFHeaderError(VCFError):
    """Signals an absent or malformed header in a VCF file."""

## VCF Parser Class

In [None]:
class VCFParser:
    """
    A parser for Variant Call Format (VCF) files.

    Supports both uncompressed (.vcf) and gzip-compressed (.vcf.gz) files, as
    well as in-memory text via ``parse_string``. Every parse starts from a
    clean slate, so a single instance can safely be reused for several inputs.

    Attributes:
        file_path: Path of the most recently parsed file (as a string), or
            None when the data came from ``parse_string`` / nothing was parsed.
        metadata: Dictionary mapping each ``##`` key to the list of values
            seen for it, in file order.
        header: List of column names from the #CHROM line
        variants: DataFrame with one row per variant record. POS and QUAL are
            converted to numbers; values that cannot be converted become NaN.
        samples: List of sample names (the columns after FORMAT), if present
    """

    MANDATORY_COLUMNS = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']

    # First two bytes of every gzip stream.
    _GZIP_MAGIC = b'\x1f\x8b'

    def __init__(self, file_path: Optional[Union[str, os.PathLike]] = None):
        """
        Initialize the VCF parser.

        Args:
            file_path: Optional path (string or path-like). When given and
                truthy, the file is parsed immediately.

        Raises:
            VCFError: Any error raised by ``parse`` for the given file.
        """
        self.file_path = file_path
        self.metadata: Dict[str, List[str]] = {}
        self.header: List[str] = []
        self.variants: pd.DataFrame = pd.DataFrame()
        self.samples: List[str] = []

        if file_path:
            self.parse(file_path)

    def _reset_state(self) -> None:
        """Drop everything learned from a previous parse."""
        self.metadata = {}
        self.header = []
        self.samples = []
        self.variants = pd.DataFrame()

    def _is_gzipped(self, file_path: str) -> bool:
        """
        Check if the file is gzip-compressed.

        The file's leading magic bytes decide whenever the file can be read,
        so a mis-named file is still opened correctly. The ``.gz`` extension
        is only used as a fallback when the file cannot be opened.
        """
        try:
            with open(file_path, 'rb') as f:
                return f.read(2) == self._GZIP_MAGIC
        except OSError:
            return os.fspath(file_path).endswith('.gz')

    def _open_file(self, file_path: str):
        """
        Open a VCF file as UTF-8 text, handling both compressed and
        uncompressed formats.

        Raises:
            VCFFileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise VCFFileNotFoundError(f"VCF file not found: {file_path}")

        if self._is_gzipped(file_path):
            return gzip.open(file_path, 'rt', encoding='utf-8')
        return open(file_path, 'r', encoding='utf-8')

    def _parse_metadata_line(self, line: str) -> Tuple[str, str]:
        """
        Parse a metadata line (## prefix).

        Returns:
            ``(key, value)`` split on the first ``=``; the value is an empty
            string when the line has no ``=``.
        """
        line = line.lstrip('#')
        if '=' in line:
            key, value = line.split('=', 1)
            return key.strip(), value.strip()
        return line.strip(), ''

    def _parse_header_line(self, line: str) -> List[str]:
        """
        Parse the header line (#CHROM line) into its column names.

        Raises:
            VCFHeaderError: If the line does not start with ``#CHROM`` or a
                mandatory column is missing.
        """
        if not line.startswith('#CHROM'):
            raise VCFHeaderError("Header line must start with #CHROM")

        columns = line.lstrip('#').strip().split('\t')

        # Validate mandatory columns
        for col in self.MANDATORY_COLUMNS:
            if col not in columns:
                raise VCFHeaderError(f"Missing mandatory column: {col}")

        return columns

    def _parse_variant_line(self, line: str) -> List[str]:
        """
        Split a single variant data line into its tab-separated fields.

        Raises:
            VCFFormatError: If the line has fewer fields than there are
                mandatory columns, or (once a header is known) a different
                number of fields than the header declares.
        """
        fields = line.strip().split('\t')
        minimum = len(self.MANDATORY_COLUMNS)
        if len(fields) < minimum:
            raise VCFFormatError(
                f"Variant line has {len(fields)} fields, expected at least {minimum}"
            )
        # A row that disagrees with the header would otherwise surface later
        # as an opaque pandas error (or be silently padded with None).
        if self.header and len(fields) != len(self.header):
            raise VCFFormatError(
                f"Variant line has {len(fields)} fields, "
                f"but the header declares {len(self.header)} columns"
            )
        return fields

    def _read_records(self, lines) -> List[List[str]]:
        """
        Consume an iterable of text lines, filling metadata/header/samples.

        Args:
            lines: Any iterable yielding one line of VCF text at a time
                (an open text file, a StringIO, a list of strings).

        Returns:
            The variant rows, each as a list of string fields.

        Raises:
            VCFHeaderError: If data precedes the header, the header is
                malformed, or no header line exists at all.
            VCFFormatError: If a variant line is malformed (message includes
                the 1-based line number).
        """
        records: List[List[str]] = []
        header_found = False

        for line_number, raw_line in enumerate(lines, start=1):
            line = raw_line.strip()
            if not line:
                continue

            # Metadata lines (##)
            if line.startswith('##'):
                key, value = self._parse_metadata_line(line)
                self.metadata.setdefault(key, []).append(value)

            # Header line (#CHROM)
            elif line.startswith('#'):
                self.header = self._parse_header_line(line)
                header_found = True

                # Sample names are the columns after FORMAT
                if 'FORMAT' in self.header:
                    format_idx = self.header.index('FORMAT')
                    self.samples = self.header[format_idx + 1:]
                else:
                    self.samples = []

            # Variant data lines
            else:
                if not header_found:
                    raise VCFHeaderError(f"Data found before header at line {line_number}")
                try:
                    records.append(self._parse_variant_line(line))
                except VCFFormatError as e:
                    raise VCFFormatError(f"Error at line {line_number}: {e}") from e

        if not header_found:
            raise VCFHeaderError("No header line found in VCF file")

        return records

    def _build_variants(self, records: List[List[str]]) -> None:
        """Turn parsed rows into ``self.variants`` with numeric POS and QUAL."""
        frame = pd.DataFrame(records, columns=self.header)
        if records:
            # Unparseable values (e.g. QUAL '.') become NaN instead of raising
            frame['POS'] = pd.to_numeric(frame['POS'], errors='coerce')
            frame['QUAL'] = pd.to_numeric(frame['QUAL'], errors='coerce')
        self.variants = frame

    def parse(self, file_path: Union[str, os.PathLike]) -> 'VCFParser':
        """
        Parse a VCF file and store the data.

        Any state from an earlier parse is discarded first.

        Args:
            file_path: Path to a plain or gzip-compressed VCF file.

        Returns:
            This parser, to allow call chaining.

        Raises:
            VCFFileNotFoundError: If the file does not exist.
            VCFHeaderError: If the header is missing or malformed.
            VCFFormatError: If a variant line is malformed.
            VCFError: If the file cannot be read or decoded.
        """
        file_path = os.fspath(file_path)
        self.file_path = file_path
        self._reset_state()

        try:
            with self._open_file(file_path) as f:
                records = self._read_records(f)
        except (OSError, EOFError, UnicodeDecodeError) as e:
            # EOFError: truncated gzip stream; UnicodeDecodeError: not UTF-8
            raise VCFError(f"Error reading file {file_path}: {e}") from e

        self._build_variants(records)
        return self

    def parse_string(self, vcf_string: str) -> 'VCFParser':
        """
        Parse VCF data from a string (useful for testing).

        The text is parsed in memory; no temporary file is created, so
        ``file_path`` is set to None.

        Returns:
            This parser, to allow call chaining.

        Raises:
            VCFHeaderError: If the header is missing or malformed.
            VCFFormatError: If a variant line is malformed.
        """
        from io import StringIO

        self.file_path = None
        self._reset_state()
        # newline=None gives the same universal-newline splitting as a text file
        records = self._read_records(StringIO(vcf_string, newline=None))
        self._build_variants(records)
        return self

    def get_variant_count(self) -> int:
        """Return the total number of variants."""
        return len(self.variants)

    def get_chromosomes(self) -> List[str]:
        """Return a list of unique chromosomes, in order of first appearance."""
        if self.variants.empty:
            return []
        return self.variants['CHROM'].unique().tolist()

    def get_variants_by_chromosome(self, chrom: str) -> pd.DataFrame:
        """Return a copy of the variants for a specific chromosome (empty if none)."""
        if self.variants.empty:
            # Nothing parsed yet: there may be no CHROM column to filter on
            return self.variants.copy()
        return self.variants[self.variants['CHROM'] == chrom].copy()

    @staticmethod
    def _classify_allele(ref: str, alt: str) -> str:
        """Return the type label for one REF/ALT allele pair."""
        if alt == '.':
            return 'NO_VARIATION'
        if alt == '*':
            # Allele missing because of an overlapping upstream deletion
            return 'SPANNING_DELETION'
        if alt.startswith('<') and alt.endswith('>'):
            # Symbolic allele such as <DEL>; its length says nothing about the event
            return 'SYMBOLIC'
        if '[' in alt or ']' in alt:
            return 'BREAKEND'
        if len(ref) == 1 and len(alt) == 1:
            return 'SNP'
        if len(ref) > len(alt):
            return 'DELETION'
        if len(ref) < len(alt):
            return 'INSERTION'
        return 'MNP'  # Multi-nucleotide polymorphism

    def get_variant_types(self) -> pd.Series:
        """
        Classify variants by type (SNP, insertion, deletion, etc.).

        Returns:
            A Series aligned with ``variants``. Each value is a label, or a
            comma-separated list of labels (one per ALT allele) for
            multi-allelic records. Labels: SNP, INSERTION, DELETION, MNP,
            NO_VARIATION, SYMBOLIC, SPANNING_DELETION, BREAKEND.
        """
        if self.variants.empty:
            return pd.Series(dtype=str)

        labels = [
            ','.join(self._classify_allele(ref, alt) for alt in alts.split(','))
            for ref, alts in zip(self.variants['REF'], self.variants['ALT'])
        ]
        return pd.Series(labels, index=self.variants.index, dtype=object)

    def get_info_field(self, field: str) -> pd.Series:
        """
        Extract a specific field from the INFO column.

        Returns:
            A Series aligned with ``variants`` holding, per record, the raw
            string value of ``key=value`` entries, True for a bare flag, or
            None when the field is absent.
        """
        def extract(info_str):
            if pd.isna(info_str) or info_str == '.':
                return None

            for item in info_str.split(';'):
                if '=' in item:
                    key, value = item.split('=', 1)
                    if key == field:
                        return value
                elif item == field:
                    return True  # Flag field
            return None

        if self.variants.empty:
            return pd.Series(dtype=object)

        return self.variants['INFO'].apply(extract)

    def summary(self) -> Dict:
        """Return a summary of the VCF file as a plain dictionary."""
        chromosomes = self.get_chromosomes()
        return {
            'file_path': self.file_path,
            'total_variants': self.get_variant_count(),
            'chromosomes': chromosomes,
            'num_chromosomes': len(chromosomes),
            'samples': self.samples,
            'num_samples': len(self.samples),
            'metadata_keys': list(self.metadata.keys())
        }

## Unit Tests

In [None]:
class TestVCFParser(unittest.TestCase):
    """Unit tests for the VCF parser.

    Covers the happy path (records, metadata, samples, INFO extraction,
    type classification, gzip input) and the error paths (missing file,
    missing/short header, malformed variant line).
    """

    def setUp(self):
        """Set up test fixtures (in-memory VCF documents)."""
        self.valid_vcf = """##fileformat=VCFv4.2
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##FILTER=<ID=PASS,Description="All filters passed">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
chr1\t100\trs123\tA\tG\t30\tPASS\tDP=100
chr1\t200\trs456\tC\tT\t25\tPASS\tDP=50
chr2\t300\t.\tGG\tG\t20\tPASS\tDP=75
"""

        self.vcf_with_samples = """##fileformat=VCFv4.2
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\tSAMPLE2
chr1\t100\trs123\tA\tG\t30\tPASS\tDP=100\tGT:DP\t0/1:30\t1/1:40
"""

        self.invalid_vcf_no_header = """##fileformat=VCFv4.2
chr1\t100\trs123\tA\tG\t30\tPASS\tDP=100
"""

        self.invalid_vcf_missing_columns = """##fileformat=VCFv4.2
#CHROM\tPOS\tID\tREF
chr1\t100\trs123\tA
"""

    def test_parse_valid_vcf(self):
        """Test parsing a valid VCF string."""
        parser = VCFParser()
        parser.parse_string(self.valid_vcf)

        self.assertEqual(parser.get_variant_count(), 3)
        self.assertEqual(set(parser.get_chromosomes()), {'chr1', 'chr2'})

    def test_parse_metadata(self):
        """Test metadata parsing."""
        parser = VCFParser()
        parser.parse_string(self.valid_vcf)

        self.assertIn('fileformat', parser.metadata)
        self.assertEqual(parser.metadata['fileformat'][0], 'VCFv4.2')

    def test_parse_samples(self):
        """Test sample extraction."""
        parser = VCFParser()
        parser.parse_string(self.vcf_with_samples)

        self.assertEqual(parser.samples, ['SAMPLE1', 'SAMPLE2'])

    def test_get_variants_by_chromosome(self):
        """Test filtering variants by chromosome."""
        parser = VCFParser()
        parser.parse_string(self.valid_vcf)

        chr1_variants = parser.get_variants_by_chromosome('chr1')
        self.assertEqual(len(chr1_variants), 2)

    def test_variant_types(self):
        """Test variant type classification."""
        parser = VCFParser()
        parser.parse_string(self.valid_vcf)

        types = parser.get_variant_types()
        self.assertEqual(types.iloc[0], 'SNP')  # A->G
        self.assertEqual(types.iloc[2], 'DELETION')  # GG->G

    def test_multiallelic_variant_types(self):
        """Test that each ALT allele of a multi-allelic record gets a label."""
        multiallelic_vcf = (
            "##fileformat=VCFv4.2\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
            "chr1\t100\t.\tA\tT,GT\t30\tPASS\tDP=10\n"
        )
        parser = VCFParser()
        parser.parse_string(multiallelic_vcf)

        self.assertEqual(parser.get_variant_types().iloc[0], 'SNP,INSERTION')

    def test_get_info_field(self):
        """Test INFO field extraction."""
        parser = VCFParser()
        parser.parse_string(self.valid_vcf)

        dp_values = parser.get_info_field('DP')
        self.assertEqual(dp_values.iloc[0], '100')

    def test_get_info_flag_field(self):
        """Test that a bare INFO flag is reported as present."""
        flag_vcf = (
            "##fileformat=VCFv4.2\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
            "chr1\t100\t.\tA\tG\t30\tPASS\tDP=10;DB\n"
        )
        parser = VCFParser()
        parser.parse_string(flag_vcf)

        self.assertTrue(parser.get_info_field('DB').iloc[0])

    def test_parse_gzip_file(self):
        """Test that a gzip-compressed file parses like its plain-text twin."""
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            gz_path = os.path.join(tmp_dir, 'test.vcf.gz')
            with gzip.open(gz_path, 'wt', encoding='utf-8') as handle:
                handle.write(self.valid_vcf)
            parser = VCFParser(gz_path)

        self.assertEqual(parser.get_variant_count(), 3)
        self.assertEqual(set(parser.get_chromosomes()), {'chr1', 'chr2'})

    def test_missing_header_error(self):
        """Test that missing header raises error."""
        parser = VCFParser()
        with self.assertRaises(VCFHeaderError):
            parser.parse_string(self.invalid_vcf_no_header)

    def test_missing_columns_error(self):
        """Test that missing mandatory columns raises error."""
        parser = VCFParser()
        with self.assertRaises(VCFHeaderError):
            parser.parse_string(self.invalid_vcf_missing_columns)

    def test_short_variant_line_error(self):
        """Test that a variant line with too few fields raises error."""
        short_line_vcf = (
            "##fileformat=VCFv4.2\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
            "chr1\t100\trs123\tA\n"
        )
        parser = VCFParser()
        with self.assertRaises(VCFFormatError):
            parser.parse_string(short_line_vcf)

    def test_file_not_found_error(self):
        """Test that missing file raises error."""
        parser = VCFParser()
        with self.assertRaises(VCFFileNotFoundError):
            parser.parse('/nonexistent/path/file.vcf')

    def test_summary(self):
        """Test summary generation."""
        parser = VCFParser()
        parser.parse_string(self.valid_vcf)

        summary = parser.summary()
        self.assertEqual(summary['total_variants'], 3)
        self.assertEqual(summary['num_chromosomes'], 2)


# Execute the TestVCFParser suite inside the notebook (verbose, one line per
# test) and then print a compact tally of the outcome.
print("Running unit tests...\n")
suite = unittest.TestLoader().loadTestsFromTestCase(TestVCFParser)
runner = unittest.TextTestRunner(verbosity=2)
result = runner.run(suite)

print("\n" + "=" * 50)
print("Tests run:", result.testsRun)
print("Failures:", len(result.failures))
print("Errors:", len(result.errors))

## Create Sample VCF Files for Demonstration

In [None]:
# Demonstration VCF document kept as an in-memory string.
# Contents: 11 metadata (##) lines, a header with FORMAT plus three sample
# columns, and 15 records spread over chr1-chr5. The records deliberately
# include single-base substitutions, a deletion (AA->A), an insertion (G->GT),
# an equal-length multi-base change (GG->TT), a multi-allelic site (T,G),
# a missing ID ('.'), LowQual filter entries and the DB flag in INFO.
sample_vcf_content = """##fileformat=VCFv4.2
##fileDate=20240101
##source=VCFParserDemo
##reference=GRCh38
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership">
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\tSAMPLE2\tSAMPLE3
chr1\t10000\trs001\tA\tG\t50\tPASS\tDP=120;AF=0.25;DB\tGT:DP\t0/1:40\t0/0:35\t0/1:45
chr1\t20000\trs002\tC\tT\t45\tPASS\tDP=95;AF=0.15\tGT:DP\t0/0:30\t0/1:32\t0/0:33
chr1\t35000\trs003\tG\tA\t35\tLowQual\tDP=40;AF=0.05\tGT:DP\t0/0:12\t0/0:15\t0/1:13
chr1\t50000\trs004\tT\tC\t55\tPASS\tDP=150;AF=0.30\tGT:DP\t0/1:50\t0/1:48\t1/1:52
chr2\t15000\trs005\tAA\tA\t40\tPASS\tDP=85;AF=0.10\tGT:DP\t0/1:28\t0/0:27\t0/0:30
chr2\t30000\trs006\tG\tGT\t42\tPASS\tDP=90;AF=0.20\tGT:DP\t0/0:30\t0/1:28\t0/1:32
chr2\t45000\t.\tC\tA\t48\tPASS\tDP=110;AF=0.18\tGT:DP\t0/1:38\t0/1:35\t0/0:37
chr3\t10000\trs008\tT\tG\t52\tPASS\tDP=130;AF=0.22\tGT:DP\t0/0:42\t0/1:44\t0/1:44
chr3\t25000\trs009\tA\tC\t38\tLowQual\tDP=55;AF=0.08\tGT:DP\t0/0:18\t0/0:17\t0/1:20
chr3\t40000\trs010\tGG\tTT\t60\tPASS\tDP=145;AF=0.35;DB\tGT:DP\t0/1:48\t0/1:47\t1/1:50
chr4\t5000\trs011\tC\tG\t44\tPASS\tDP=100;AF=0.12\tGT:DP\t0/1:33\t0/0:32\t0/0:35
chr4\t20000\trs012\tT\tA\t46\tPASS\tDP=105;AF=0.16\tGT:DP\t0/0:35\t0/1:34\t0/0:36
chr4\t35000\trs013\tA\tT,G\t58\tPASS\tDP=140;AF=0.28,0.12\tGT:DP\t0/1:46\t0/2:45\t1/2:49
chr5\t12000\trs014\tG\tC\t36\tLowQual\tDP=48;AF=0.06\tGT:DP\t0/0:16\t0/0:15\t0/1:17
chr5\t28000\trs015\tC\tT\t50\tPASS\tDP=115;AF=0.24\tGT:DP\t0/1:38\t0/1:36\t0/0:41
"""

# Write the sample VCF to disk as plain text. The encoding is pinned to UTF-8
# because the parser always reads UTF-8 and the compressed twin below is
# written as UTF-8 too; relying on the platform default could produce a file
# the parser cannot decode.
sample_vcf_path = 'sample.vcf'
with open(sample_vcf_path, 'w', encoding='utf-8') as f:
    f.write(sample_vcf_content)

# Create compressed version with identical content
sample_vcf_gz_path = 'sample.vcf.gz'
with gzip.open(sample_vcf_gz_path, 'wt', encoding='utf-8') as f:
    f.write(sample_vcf_content)

print(f"Created sample VCF file: {sample_vcf_path}")
print(f"Created compressed VCF file: {sample_vcf_gz_path}")

## Parse the Sample VCF File

In [None]:
# Load the plain-text sample; constructing the parser with a path parses it
# immediately.
parser = VCFParser(sample_vcf_path)

# Report every entry of the parser's summary dictionary, one per line
print("VCF File Summary")
print("=" * 50)
summary = parser.summary()
for key in summary:
    value = summary[key]
    print(f"{key}: {value}")

In [None]:
# Parse the gzip-compressed twin of the sample and report how many records
# it yielded.
parser_gz = VCFParser(sample_vcf_gz_path)
print("\nCompressed file parsed successfully!")
print(f"Variants in compressed file: {parser_gz.get_variant_count()}")

In [None]:
# Display the variants DataFrame.
# The bare expression on the last line relies on the notebook rendering the
# value of a cell's final expression; run as a plain script it shows nothing.
print("\nVariant Data:")
parser.variants

In [None]:
def _first_allele_frequency(raw_af):
    """Return the first AF entry as a float, or NaN when absent or not numeric.

    ``get_info_field`` yields None for a missing field and True for a bare
    flag, so anything that is not a string is treated as missing. For
    multi-allelic records ("0.28,0.12") only the first value is used.
    """
    if not isinstance(raw_af, str):
        return np.nan
    try:
        return float(raw_af.split(',')[0])
    except ValueError:
        # e.g. '.' or an empty value
        return np.nan


# Add derived columns to the dataframe for visualization:
#   VARIANT_TYPE - label(s) from the parser's classifier
#   AF           - first allele frequency from INFO, as float
#   DP_INFO      - total depth from INFO, as number (NaN when missing)
parser.variants['VARIANT_TYPE'] = parser.get_variant_types()
parser.variants['AF'] = parser.get_info_field('AF').apply(_first_allele_frequency)
parser.variants['DP_INFO'] = pd.to_numeric(parser.get_info_field('DP'), errors='coerce')

parser.variants[['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'VARIANT_TYPE', 'AF', 'DP_INFO']]

## Visualizations

### Visualization 1: Variant Distribution by Chromosome

In [None]:
import re


def _natural_chrom_key(name):
    """Sort key ordering embedded numbers numerically (chr2 before chr10).

    ``re.split`` with a capturing group alternates text and digit runs, so
    odd positions are always the numeric parts.
    """
    parts = re.split(r'(\d+)', str(name))
    return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]


# Count variants per chromosome. A plain sort_index() is lexicographic and
# would place chr10 before chr2, so the bars are ordered with a natural key.
chrom_counts = parser.variants['CHROM'].value_counts()
chrom_counts = chrom_counts.reindex(sorted(chrom_counts.index, key=_natural_chrom_key))

fig, ax = plt.subplots(figsize=(10, 6))
colors = sns.color_palette("viridis", len(chrom_counts))
bars = ax.bar(chrom_counts.index, chrom_counts.values, color=colors, edgecolor='black')

# Add value labels on bars
for bar, count in zip(bars, chrom_counts.values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
            str(count), ha='center', va='bottom', fontweight='bold')

ax.set_xlabel('Chromosome', fontsize=12)
ax.set_ylabel('Number of Variants', fontsize=12)
ax.set_title('Variant Distribution by Chromosome', fontsize=14, fontweight='bold')
# Leave headroom above the tallest bar for its value label
ax.set_ylim(0, max(chrom_counts.values) + 1)

plt.tight_layout()
plt.savefig('viz1_variants_by_chromosome.png', dpi=150, bbox_inches='tight')
plt.show()

### Visualization 2: Variant Types Distribution (Pie Chart)

In [None]:
# Tally how many records carry each variant-type label
type_counts = parser.variants['VARIANT_TYPE'].value_counts()
n_types = len(type_counts)

fig, ax = plt.subplots(figsize=(8, 8))
colors = sns.color_palette("Set2", n_types)
# Pull every wedge slightly away from the centre
explode = [0.05 for _ in range(n_types)]

pie_options = dict(
    labels=type_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    explode=explode,
    shadow=True,
    startangle=90,
)
wedges, texts, autotexts = ax.pie(type_counts.values, **pie_options)

# Make the percentage labels bold
for autotext in autotexts:
    autotext.set_fontweight('bold')

ax.set_title('Distribution of Variant Types', fontsize=14, fontweight='bold')

# Legend entries repeat each label together with its absolute count
legend_labels = []
for label, count in zip(type_counts.index, type_counts.values):
    legend_labels.append(f"{label} (n={count})")
ax.legend(
    wedges,
    legend_labels,
    title="Variant Types",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)

plt.tight_layout()
plt.savefig('viz2_variant_types.png', dpi=150, bbox_inches='tight')
plt.show()

### Visualization 3: Quality Score Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of QUAL with mean and median markers.
# Records without a numeric QUAL (NaN) are excluded.
ax1 = axes[0]
qual_data = parser.variants['QUAL'].dropna()
qual_mean = qual_data.mean()
qual_median = qual_data.median()
ax1.hist(qual_data, bins=15, color='steelblue', edgecolor='black', alpha=0.7)
ax1.axvline(qual_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {qual_mean:.1f}')
ax1.axvline(qual_median, color='orange', linestyle='--', linewidth=2, label=f'Median: {qual_median:.1f}')
ax1.set_xlabel('Quality Score (QUAL)', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.set_title('Quality Score Distribution', fontsize=14, fontweight='bold')
ax1.legend()

# Right panel: box plot of QUAL per filter status.
# NaN values are dropped first because they would corrupt the box statistics.
ax2 = axes[1]
filter_labels = ['PASS', 'LowQual']
filter_groups = (
    parser.variants.dropna(subset=['QUAL'])
    .groupby('FILTER')['QUAL']
    .apply(list)
    .to_dict()
)
box_data = [filter_groups.get(label, []) for label in filter_labels]
bp = ax2.boxplot(box_data, patch_artist=True)
# Label the boxes via the axis instead of boxplot's keyword: the keyword is
# `labels` on older Matplotlib and `tick_labels` on newer releases, whereas
# set_xticklabels works on both.
ax2.set_xticklabels(filter_labels)

colors_box = ['lightgreen', 'lightcoral']
for patch, color in zip(bp['boxes'], colors_box):
    patch.set_facecolor(color)

ax2.set_xlabel('Filter Status', fontsize=12)
ax2.set_ylabel('Quality Score (QUAL)', fontsize=12)
ax2.set_title('Quality Score by Filter Status', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('viz3_quality_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

### Visualization 4: Allele Frequency vs Read Depth

In [None]:
fig, ax = plt.subplots(figsize=(10, 7))

# Only records that have both an allele frequency and a depth can be plotted
plot_data = parser.variants.dropna(subset=['AF', 'DP_INFO'])

# One scatter series per variant type, each with its own colour
variant_types = plot_data['VARIANT_TYPE'].unique()
colors = sns.color_palette("husl", len(variant_types))

for vtype, type_color in zip(variant_types, colors):
    subset = plot_data[plot_data['VARIANT_TYPE'] == vtype]
    marker_sizes = subset['QUAL'] * 3  # Size by quality
    ax.scatter(
        subset['AF'],
        subset['DP_INFO'],
        c=[type_color],
        s=marker_sizes,
        alpha=0.7,
        label=vtype,
        edgecolors='black',
        linewidth=0.5,
    )

ax.set_xlabel('Allele Frequency (AF)', fontsize=12)
ax.set_ylabel('Read Depth (DP)', fontsize=12)
ax.set_title(
    'Allele Frequency vs Read Depth\n(point size = quality score)',
    fontsize=14,
    fontweight='bold',
)
ax.legend(title='Variant Type')

plt.tight_layout()
plt.savefig('viz4_af_vs_depth.png', dpi=150, bbox_inches='tight')
plt.show()

### Visualization 5: Variant Positions Across Chromosomes

In [None]:
import re

from matplotlib.patches import Patch


def _natural_chrom_key(name):
    """Sort key ordering embedded numbers numerically (chr2 before chr10).

    ``re.split`` with a capturing group alternates text and digit runs, so
    odd positions are always the numeric parts.
    """
    parts = re.split(r'(\d+)', str(name))
    return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]


fig, ax = plt.subplots(figsize=(12, 6))

# Unique chromosomes in natural order; a plain sorted() is lexicographic and
# would place chr10 before chr2.
chromosomes = sorted(parser.variants['CHROM'].unique(), key=_natural_chrom_key)
colors = sns.color_palette("tab10", len(chromosomes))

# Chromosomes are laid out side by side on one x axis: each one is shifted
# by a running offset so its positions do not overlap the previous one.
chrom_offsets = {}
offset = 0
tick_positions = []
tick_labels = []

for i, chrom in enumerate(chromosomes):
    chrom_data = parser.variants[parser.variants['CHROM'] == chrom].copy()
    positions = chrom_data['POS'].values + offset

    # Plot variants as vertical lines
    for pos in positions:
        ax.axvline(pos, color=colors[i], alpha=0.7, linewidth=2)

    # Store tick position (middle of chromosome region)
    max_pos = chrom_data['POS'].max()
    tick_positions.append(offset + max_pos / 2)
    tick_labels.append(chrom)

    # Advance past this chromosome plus a 10 kb gap, and draw a dotted
    # separator in the middle of the gap (except after the last chromosome)
    offset += max_pos + 10000
    if i < len(chromosomes) - 1:
        ax.axvline(offset - 5000, color='gray', linestyle=':', alpha=0.5)

ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)
ax.set_xlabel('Chromosome', fontsize=12)
ax.set_ylabel('Variant Presence', fontsize=12)
ax.set_title('Variant Positions Across Chromosomes', fontsize=14, fontweight='bold')
ax.set_yticks([])

# Add legend (one colour patch per chromosome)
legend_elements = [Patch(facecolor=colors[i], label=chrom) for i, chrom in enumerate(chromosomes)]
ax.legend(handles=legend_elements, title='Chromosome', loc='upper right')

plt.tight_layout()
plt.savefig('viz5_genomic_positions.png', dpi=150, bbox_inches='tight')
plt.show()

## Summary Statistics

In [None]:
# Text report: file-level facts first, then per-category counts, then
# descriptive statistics for QUAL and the INFO depth column.
print("VCF Analysis Summary")
print("=" * 60)
print(f"\nFile: {parser.file_path}")
print(f"Total Variants: {parser.get_variant_count()}")
print(f"Chromosomes: {', '.join(parser.get_chromosomes())}")
samples_text = ', '.join(parser.samples) if parser.samples else 'None'
print(f"Samples: {samples_text}")

print("\nVariant Types:")
for vtype, count in parser.variants['VARIANT_TYPE'].value_counts().items():
    print(f"  - {vtype}: {count}")

print("\nFilter Status:")
for filt, count in parser.variants['FILTER'].value_counts().items():
    print(f"  - {filt}: {count}")

# (printed label, key in the describe() result)
stat_rows = (("Mean", "mean"), ("Std", "std"), ("Min", "min"), ("Max", "max"))

print("\nQuality Score Statistics:")
qual_stats = parser.variants['QUAL'].describe()
for stat_label, stat_key in stat_rows:
    print(f"  - {stat_label}: {qual_stats[stat_key]:.2f}")

print("\nRead Depth Statistics:")
dp_stats = parser.variants['DP_INFO'].describe()
for stat_label, stat_key in stat_rows:
    print(f"  - {stat_label}: {dp_stats[stat_key]:.2f}")

## Cleanup

In [None]:
# Optional cleanup of the sample inputs written by this notebook.
# Uncomment the following lines to remove them:

# import os
# for f in ['sample.vcf', 'sample.vcf.gz']:
#     if os.path.exists(f):
#         os.remove(f)
#         print(f"Removed {f}")

# Everything this notebook writes to the working directory
generated_files = (
    "sample.vcf (sample VCF file)",
    "sample.vcf.gz (compressed sample VCF file)",
    "viz1_variants_by_chromosome.png",
    "viz2_variant_types.png",
    "viz3_quality_distribution.png",
    "viz4_af_vs_depth.png",
    "viz5_genomic_positions.png",
)

print("\nGenerated files:")
for entry in generated_files:
    print(f"- {entry}")