# CRAM File Processing and Analysis

This notebook demonstrates:
1. Generation of simulated sequencing data
2. Creation of CRAM files
3. Decompression and parsing of CRAM files
4. Conversion to BAM and SAM formats
5. Visualization of alignment characteristics

In [None]:
# Install required packages (uncomment if needed)
!pip install pysam numpy matplotlib seaborn pandas

In [None]:
import pysam
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict, Counter
import os
import random

# Use one seed for both the stdlib and NumPy generators so reruns are reproducible
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Plot defaults: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## Step 1: Generate Simulated Reference Genome

In [None]:
def generate_reference_genome(length=10000, output_file='reference.fasta'):
    """
    Write a random single-contig reference genome to a FASTA file.

    The contig is named ``chr1``; its bases are drawn uniformly from
    A/C/G/T using the ``random`` module and wrapped at 80 characters
    per line.

    Args:
        length: Number of bases in the generated sequence
        output_file: Path of the FASTA file to create (overwritten if present)

    Returns:
        The value of ``output_file``
    """
    line_width = 80
    alphabet = ['A', 'C', 'G', 'T']
    sequence = ''.join(random.choices(alphabet, k=length))

    # Pre-split the sequence into fixed-width FASTA lines
    wrapped = [
        sequence[start:start + line_width]
        for start in range(0, len(sequence), line_width)
    ]

    with open(output_file, 'w') as fasta:
        fasta.write('>chr1\n')
        fasta.writelines(chunk + '\n' for chunk in wrapped)

    print(f"Generated reference genome: {output_file} ({length} bp)")
    return output_file

# Generate reference
ref_file = generate_reference_genome(length=10000)

# Index the reference (required for CRAM).
# pysam.faidx runs the samtools code bundled with pysam, so no separate
# samtools executable is needed, and a failure raises an exception instead
# of being silently ignored the way a failing shell command would be.
pysam.faidx(ref_file)
print("Indexed reference genome")

## Step 2: Generate Simulated Sequencing Reads and Create CRAM File

In [None]:
def _mutate_sequence(seq, error_rate):
    """
    Apply random single-base substitutions to a sequence.

    Args:
        seq: Sequence to mutate
        error_rate: Per-base probability of a substitution

    Returns:
        Tuple of (mutated sequence, number of substituted bases)
    """
    mutated = list(seq)
    substitutions = 0
    for idx, ref_base in enumerate(mutated):
        if random.random() < error_rate:
            # Pick among the canonical bases that differ from the original,
            # so every hit is a real mismatch; building the list by filtering
            # also copes with bases outside A/C/G/T (e.g. N or lowercase).
            alternatives = [b for b in 'ACGT' if b != ref_base.upper()]
            mutated[idx] = random.choice(alternatives)
            substitutions += 1
    return ''.join(mutated), substitutions


def generate_simulated_reads(ref_file, num_reads=1000, read_length=100, error_rate=0.01):
    """
    Generate simulated sequencing reads aligned to a reference genome.

    Reads are sampled uniformly along contig ``chr1``, receive random
    substitutions at ``error_rate``, and are written as ungapped alignments
    to a coordinate-sorted, indexed BAM file (``simulated_sorted.bam``).

    Args:
        ref_file: Reference FASTA file
        num_reads: Number of reads to generate
        read_length: Length of each read
        error_rate: Per-base substitution probability (default 1%)

    Returns:
        Path to the generated BAM file

    Raises:
        ValueError: If read_length is not between 1 and the reference length
    """
    # Read reference sequence; the handle is only needed for this fetch
    with pysam.FastaFile(ref_file) as ref_fasta:
        ref_seq = ref_fasta.fetch('chr1')
    ref_length = len(ref_seq)

    if read_length <= 0 or read_length > ref_length:
        raise ValueError(
            f"read_length must be between 1 and the reference length "
            f"({ref_length}), got {read_length}"
        )

    # Create BAM file header
    header = {
        'HD': {'VN': '1.6', 'SO': 'coordinate'},
        'SQ': [{'LN': ref_length, 'SN': 'chr1'}],
        'PG': [{'ID': 'simulate', 'PN': 'simulator', 'VN': '1.0'}]
    }

    temp_bam = 'temp_unsorted.bam'
    sorted_bam = 'simulated_sorted.bam'

    try:
        # Write reads in generation order to a temporary unsorted BAM file
        with pysam.AlignmentFile(temp_bam, 'wb', header=header) as bamfile:
            for i in range(num_reads):
                # randint is inclusive on both ends, so the largest start that
                # still fits a full read is ref_length - read_length
                pos = random.randint(0, ref_length - read_length)

                # Extract the reference slice and introduce SNPs
                seq, num_mismatches = _mutate_sequence(
                    ref_seq[pos:pos + read_length], error_rate
                )

                # Generate quality scores (Phred 30-40, encoded with +33 offset)
                qual = ''.join(chr(random.randint(30, 40) + 33) for _ in range(read_length))

                # Create alignment
                a = pysam.AlignedSegment()
                a.query_name = f'read_{i:06d}'
                a.query_sequence = seq
                a.flag = 0 if random.random() > 0.5 else 16  # Random forward/reverse
                a.reference_id = 0  # chr1
                a.reference_start = pos
                a.mapping_quality = random.randint(20, 60)
                a.cigartuples = [(0, read_length)]  # Match
                a.query_qualities = pysam.qualitystring_to_array(qual)

                # Add some tags
                # NM is the edit distance to the reference; with an ungapped
                # alignment that is exactly the number of substitutions made
                a.set_tag('NM', num_mismatches)
                a.set_tag('AS', random.randint(80, 100))  # Alignment score

                bamfile.write(a)

        # Sort and index the BAM file
        pysam.sort('-o', sorted_bam, temp_bam)
        pysam.index(sorted_bam)
    finally:
        # Clean up the temporary file even if writing or sorting failed
        if os.path.exists(temp_bam):
            os.remove(temp_bam)

    print(f"Generated {num_reads} simulated reads in {sorted_bam}")
    return sorted_bam

# Generate simulated reads
bam_file = generate_simulated_reads(ref_file, num_reads=1000, read_length=100)

In [None]:
# Convert BAM to CRAM
cram_file = 'simulated.cram'

# Run samtools view/index through pysam (CRAM encoding requires the reference).
# Using the bundled samtools avoids depending on an external executable and
# turns failures into exceptions. catch_stdout=False is needed because the
# result is written to the -o file rather than returned on stdout.
pysam.view('-C', '-T', ref_file, '-o', cram_file, bam_file, catch_stdout=False)
pysam.index(cram_file)

print(f"\nCreated CRAM file: {cram_file}")

# Compare file sizes
bam_size = os.path.getsize(bam_file)
cram_size = os.path.getsize(cram_file)
print("\nFile size comparison:")
print(f"BAM: {bam_size:,} bytes")
print(f"CRAM: {cram_size:,} bytes")
print(f"Compression ratio: {bam_size/cram_size:.2f}x")

## Step 3: Parse and Uncompress CRAM File

In [None]:
def parse_cram_file(cram_file, ref_file, num_reads=10):
    """
    Parse and display information from a CRAM file.

    Prints the header version, sort order and reference sequences, then the
    first ``num_reads`` alignments, and finally the total read count.

    Args:
        cram_file: Path to CRAM file
        ref_file: Path to reference FASTA file
        num_reads: Number of example reads to display

    Returns:
        Total number of reads reported by ``AlignmentFile.count()``
    """
    # The context manager closes the file even if printing a read fails
    with pysam.AlignmentFile(cram_file, 'rc', reference_filename=ref_file) as cramfile:
        hd_line = cramfile.header.get('HD', {})

        print("CRAM File Header:")
        print("=" * 60)
        print(f"Version: {hd_line.get('VN', 'N/A')}")
        print(f"Sort order: {hd_line.get('SO', 'N/A')}")
        print("\nReference sequences:")
        for sq in cramfile.header.get('SQ', []):
            print(f"  {sq['SN']}: {sq['LN']:,} bp")

        print(f"\nFirst {num_reads} reads:")
        print("=" * 60)

        for i, read in enumerate(cramfile.fetch()):
            if i >= num_reads:
                break

            strand = '-' if read.is_reverse else '+'
            # query_sequence is None when a record stores no sequence
            sequence = read.query_sequence or ''
            # Show at most the first 50 bases of long reads
            shown = f"{sequence[:50]}..." if len(sequence) > 50 else sequence
            tags_str = ', '.join(f'{tag}={val}' for tag, val in read.get_tags())

            print(f"\nRead {i+1}: {read.query_name}")
            print(f"  Position: {read.reference_name}:{read.reference_start}-{read.reference_end} ({strand})")
            print(f"  Sequence: {shown}")
            print(f"  MAPQ: {read.mapping_quality}")
            print(f"  CIGAR: {read.cigarstring}")
            print(f"  Flags: {read.flag} (Tags: {tags_str})")

        # Count total reads
        total_reads = cramfile.count()
        print(f"\n\nTotal reads in CRAM file: {total_reads:,}")

    return total_reads

# Parse CRAM file
total_reads = parse_cram_file(cram_file, ref_file, num_reads=5)

## Step 4: Convert CRAM to BAM and SAM Formats

In [None]:
# Convert CRAM to BAM
# samtools is run through pysam so no external executable is required and
# failures raise; catch_stdout=False because output is written to the -o file.
output_bam = 'output_from_cram.bam'
pysam.view('-b', '-T', ref_file, '-o', output_bam, cram_file, catch_stdout=False)
pysam.index(output_bam)
print(f"Created BAM file from CRAM: {output_bam}")

# Convert CRAM to SAM (-h keeps the header in the text output)
output_sam = 'output_from_cram.sam'
pysam.view('-h', '-T', ref_file, '-o', output_sam, cram_file, catch_stdout=False)
print(f"Created SAM file from CRAM: {output_sam}")

# Verify files
print("\nOutput file sizes:")
print(f"BAM: {os.path.getsize(output_bam):,} bytes")
print(f"SAM: {os.path.getsize(output_sam):,} bytes")

## Step 5: Analyze SAM File and Create Visualizations

In [None]:
def analyze_sam_file(sam_file):
    """
    Analyze SAM file and extract statistics for visualization.

    Args:
        sam_file: Path to SAM file

    Returns:
        Dictionary with per-read lists ('mapping_qualities', 'read_lengths',
        'positions', 'edit_distances', 'alignment_scores', 'gc_content'),
        a flat list of all 'base_qualities', a 'strands' dict with
        'forward'/'reverse' counts, and a 'flags' Counter keyed by SAM flag.
    """
    stats = {
        'mapping_qualities': [],
        'read_lengths': [],
        'positions': [],
        'strands': {'forward': 0, 'reverse': 0},
        'flags': Counter(),
        'edit_distances': [],
        'alignment_scores': [],
        'base_qualities': [],
        'gc_content': []
    }

    # The context manager closes the file even if a record fails to parse
    with pysam.AlignmentFile(sam_file, 'r') as samfile:
        for read in samfile.fetch():
            # Mapping quality, read length and start position
            stats['mapping_qualities'].append(read.mapping_quality)
            stats['read_lengths'].append(read.query_length)
            stats['positions'].append(read.reference_start)

            # Strand
            if read.is_reverse:
                stats['strands']['reverse'] += 1
            else:
                stats['strands']['forward'] += 1

            # Flags
            stats['flags'][read.flag] += 1

            # Edit distance (recorded as 0 when the NM tag is absent)
            nm_tag = read.get_tag('NM') if read.has_tag('NM') else 0
            stats['edit_distances'].append(nm_tag)

            # Alignment score (recorded as 0 when the AS tag is absent)
            as_tag = read.get_tag('AS') if read.has_tag('AS') else 0
            stats['alignment_scores'].append(as_tag)

            # Base qualities; read the property once per record
            qualities = read.query_qualities
            if qualities is not None:
                stats['base_qualities'].extend(qualities)

            # GC content as a percentage of the read length
            sequence = read.query_sequence
            if sequence:
                gc = (sequence.count('G') + sequence.count('C')) / len(sequence) * 100
                stats['gc_content'].append(gc)

    return stats

# Analyze SAM file
print("Analyzing SAM file...")
stats = analyze_sam_file(output_sam)
print("Analysis complete!")

### Visualization 1: Mapping Quality Distribution

In [None]:
# Compute the summary statistics once; they feed both the plot and the printout
mapq_values = stats['mapping_qualities']
mapq_mean = np.mean(mapq_values)
mapq_median = np.median(mapq_values)

plt.figure(figsize=(12, 6))
plt.hist(mapq_values, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
plt.xlabel('Mapping Quality (MAPQ)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Mapping Quality Scores', fontsize=14, fontweight='bold')

# Dashed reference lines: mean in red, then median in green
for marker_value, marker_color, marker_name in (
    (mapq_mean, 'red', 'Mean'),
    (mapq_median, 'green', 'Median'),
):
    plt.axvline(marker_value, color=marker_color, linestyle='--',
                label=f'{marker_name}: {marker_value:.2f}')

plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Mapping Quality Statistics:")
print(f"  Mean: {mapq_mean:.2f}")
print(f"  Median: {mapq_median:.2f}")
print(f"  Std Dev: {np.std(mapq_values):.2f}")

### Visualization 2: Read Coverage Along Reference

In [None]:
# Calculate coverage in bins (each read is counted in the bin of its start position)
num_bins = 50
# Take the reference length from the FASTA index instead of hard-coding it,
# so the plot stays correct if the reference is regenerated with another length
with pysam.FastaFile(ref_file) as ref_fasta:
    ref_length = ref_fasta.get_reference_length('chr1')
# Guard against a zero bin size when the reference is shorter than num_bins
bin_size = max(1, ref_length // num_bins)
coverage = np.zeros(num_bins)

for pos in stats['positions']:
    # Positions past the last full bin are folded into the final bin
    bin_idx = min(pos // bin_size, num_bins - 1)
    coverage[bin_idx] += 1

plt.figure(figsize=(14, 6))
x_positions = np.arange(num_bins) * bin_size
# align='edge' anchors each bar at the start of its bin; the default
# (center) would draw every bar half a bin to the left of its range
plt.bar(x_positions, coverage, width=bin_size*0.9, align='edge',
        edgecolor='black', alpha=0.7, color='coral')
plt.xlabel('Position on Reference (bp)', fontsize=12)
plt.ylabel('Number of Reads', fontsize=12)
plt.title('Read Coverage Distribution Along Reference Genome', fontsize=14, fontweight='bold')
plt.axhline(np.mean(coverage), color='red', linestyle='--', 
            label=f'Mean Coverage: {np.mean(coverage):.2f}')
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

print("Coverage Statistics:")
print(f"  Mean: {np.mean(coverage):.2f} reads/bin")
print(f"  Max: {np.max(coverage):.0f} reads/bin")
print(f"  Min: {np.min(coverage):.0f} reads/bin")

### Visualization 3: Strand Bias and Base Quality Distribution

In [None]:
fig, (strand_ax, quality_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: forward/reverse split as a pie chart
strand_counts = stats['strands']
strand_labels = list(strand_counts)
strand_values = list(strand_counts.values())
strand_total = sum(strand_values)

strand_ax.pie(
    strand_values,
    labels=strand_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=['lightblue', 'lightcoral'],
    explode=(0.05, 0.05),
    shadow=True,
)
strand_ax.set_title('Strand Distribution', fontsize=14, fontweight='bold')

# Right panel: histogram of every base quality, with the mean marked
base_quals = stats['base_qualities']
base_qual_mean = np.mean(base_quals)

quality_ax.hist(base_quals, bins=40, edgecolor='black', alpha=0.7, color='lightgreen')
quality_ax.set_xlabel('Base Quality (Phred Score)', fontsize=12)
quality_ax.set_ylabel('Frequency', fontsize=12)
quality_ax.set_title('Base Quality Score Distribution', fontsize=14, fontweight='bold')
quality_ax.axvline(base_qual_mean, color='red', linestyle='--',
                   label=f'Mean: {base_qual_mean:.2f}')
quality_ax.legend()
quality_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nStrand Bias:")
for strand_title, strand_key in (('Forward', 'forward'), ('Reverse', 'reverse')):
    strand_n = strand_counts[strand_key]
    print(f"  {strand_title}: {strand_n} ({strand_n/strand_total*100:.1f}%)")
print("\nBase Quality Statistics:")
print(f"  Mean: {base_qual_mean:.2f}")
print(f"  Median: {np.median(base_quals):.2f}")

### Visualization 4: Edit Distance and Alignment Score Analysis

In [None]:
fig, (edit_ax, score_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: one bar per distinct NM value, in ascending order
edit_distances = stats['edit_distances']
ed_counts = Counter(edit_distances)
ed_levels = sorted(ed_counts)
ed_heights = [ed_counts[level] for level in ed_levels]

edit_ax.bar(ed_levels, ed_heights, edgecolor='black', alpha=0.7, color='mediumpurple')
edit_ax.set_xlabel('Edit Distance (NM tag)', fontsize=12)
edit_ax.set_ylabel('Frequency', fontsize=12)
edit_ax.set_title('Edit Distance Distribution', fontsize=14, fontweight='bold')
edit_ax.grid(True, alpha=0.3, axis='y')

# Right panel: histogram of AS values with the mean marked
alignment_scores = stats['alignment_scores']
as_mean = np.mean(alignment_scores)

score_ax.hist(alignment_scores, bins=30, edgecolor='black', alpha=0.7, color='gold')
score_ax.set_xlabel('Alignment Score (AS tag)', fontsize=12)
score_ax.set_ylabel('Frequency', fontsize=12)
score_ax.set_title('Alignment Score Distribution', fontsize=14, fontweight='bold')
score_ax.axvline(as_mean, color='red', linestyle='--', label=f'Mean: {as_mean:.2f}')
score_ax.legend()
score_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nEdit Distance Statistics:")
print(f"  Mean: {np.mean(edit_distances):.2f}")
print(f"  Median: {np.median(edit_distances):.2f}")
print("\nAlignment Score Statistics:")
print(f"  Mean: {as_mean:.2f}")
print(f"  Median: {np.median(alignment_scores):.2f}")

### Visualization 5: GC Content and Read Length Analysis

In [None]:
fig, (gc_ax, length_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of per-read GC percentage with the mean marked
gc_values = stats['gc_content']
gc_mean = np.mean(gc_values)

gc_ax.hist(gc_values, bins=30, edgecolor='black', alpha=0.7, color='teal')
gc_ax.set_xlabel('GC Content (%)', fontsize=12)
gc_ax.set_ylabel('Frequency', fontsize=12)
gc_ax.set_title('GC Content Distribution', fontsize=14, fontweight='bold')
gc_ax.axvline(gc_mean, color='red', linestyle='--', label=f'Mean: {gc_mean:.2f}%')
gc_ax.legend()
gc_ax.grid(True, alpha=0.3)

# Right panel: one bar per distinct read length, in ascending order
read_lengths = stats['read_lengths']
rl_counts = Counter(read_lengths)
rl_levels = sorted(rl_counts)
rl_heights = [rl_counts[level] for level in rl_levels]

length_ax.bar(rl_levels, rl_heights, edgecolor='black', alpha=0.7, color='salmon')
length_ax.set_xlabel('Read Length (bp)', fontsize=12)
length_ax.set_ylabel('Frequency', fontsize=12)
length_ax.set_title('Read Length Distribution', fontsize=14, fontweight='bold')
length_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\nGC Content Statistics:")
print(f"  Mean: {gc_mean:.2f}%")
print(f"  Median: {np.median(gc_values):.2f}%")
print(f"  Std Dev: {np.std(gc_values):.2f}%")
print("\nRead Length Statistics:")
print(f"  Mean: {np.mean(read_lengths):.2f} bp")
print(f"  Median: {np.median(read_lengths):.2f} bp")

## Summary Statistics Table

In [None]:
# Build the summary as (metric, value) pairs so each label sits next to its value
read_count = len(stats['mapping_qualities'])
summary_rows = [
    ('Total Reads', read_count),
    ('Mean Mapping Quality', f"{np.mean(stats['mapping_qualities']):.2f}"),
    ('Mean Read Length', f"{np.mean(stats['read_lengths']):.2f} bp"),
    ('Mean Base Quality', f"{np.mean(stats['base_qualities']):.2f}"),
    ('Mean Edit Distance', f"{np.mean(stats['edit_distances']):.2f}"),
    ('Mean Alignment Score', f"{np.mean(stats['alignment_scores']):.2f}"),
    ('Mean GC Content', f"{np.mean(stats['gc_content']):.2f}%"),
    ('Forward Strand %', f"{stats['strands']['forward']/read_count*100:.1f}%"),
    ('Reverse Strand %', f"{stats['strands']['reverse']/read_count*100:.1f}%"),
]

# Create summary DataFrame from the column-wise view of those pairs
summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}
summary_df = pd.DataFrame(summary_data)

rule = "=" * 60
print("\n" + rule)
print("SUMMARY STATISTICS")
print(rule)
print(summary_df.to_string(index=False))
print(rule)

## File Cleanup (Optional)

In [None]:
# Uncomment to remove generated files
# import os
# files_to_remove = [
#     'reference.fasta', 'reference.fasta.fai',
#     'simulated_sorted.bam', 'simulated_sorted.bam.bai',
#     'simulated.cram', 'simulated.cram.crai',
#     'output_from_cram.bam', 'output_from_cram.bam.bai',
#     'output_from_cram.sam'
# ]
# for f in files_to_remove:
#     if os.path.exists(f):
#         os.remove(f)
#         print(f"Removed: {f}")