# PostgreSQL SQL Warmups: Clinical Sequencing QC Data

## Installation Instructions for PostgreSQL on Mac

### Step 1: Install PostgreSQL using Homebrew

```bash
# Install Homebrew if not already installed
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

# Install PostgreSQL
brew install postgresql@15

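# Add PostgreSQL to your PATH (path below is for Apple Silicon Macs;
# on Intel Macs Homebrew lives under /usr/local, so use /usr/local/opt/postgresql@15/bin)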
echo 'export PATH="/opt/homebrew/opt/postgresql@15/bin:$PATH"' >> ~/.zshrc
source ~/.zshrc
```

### Step 2: Start PostgreSQL Service

```bash
# Start PostgreSQL service
brew services start postgresql@15

# Verify it's running
brew services list | grep postgresql
```

### Step 3: Create a Practice Database

```bash
# Create a database called 'sequencing_qc'
createdb sequencing_qc

# Test connection
psql sequencing_qc -c "SELECT version();"
```

### Step 4: Install Python Dependencies

```bash
pip install psycopg2-binary pandas matplotlib seaborn jupyter
```

---

## Setup: Import Libraries and Connect to Database

In [None]:
import getpass
import os
import warnings
from contextlib import closing
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from psycopg2 import sql
# Silence only the known-noisy pandas warning about raw DBAPI2 (psycopg2)
# connections, instead of hiding every warning raised in the notebook.
# `message` is a regex matched against the start of the warning text.
warnings.filterwarnings(
    'ignore',
    message='pandas only supports SQLAlchemy',
    category=UserWarning,
)

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Database connection parameters
DB_NAME = "sequencing_qc"

# Homebrew's PostgreSQL creates a role named after the macOS login user, so
# default to that instead of a placeholder that can never connect.
# Override by setting the PGUSER environment variable or editing DB_USER.
try:
    _login_name = getpass.getuser()
except (OSError, KeyError, ImportError):
    # No resolvable login name (e.g. some containers): fall back to 'postgres'
    _login_name = "postgres"
DB_USER = os.environ.get("PGUSER") or _login_name

DB_HOST = "localhost"
DB_PORT = "5432"

# Create connection
def get_connection():
    """Open and return a new psycopg2 connection built from the DB_* settings.

    Each call creates a fresh connection; the caller is responsible for
    closing it.
    """
    conn_params = {
        "dbname": DB_NAME,
        "user": DB_USER,
        "host": DB_HOST,
        "port": DB_PORT,
    }
    return psycopg2.connect(**conn_params)

# Smoke-test the connection settings: open a connection, report, close it.
try:
    conn = get_connection()
    print("✓ Successfully connected to PostgreSQL!")
    conn.close()
except Exception as exc:
    print(f"✗ Connection failed: {exc}")
    # Two hint lines, emitted exactly as two consecutive prints would be
    print(
        "\nTip: You may need to update DB_USER to your Mac username.",
        "Run this in terminal to find it: whoami",
        sep="\n",
    )

## Database Schema Creation

In [None]:
# Create tables for clinical sequencing QC data
#
# The script below is sent to PostgreSQL as one multi-statement string.
# It first drops any existing copies of the five tables (children before
# parents, with CASCADE), then recreates them. Foreign keys form the chain
#   patients -> samples -> sequencing_runs -> coverage_metrics / variant_calls
# so rows must later be inserted in that same parent-to-child order.
schema_sql = """
DROP TABLE IF EXISTS variant_calls CASCADE;
DROP TABLE IF EXISTS coverage_metrics CASCADE;
DROP TABLE IF EXISTS sequencing_runs CASCADE;
DROP TABLE IF EXISTS samples CASCADE;
DROP TABLE IF EXISTS patients CASCADE;

-- Patients table
CREATE TABLE patients (
    patient_id VARCHAR(20) PRIMARY KEY,
    age INTEGER,
    gender VARCHAR(10),
    clinical_indication TEXT
);

-- Samples table
CREATE TABLE samples (
    sample_id VARCHAR(20) PRIMARY KEY,
    patient_id VARCHAR(20) REFERENCES patients(patient_id),
    sample_type VARCHAR(50),
    collection_date DATE,
    tumor_purity DECIMAL(5,2)
);

-- Sequencing runs table
CREATE TABLE sequencing_runs (
    run_id VARCHAR(30) PRIMARY KEY,
    sample_id VARCHAR(20) REFERENCES samples(sample_id),
    sequencer VARCHAR(50),
    run_date DATE,
    flowcell_id VARCHAR(30),
    total_reads BIGINT,
    mapped_reads BIGINT,
    duplicate_reads BIGINT,
    properly_paired_reads BIGINT,
    mean_insert_size DECIMAL(8,2),
    q30_bases_pct DECIMAL(5,2),
    contamination_pct DECIMAL(5,3),
    pass_qc BOOLEAN
);

-- Coverage metrics table
CREATE TABLE coverage_metrics (
    metric_id SERIAL PRIMARY KEY,
    run_id VARCHAR(30) REFERENCES sequencing_runs(run_id),
    chromosome VARCHAR(10),
    mean_coverage DECIMAL(8,2),
    median_coverage DECIMAL(8,2),
    pct_bases_10x DECIMAL(5,2),
    pct_bases_20x DECIMAL(5,2),
    pct_bases_30x DECIMAL(5,2),
    uniformity DECIMAL(5,2)
);

-- Variant calls table
CREATE TABLE variant_calls (
    variant_id SERIAL PRIMARY KEY,
    run_id VARCHAR(30) REFERENCES sequencing_runs(run_id),
    chromosome VARCHAR(10),
    position INTEGER,
    ref_allele VARCHAR(255),
    alt_allele VARCHAR(255),
    variant_type VARCHAR(20),
    quality_score DECIMAL(8,2),
    depth INTEGER,
    allele_frequency DECIMAL(5,4),
    gene_name VARCHAR(50),
    clinical_significance VARCHAR(50)
);
"""

# Run the schema script in a single transaction: commit on success,
# roll back on any error, and always release the cursor and connection.
conn = get_connection()
try:
    # The cursor context manager closes the cursor on exit
    with conn.cursor() as cur:
        try:
            cur.execute(schema_sql)
            conn.commit()
            print("✓ Database schema created successfully!")
        except Exception as exc:
            print(f"✗ Error creating schema: {exc}")
            conn.rollback()
finally:
    conn.close()

## Populate Database with Synthetic Clinical Sequencing Data

In [None]:
# Generate realistic sequencing QC data for 10 clinical samples.
#
# All rows are built in memory first (pure computation, no database access),
# then written inside one transaction: either every table is populated and
# committed, or the transaction is rolled back and the error re-raised.
np.random.seed(42)  # fixed seed so the generated values are reproducible

BASES = ['A', 'C', 'G', 'T']

# ---------------------------------------------------------------- patients
# Clinical indications pool
indications = [
    'Breast cancer screening',
    'Colorectal cancer',
    'Hereditary cancer syndrome',
    'Lung adenocarcinoma',
    'Prostate cancer',
    'Ovarian cancer',
    'Melanoma',
    'Lynch syndrome screening'
]

# NumPy results are converted to plain int/float/str/bool before being stored,
# so the database driver only ever receives built-in Python types.
patients_data = []
for i in range(10):
    patient_id = f"PT{1000+i}"
    age = int(np.random.randint(35, 75))
    gender = str(np.random.choice(['Male', 'Female']))
    indication = str(np.random.choice(indications))
    patients_data.append((patient_id, age, gender, indication))

# ----------------------------------------------------------------- samples
samples_data = []
sample_types = ['Blood', 'Tumor tissue', 'Saliva', 'Buccal swab']
base_date = datetime(2024, 1, 1)

for i, (patient_id, *_) in enumerate(patients_data):
    sample_id = f"SAM{2000+i}"
    sample_type = str(np.random.choice(sample_types))
    collection_date = base_date + timedelta(days=int(np.random.randint(0, 365)))
    # Tumor purity only applies to tumor tissue; stored as NULL otherwise
    tumor_purity = float(np.random.uniform(30, 95)) if 'tumor' in sample_type.lower() else None
    samples_data.append((sample_id, patient_id, sample_type, collection_date, tumor_purity))

# --------------------------------------------------------- sequencing runs
sequencers = ['NovaSeq 6000', 'NextSeq 550', 'MiSeq']
runs_data = []

for i, (sample_id, _, _, collection_date, _) in enumerate(samples_data):
    sequencer = str(np.random.choice(sequencers))
    # A sample can only be sequenced after it was collected: 1-30 day turnaround
    run_date = collection_date + timedelta(days=int(np.random.randint(1, 31)))
    # Build the ID from the run date (not today's date) so it is reproducible
    run_id = f"RUN_{run_date.strftime('%Y%m%d')}_{i+1:03d}"
    flowcell_id = f"FC{np.random.randint(100000, 999999)}"

    # Generate realistic read counts based on sequencer
    if 'NovaSeq' in sequencer:
        total_reads = int(np.random.randint(250_000_000, 350_000_000))
    elif 'NextSeq' in sequencer:
        total_reads = int(np.random.randint(80_000_000, 120_000_000))
    else:  # MiSeq
        total_reads = int(np.random.randint(15_000_000, 25_000_000))

    mapping_rate = np.random.uniform(0.92, 0.99)
    mapped_reads = int(total_reads * mapping_rate)
    duplicate_rate = np.random.uniform(0.05, 0.25)
    duplicate_reads = int(mapped_reads * duplicate_rate)
    properly_paired = int(mapped_reads * np.random.uniform(0.95, 0.99))

    mean_insert_size = float(np.random.uniform(300, 450))
    q30_bases_pct = float(np.random.uniform(85, 95))
    contamination_pct = float(np.random.uniform(0.1, 2.5))
    pass_qc = bool(contamination_pct < 2.0 and q30_bases_pct > 80 and mapping_rate > 0.90)

    runs_data.append((
        run_id, sample_id, sequencer, run_date, flowcell_id,
        total_reads, mapped_reads, duplicate_reads, properly_paired,
        mean_insert_size, q30_bases_pct, contamination_pct, pass_qc
    ))

# -------------------------------------------------------- coverage metrics
chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']
coverage_data = []

for run_id, *_ in runs_data:
    base_coverage = np.random.uniform(80, 150)
    for chrom in chromosomes:
        # Add variation per chromosome
        mean_cov = float(base_coverage * np.random.uniform(0.8, 1.2))
        median_cov = float(mean_cov * np.random.uniform(0.95, 1.05))
        # The fraction of bases at >=30x can never exceed the fraction at
        # >=20x, which can never exceed the fraction at >=10x; clamp to
        # keep the three values monotonic.
        pct_10x = float(np.random.uniform(95, 99.9))
        pct_20x = min(float(np.random.uniform(92, 99.5)), pct_10x)
        pct_30x = min(float(np.random.uniform(88, 98)), pct_20x)
        uniformity = float(np.random.uniform(75, 95))

        coverage_data.append((
            run_id, chrom, mean_cov, median_cov,
            pct_10x, pct_20x, pct_30x, uniformity
        ))

# ----------------------------------------------------------- variant calls
variant_types = ['SNV', 'INDEL', 'MNV']
genes = ['BRCA1', 'BRCA2', 'TP53', 'KRAS', 'EGFR', 'PIK3CA', 'PTEN', 'APC', 'MLH1', 'MSH2']
clinical_sig = ['Pathogenic', 'Likely pathogenic', 'VUS', 'Likely benign', 'Benign']
variants_data = []

for run_id, *_ in runs_data:
    # Generate 50-199 variants per run (randint's upper bound is exclusive)
    num_variants = np.random.randint(50, 200)

    for _ in range(num_variants):
        chrom = str(np.random.choice(chromosomes[:22]))  # Autosomes only for simplicity
        position = int(np.random.randint(1000000, 200000000))

        var_type = str(np.random.choice(variant_types, p=[0.85, 0.13, 0.02]))

        if var_type == 'SNV':
            ref = str(np.random.choice(BASES))
            alt = str(np.random.choice([b for b in BASES if b != ref]))
        elif var_type == 'INDEL':
            if np.random.random() < 0.5:  # Deletion
                # ref needs at least 2 bases, otherwise alt (= first base)
                # would be identical to ref and nothing would be deleted
                ref = ''.join(np.random.choice(BASES, size=np.random.randint(2, 11)))
                alt = ref[0]
            else:  # Insertion
                ref = str(np.random.choice(BASES))
                alt = ref + ''.join(np.random.choice(BASES, size=np.random.randint(1, 10)))
        else:  # MNV
            ref = ''.join(np.random.choice(BASES, size=2))
            # Every position must differ from the reference, so alt != ref
            alt = ''.join(
                str(np.random.choice([b for b in BASES if b != ref_base]))
                for ref_base in ref
            )

        quality = float(np.random.uniform(30, 500))
        depth = int(np.random.uniform(50, 300))
        allele_freq = float(np.random.uniform(0.05, 0.95))
        gene = str(np.random.choice(genes))
        significance = str(np.random.choice(clinical_sig, p=[0.05, 0.10, 0.50, 0.20, 0.15]))

        variants_data.append((
            run_id, chrom, position, ref, alt, var_type,
            quality, depth, allele_freq, gene, significance
        ))

# ------------------------------------------------------------------ insert
# Parent tables are written before their children to satisfy foreign keys.
conn = get_connection()
cur = conn.cursor()
try:
    cur.executemany(
        "INSERT INTO patients VALUES (%s, %s, %s, %s)",
        patients_data
    )

    cur.executemany(
        "INSERT INTO samples VALUES (%s, %s, %s, %s, %s)",
        samples_data
    )

    cur.executemany(
        """INSERT INTO sequencing_runs VALUES 
           (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
        runs_data
    )

    cur.executemany(
        """INSERT INTO coverage_metrics 
           (run_id, chromosome, mean_coverage, median_coverage, 
            pct_bases_10x, pct_bases_20x, pct_bases_30x, uniformity)
           VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""",
        coverage_data
    )

    # Insert in batches for efficiency
    batch_size = 1000
    for i in range(0, len(variants_data), batch_size):
        batch = variants_data[i:i+batch_size]
        cur.executemany(
            """INSERT INTO variant_calls 
               (run_id, chromosome, position, ref_allele, alt_allele, variant_type,
                quality_score, depth, allele_frequency, gene_name, clinical_significance)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            batch
        )

    conn.commit()
except Exception:
    # Undo the partial load so the cell can simply be re-run after a fix
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()

print("✓ Database populated successfully!")
print(f"  - {len(patients_data)} patients")
print(f"  - {len(samples_data)} samples")
print(f"  - {len(runs_data)} sequencing runs")
print(f"  - {len(coverage_data)} coverage metrics")
print(f"  - {len(variants_data)} variant calls")

---
# SQL Warmup Exercises

## Basic SELECT Queries

In [None]:
# Warmup 1: Select all patients
query = "SELECT * FROM patients;"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
print(f"Query 1: Retrieved {len(df)} patients\n")
df

In [None]:
# Warmup 2: Select specific columns from samples
query = "SELECT sample_id, patient_id, sample_type, collection_date FROM samples;"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 3: Count total number of sequencing runs
query = "SELECT COUNT(*) as total_runs FROM sequencing_runs;"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

## WHERE Clause Filtering

In [None]:
# Warmup 4: Find all female patients
query = "SELECT * FROM patients WHERE gender = 'Female';"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
print(f"Found {len(df)} female patients\n")
df

In [None]:
# Warmup 5: Find runs that passed QC
query = "SELECT run_id, sample_id, sequencer, pass_qc FROM sequencing_runs WHERE pass_qc = TRUE;"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
print(f"Runs passing QC: {len(df)}\n")
df

In [None]:
# Warmup 6: Find patients older than 60
query = "SELECT patient_id, age, gender, clinical_indication FROM patients WHERE age > 60;"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 7: Find runs with high Q30 percentage (>90%), best first
query = """
SELECT run_id, sample_id, q30_bases_pct, sequencer 
FROM sequencing_runs 
WHERE q30_bases_pct > 90
ORDER BY q30_bases_pct DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

## Aggregation Functions

In [None]:
# Warmup 8: Calculate average, minimum and maximum age of patients
query = "SELECT AVG(age) as average_age, MIN(age) as min_age, MAX(age) as max_age FROM patients;"

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 9: Get total and average reads per sequencer
# (plus the run count and average Q30 for each platform)
query = """
SELECT 
    sequencer,
    COUNT(*) as num_runs,
    SUM(total_reads) as total_reads,
    AVG(total_reads) as avg_reads,
    AVG(q30_bases_pct) as avg_q30
FROM sequencing_runs
GROUP BY sequencer
ORDER BY avg_reads DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 10: Count samples by type, most common first
query = """
SELECT sample_type, COUNT(*) as count 
FROM samples 
GROUP BY sample_type
ORDER BY count DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

## JOIN Operations

In [None]:
# Warmup 11: Join patients and samples (inner join on patient_id)
query = """
SELECT 
    p.patient_id,
    p.age,
    p.gender,
    s.sample_id,
    s.sample_type
FROM patients p
JOIN samples s ON p.patient_id = s.patient_id;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 12: Join samples and sequencing runs with QC info
query = """
SELECT 
    s.sample_id,
    s.sample_type,
    sr.run_id,
    sr.sequencer,
    sr.pass_qc,
    sr.contamination_pct
FROM samples s
JOIN sequencing_runs sr ON s.sample_id = sr.sample_id;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 13: Three-table join - patients, samples, runs
query = """
SELECT 
    p.patient_id,
    p.clinical_indication,
    s.sample_id,
    sr.run_id,
    sr.total_reads,
    sr.pass_qc
FROM patients p
JOIN samples s ON p.patient_id = s.patient_id
JOIN sequencing_runs sr ON s.sample_id = sr.sample_id
ORDER BY p.patient_id;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

## Advanced Filtering and Calculations

In [None]:
# Warmup 14: Calculate mapping percentage
# The ::NUMERIC cast avoids integer division between the two BIGINT columns.
query = """
SELECT 
    run_id,
    total_reads,
    mapped_reads,
    ROUND((mapped_reads::NUMERIC / total_reads * 100), 2) as mapping_pct
FROM sequencing_runs
ORDER BY mapping_pct DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 15: Calculate duplicate percentage (duplicates as a share of mapped reads)
# The ::NUMERIC cast avoids integer division between the two BIGINT columns.
query = """
SELECT 
    run_id,
    sequencer,
    duplicate_reads,
    mapped_reads,
    ROUND((duplicate_reads::NUMERIC / mapped_reads * 100), 2) as duplicate_pct
FROM sequencing_runs
ORDER BY duplicate_pct DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 16: Find pathogenic / likely pathogenic variants, summarised per gene
query = """
SELECT 
    gene_name,
    COUNT(*) as variant_count,
    AVG(quality_score) as avg_quality
FROM variant_calls
WHERE clinical_significance IN ('Pathogenic', 'Likely pathogenic')
GROUP BY gene_name
ORDER BY variant_count DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 17: Coverage statistics by chromosome
# The CASE in ORDER BY sorts numeric chromosomes numerically (1, 2, ... 22)
# and pushes the non-numeric ones (X, Y) to the end via the 999 sentinel.
query = """
SELECT 
    chromosome,
    AVG(mean_coverage) as avg_coverage,
    AVG(uniformity) as avg_uniformity,
    AVG(pct_bases_30x) as avg_30x_coverage
FROM coverage_metrics
GROUP BY chromosome
ORDER BY 
    CASE 
        WHEN chromosome ~ '^[0-9]+$' THEN chromosome::INTEGER
        ELSE 999 
    END,
    chromosome;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df.head(15)

## Subqueries and Complex Queries

In [None]:
# Warmup 18: Find runs with above-average read counts (scalar subquery)
query = """
SELECT run_id, total_reads, sequencer
FROM sequencing_runs
WHERE total_reads > (SELECT AVG(total_reads) FROM sequencing_runs)
ORDER BY total_reads DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 19: Runs with more than 5 variants in a given cancer gene
# (HAVING filters the groups after aggregation)
query = """
SELECT 
    run_id,
    gene_name,
    COUNT(*) as variant_count
FROM variant_calls
WHERE gene_name IN ('BRCA1', 'BRCA2', 'TP53', 'KRAS')
GROUP BY run_id, gene_name
HAVING COUNT(*) > 5
ORDER BY variant_count DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 20: Patients with failed QC runs
query = """
SELECT 
    p.patient_id,
    p.clinical_indication,
    s.sample_id,
    sr.run_id,
    sr.contamination_pct,
    sr.q30_bases_pct
FROM patients p
JOIN samples s ON p.patient_id = s.patient_id
JOIN sequencing_runs sr ON s.sample_id = sr.sample_id
WHERE sr.pass_qc = FALSE;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

## Date and Time Queries

In [None]:
# Warmup 21: Runs by month
# DATE_TRUNC('month', ...) collapses each run_date to the first of its month;
# GROUP BY / ORDER BY then refer to that output column by its alias.
query = """
SELECT 
    DATE_TRUNC('month', run_date) as month,
    COUNT(*) as runs_count,
    AVG(total_reads) as avg_reads
FROM sequencing_runs
GROUP BY month
ORDER BY month;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 22: Time between sample collection and sequencing
# Subtracting two DATE columns in PostgreSQL yields an integer number of days.
query = """
SELECT 
    s.sample_id,
    s.collection_date,
    sr.run_date,
    sr.run_date - s.collection_date as days_to_sequencing
FROM samples s
JOIN sequencing_runs sr ON s.sample_id = sr.sample_id
ORDER BY days_to_sequencing DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

## Advanced Analytics

In [None]:
# Warmup 23: Variant distribution by type and significance
query = """
SELECT 
    variant_type,
    clinical_significance,
    COUNT(*) as count,
    AVG(quality_score) as avg_quality,
    AVG(depth) as avg_depth
FROM variant_calls
GROUP BY variant_type, clinical_significance
ORDER BY variant_type, count DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 24: QC summary report per sample
# One row per sequencing run; the LEFT JOIN keeps runs that have no variant
# calls (their total_variants is 0 because COUNT ignores NULL variant_id).
query = """
SELECT 
    sr.sample_id,
    sr.sequencer,
    sr.total_reads,
    ROUND((sr.mapped_reads::NUMERIC / sr.total_reads * 100), 2) as mapping_pct,
    ROUND((sr.duplicate_reads::NUMERIC / sr.mapped_reads * 100), 2) as duplicate_pct,
    sr.mean_insert_size,
    sr.q30_bases_pct,
    sr.contamination_pct,
    sr.pass_qc,
    COUNT(vc.variant_id) as total_variants
FROM sequencing_runs sr
LEFT JOIN variant_calls vc ON sr.run_id = vc.run_id
GROUP BY sr.run_id, sr.sample_id, sr.sequencer, sr.total_reads, 
         sr.mapped_reads, sr.duplicate_reads, sr.mean_insert_size,
         sr.q30_bases_pct, sr.contamination_pct, sr.pass_qc
ORDER BY sr.sample_id;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

In [None]:
# Warmup 25: Complete clinical report
# COUNT(DISTINCT CASE ... END) counts the distinct genes that have at least
# one variant in the given significance class; rows that do not match the
# CASE produce NULL and are ignored by COUNT.
query = """
SELECT 
    p.patient_id,
    p.age,
    p.gender,
    p.clinical_indication,
    s.sample_type,
    sr.sequencer,
    sr.pass_qc,
    COUNT(DISTINCT CASE WHEN vc.clinical_significance IN ('Pathogenic', 'Likely pathogenic') 
                        THEN vc.gene_name END) as pathogenic_genes,
    COUNT(DISTINCT CASE WHEN vc.clinical_significance = 'VUS' 
                        THEN vc.gene_name END) as vus_genes
FROM patients p
JOIN samples s ON p.patient_id = s.patient_id
JOIN sequencing_runs sr ON s.sample_id = sr.sample_id
LEFT JOIN variant_calls vc ON sr.run_id = vc.run_id
GROUP BY p.patient_id, p.age, p.gender, p.clinical_indication, 
         s.sample_type, sr.sequencer, sr.pass_qc
ORDER BY p.patient_id;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df

---
# Data Visualizations

## Visualization 1: QC Pass Rate by Sequencer

In [None]:
# Per-sequencer QC pass rate (as a percentage) and number of runs
query = """
SELECT 
    sequencer,
    SUM(CASE WHEN pass_qc THEN 1 ELSE 0 END)::FLOAT / COUNT(*) * 100 as pass_rate,
    COUNT(*) as total_runs
FROM sequencing_runs
GROUP BY sequencer;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot of pass rates (one colour per sequencer, reused in the pie chart)
colors = sns.color_palette('Set2', n_colors=len(df))
ax1.bar(df['sequencer'], df['pass_rate'], color=colors)
ax1.set_ylabel('QC Pass Rate (%)', fontsize=12)
ax1.set_xlabel('Sequencer', fontsize=12)
ax1.set_title('QC Pass Rate by Sequencer Platform', fontsize=14, fontweight='bold')
ax1.set_ylim([0, 105])  # headroom above 100% for the value labels
for i, v in enumerate(df['pass_rate']):
    ax1.text(i, v + 2, f'{v:.1f}%', ha='center', fontweight='bold')

# Pie chart of total runs
ax2.pie(df['total_runs'], labels=df['sequencer'], autopct='%1.0f%%', 
        colors=colors, startangle=90)
ax2.set_title('Distribution of Sequencing Runs', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## Visualization 2: Read Depth Distribution

In [None]:
# Total reads per run, with the platform and QC outcome used for grouping
query = """
SELECT total_reads, sequencer, pass_qc
FROM sequencing_runs;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
df['total_reads_millions'] = df['total_reads'] / 1_000_000  # easier-to-read axis

fig, ax = plt.subplots(figsize=(12, 6))

# Box plot by sequencer, split by QC status
sns.boxplot(data=df, x='sequencer', y='total_reads_millions', 
            hue='pass_qc', palette='Set1', ax=ax)
ax.set_ylabel('Total Reads (Millions)', fontsize=12)
ax.set_xlabel('Sequencer Platform', fontsize=12)
ax.set_title('Read Depth Distribution by Sequencer and QC Status', 
             fontsize=14, fontweight='bold')
ax.legend(title='Passed QC', loc='upper right')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization 3: Coverage Uniformity Across Chromosomes

In [None]:
# Average coverage and uniformity per autosome (the regex keeps only
# numerically named chromosomes, so the INTEGER cast in ORDER BY is safe)
query = """
SELECT 
    chromosome,
    AVG(mean_coverage) as avg_coverage,
    AVG(uniformity) as avg_uniformity
FROM coverage_metrics
WHERE chromosome ~ '^[0-9]+$'
GROUP BY chromosome
ORDER BY chromosome::INTEGER;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Coverage by chromosome
ax1.plot(df['chromosome'], df['avg_coverage'], marker='o', linewidth=2, 
         markersize=8, color='#2E86AB')
# Chromosome labels are strings, which matplotlib places at positions
# 0..n-1, so range(len(df)) lines the shaded area up with the line above.
ax1.fill_between(range(len(df)), df['avg_coverage'], alpha=0.3, color='#2E86AB')
ax1.set_ylabel('Average Coverage', fontsize=12)
ax1.set_xlabel('Chromosome', fontsize=12)
ax1.set_title('Mean Coverage Distribution Across Chromosomes', 
              fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)

# Uniformity by chromosome
ax2.bar(df['chromosome'], df['avg_uniformity'], color='#A23B72', alpha=0.7)
ax2.set_ylabel('Uniformity (%)', fontsize=12)
ax2.set_xlabel('Chromosome', fontsize=12)
ax2.set_title('Coverage Uniformity Across Chromosomes', 
              fontsize=14, fontweight='bold')
ax2.axhline(y=85, color='red', linestyle='--', label='Target (85%)')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization 4: Variant Classification Distribution

In [None]:
# Variant counts for every (clinical significance, variant type) pair
query = """
SELECT 
    clinical_significance,
    variant_type,
    COUNT(*) as count
FROM variant_calls
GROUP BY clinical_significance, variant_type;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
# Reshape to one row per significance and one column per variant type;
# combinations absent from the data become 0 instead of NaN.
pivot_df = df.pivot(index='clinical_significance', columns='variant_type', values='count').fillna(0)

fig, ax = plt.subplots(figsize=(12, 6))

pivot_df.plot(kind='bar', stacked=True, ax=ax, 
              color=['#E63946', '#F1FA8C', '#A8DADC'])
ax.set_ylabel('Variant Count', fontsize=12)
ax.set_xlabel('Clinical Significance', fontsize=12)
ax.set_title('Variant Distribution by Clinical Significance and Type', 
             fontsize=14, fontweight='bold')
ax.legend(title='Variant Type', loc='upper right')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization 5: QC Metrics Correlation Heatmap

In [None]:
# Per-run QC metrics; the column order here must match `labels` below
query = """
SELECT 
    total_reads,
    mapped_reads::FLOAT / total_reads * 100 as mapping_pct,
    duplicate_reads::FLOAT / mapped_reads * 100 as duplicate_pct,
    q30_bases_pct,
    contamination_pct,
    mean_insert_size
FROM sequencing_runs;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
correlation = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=1, ax=ax,
            cbar_kws={'label': 'Correlation Coefficient'})
ax.set_title('QC Metrics Correlation Matrix', fontsize=14, fontweight='bold', pad=20)

# Adjust labels (human-readable names, in the same order as the SELECT list)
labels = ['Total Reads', 'Mapping %', 'Duplicate %', 'Q30 %', 'Contamination %', 'Insert Size']
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_yticklabels(labels, rotation=0)

plt.tight_layout()
plt.show()

## Visualization 6: Pathogenic Variants by Gene

In [None]:
# Pathogenic / likely pathogenic variants summarised per gene
query = """
SELECT 
    gene_name,
    COUNT(*) as pathogenic_count,
    AVG(allele_frequency) as avg_af,
    AVG(quality_score) as avg_qual
FROM variant_calls
WHERE clinical_significance IN ('Pathogenic', 'Likely pathogenic')
GROUP BY gene_name
ORDER BY pathogenic_count DESC;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Bar chart of pathogenic variant counts
colors = plt.cm.Reds(np.linspace(0.4, 0.9, len(df)))
ax1.barh(df['gene_name'], df['pathogenic_count'], color=colors)
ax1.set_xlabel('Number of Pathogenic/Likely Pathogenic Variants', fontsize=12)
ax1.set_ylabel('Gene', fontsize=12)
ax1.set_title('Pathogenic Variant Burden by Gene', fontsize=14, fontweight='bold')
ax1.grid(axis='x', alpha=0.3)

# Scatter plot of AF vs Quality; marker size and colour both encode the count
scatter = ax2.scatter(df['avg_af'], df['avg_qual'], s=df['pathogenic_count']*20,
                     c=df['pathogenic_count'], cmap='Reds', alpha=0.6, edgecolors='black')
ax2.set_xlabel('Average Allele Frequency', fontsize=12)
ax2.set_ylabel('Average Quality Score', fontsize=12)
ax2.set_title('Pathogenic Variant Quality Metrics', fontsize=14, fontweight='bold')

# Add gene labels
for idx, row in df.iterrows():
    ax2.annotate(row['gene_name'], (row['avg_af'], row['avg_qual']), 
                fontsize=9, alpha=0.7)

plt.colorbar(scatter, ax=ax2, label='Variant Count')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Visualization 7: Sample Processing Timeline

In [None]:
# Collection date, run date and the gap between them for every sample/run pair
query = """
SELECT 
    s.sample_id,
    s.collection_date,
    sr.run_date,
    sr.run_date - s.collection_date as turnaround_time,
    sr.pass_qc
FROM samples s
JOIN sequencing_runs sr ON s.sample_id = sr.sample_id
ORDER BY s.collection_date;
"""

# closing() releases the connection even if the query raises
with closing(get_connection()) as conn:
    df = pd.read_sql(query, conn)
# In PostgreSQL, DATE - DATE returns an INTEGER number of days (not an
# interval), so the column is already numeric and has no `.dt` accessor.
df['turnaround_days'] = df['turnaround_time'].astype(int)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Timeline plot (green = passed QC, red = failed QC)
colors = ['green' if qc else 'red' for qc in df['pass_qc']]
ax1.scatter(df['collection_date'], df['turnaround_days'], 
           s=100, c=colors, alpha=0.6, edgecolors='black')
ax1.set_ylabel('Turnaround Time (days)', fontsize=12)
ax1.set_xlabel('Collection Date', fontsize=12)
ax1.set_title('Sample Processing Turnaround Times', fontsize=14, fontweight='bold')
ax1.axhline(y=df['turnaround_days'].mean(), color='blue', linestyle='--', 
           label=f"Mean: {df['turnaround_days'].mean():.1f} days")
ax1.legend()
ax1.grid(True, alpha=0.3)

# Distribution histogram
ax2.hist(df['turnaround_days'], bins=15, color='#457B9D', alpha=0.7, edgecolor='black')
ax2.axvline(df['turnaround_days'].median(), color='red', linestyle='--', linewidth=2,
           label=f"Median: {df['turnaround_days'].median():.0f} days")
ax2.set_xlabel('Turnaround Time (days)', fontsize=12)
ax2.set_ylabel('Frequency', fontsize=12)
ax2.set_title('Distribution of Processing Turnaround Times', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nTurnaround Time Statistics:")
print(f"  Mean: {df['turnaround_days'].mean():.1f} days")
print(f"  Median: {df['turnaround_days'].median():.0f} days")
print(f"  Range: {df['turnaround_days'].min()}-{df['turnaround_days'].max()} days")

---
## Summary

You've completed 25 SQL warmup exercises covering:
- Basic SELECT queries
- WHERE clause filtering
- Aggregation functions (COUNT, AVG, MIN, MAX)
- GROUP BY operations
- JOIN operations (2-table and 3-table joins)
- Calculated fields
- Subqueries
- HAVING clause
- Date/time operations
- Complex multi-table analytics

Plus 7 visualizations showing:
1. QC pass rates by sequencer
2. Read depth distributions
3. Coverage uniformity across chromosomes
4. Variant classification distributions
5. QC metrics correlations
6. Pathogenic variants by gene
7. Sample processing timelines

### Next Steps:
- Practice modifying these queries
- Try combining concepts from multiple exercises
- Create your own queries to explore the data
- Build custom visualizations
- Connect this to real sequencing data pipelines!