<a href="https://colab.research.google.com/github/priyadarshinikp1/Genomics/blob/main/genomics_SNP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📦 Install dependencies and build Picard & GATK from source

In [None]:

!sudo apt update
!sudo apt install -y openjdk-17-jdk-headless

# Set JAVA_HOME so Java knows where to look
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"

# Clone and build Picard
!git clone https://github.com/broadinstitute/picard.git
%cd picard
!./gradlew shadowJar
%cd ..

# Clone and build GATK
!git clone https://github.com/broadinstitute/gatk.git
%cd gatk
!./gradlew bundle
%cd ..

# Set paths to the newly built JARs
PICARD_PATH = "./picard/build/libs/picard.jar"
GATK_PATH = "./gatk/build/libs/gatk.jar"

print("✅ Picard and GATK successfully built.")


## Set up paths and parameters

In [None]:
# 📁 Central configuration: the paths and parameters read by the later cells
import os

# --- Tool locations ---
PICARD_PATH = "./picard/build/libs/picard.jar"
GATK_PATH = "./gatk/build/libs/gatk.jar"
SNPEFF_JAR = "./snpEff/snpEff.jar"
SNPEFF_DB = "GRCh38.99"  # database name passed to `snpEff ann`

# --- ANNOVAR installation and its database folder ---
ANNOVAR_DIR = "./annovar"
HUMANDB = ANNOVAR_DIR + "/humandb"

# --- Reference data ---
REFERENCE_GENOME = "./ref/genomic.fna"
KNOWN_SITES = "./Homo_sapiens_assembly38.dbsnp138.vcf"

# --- Working directories ---
INPUT_DIR = "./trimmed_reads"
OUTPUT_DIR = "./output"

# Make sure the results folder exists (no error if it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)


## Utility function to log time

In [None]:
from datetime import datetime

def log_time(msg):
    """Print ``msg`` prefixed with the current local time.

    Output format: ``YYYY-MM-DD HH:MM:SS - <msg>``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("{} - {}".format(stamp, msg))


## FASTQ to Variant Calling Pipeline (Bowtie2 → GATK)

In [None]:
import glob
import os

fastq_files = glob.glob(os.path.join(INPUT_DIR, "*_trimmed.fastq"))

for fastq in fastq_files:
    sample_name = os.path.basename(fastq).replace("_trimmed.fastq", "")
    log_time(f"Processing sample: {sample_name}")

    !bowtie2 -x {REFERENCE_GENOME[:-4]} -U {fastq} -S {OUTPUT_DIR}/{sample_name}.sam
    !samtools view -bS {OUTPUT_DIR}/{sample_name}.sam | samtools sort -o {OUTPUT_DIR}/{sample_name}.sorted.bam
    !samtools index {OUTPUT_DIR}/{sample_name}.sorted.bam

    !java -jar {PICARD_PATH} MarkDuplicates \
        -I {OUTPUT_DIR}/{sample_name}.sorted.bam \
        -O {OUTPUT_DIR}/{sample_name}.marked_duplicates.bam \
        -M {OUTPUT_DIR}/{sample_name}.marked_duplicates.metrics

    !java -jar {PICARD_PATH} AddOrReplaceReadGroups \
        -I {OUTPUT_DIR}/{sample_name}.marked_duplicates.bam \
        -O {OUTPUT_DIR}/reads_with_RG_{sample_name}.sorted.bam \
        -RGID {sample_name} -RGLB lib1 -RGPL illumina -RGSM {sample_name} -RGPU unit1 \
        -CREATE_INDEX true

    !java -jar {GATK_PATH} BaseRecalibrator \
        -R {REFERENCE_GENOME} \
        -I {OUTPUT_DIR}/reads_with_RG_{sample_name}.sorted.bam \
        --known-sites {KNOWN_SITES} \
        -O {OUTPUT_DIR}/{sample_name}.recal_data.table

    !java -jar {GATK_PATH} ApplyBQSR \
        -R {REFERENCE_GENOME} \
        -I {OUTPUT_DIR}/reads_with_RG_{sample_name}.sorted.bam \
        --bqsr-recal-file {OUTPUT_DIR}/{sample_name}.recal_data.table \
        -O {OUTPUT_DIR}/{sample_name}.recalibrated.bam

    !java -jar {GATK_PATH}HaplotypeCaller \
        -R {REFERENCE_GENOME} \
        -I {OUTPUT_DIR}/reads_with_RG_{sample_name}.sorted.bam \
        -O {OUTPUT_DIR}/{sample_name}.vcf.gz


## Variant annotation (snpEff + ANNOVAR)

In [None]:
 !java -Xmx4g -jar {SNPEFF_JAR} ann {SNPEFF_DB} {OUTPUT_DIR}/{sample_name}.vcf.gz > {OUTPUT_DIR}/{sample_name}.snpeff.vcf

    !{ANNOVAR_DIR}/convert2annovar.pl -format vcf4 {OUTPUT_DIR}/{sample_name}.snpeff.vcf > {OUTPUT_DIR}/{sample_name}.avinput

    !{ANNOVAR_DIR}/table_annovar.pl {OUTPUT_DIR}/{sample_name}.avinput {HUMANDB} -buildver hg38 \
        -out {OUTPUT_DIR}/{sample_name}_annovar -remove \
        -protocol refGene,clinvar_20150330,exac03,gnomad30_genome \
        -operation g,f,f,f -nastring . -vcfinput

    !bcftools merge {OUTPUT_DIR}/{sample_name}.snpeff.vcf {OUTPUT_DIR}/{sample_name}_annovar.hg38_multianno.vcf \
        -o {OUTPUT_DIR}/{sample_name}_combined.vcf -O v

In [None]:
# Print a timestamped completion message via the log_time helper.
log_time("Pipeline execution complete. Results are in the output directory.")
