# nf-core CHIP-seq pipeline 
20240715 

Sarah Fong
# How to 
1. Set up environment for running nf-core chip-seq pipeline
2. Run nf-core CHIP-seq pipeline using parallelized jobs on Wynton
   
See nf-core documentation: https://nf-co.re/chipseq/2.0.0/

2. run the pipeline once through the command line while connceted to the internet (i.e. NOT in a job submission!)

# Setting up environment

1. Add miniconda into your .bashrc (do once)

        /wynton/group/ahituv/bin/miniconda3/bin/conda init bash

2. source the new bashrc

        source ~/.bashrc 

3. create nf core virtual environment

       conda env create -n env_nf -f /wynton/group/ahituv/bin/pipelines/env/env_nf.yml

4. activate the environment

       conda activate env_nf

5. make qsub output dir
    
        mkdir $HOME/qsub_log/

# Run nf-core ATAC-seq pipeline

## if running pipeline for the first time, you need to do the following steps:
1. export the nf_singluarity_cache
   
       export $NXF_SINGULARITY_CACHEDIR=/wynton/group/ahituv/bin/pipelines/singularity-cache

2. do test run

        nextflow run /wynton/group/ahituv/bin/pipelines/nf-core-chipseq-dev/workflow/main.nf -profile test,singularity --outdir /wynton/group/ahituv/bin/pipelines/nf-core-chipseq-dev/test --igenomes_base /wynton/group/ahituv/bin/pipelines/igenomes/references --aligner bowtie2

3. copy ./work/singularity/* to cache_dir

       cp ./work/singularity/* /wynton/group/ahituv/bin/pipelines/singularity-cache/

## Make sample.csv
   
   - headers: sample,fastq_1,fastq_2,replicate,antibody,control,control_replicate
        - sample: sample label (e.g. control/treated)
        - fastq_1: PE file 1
        - fastq_2: PE file 2
        - replicate: replicate number
        - antibody: antibody used, if any
        - control: if antibody used, specific which control

    example:

    # head /wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/samples.csv
    
    sample,fastq1,fastq2,Antibody,Control
    H3K27ac_noPEMF_1,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_noPEMF_1/H3K27ac_noPEMF_1_CKDL240022566-1A_22CCGWLT4_L7_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_noPEMF_1/H3K27ac_noPEMF_1_CKDL240022566-1A_22CCGWLT4_L7_2.fq.gz,H3K27ac,input_noPEMF_1
    H3K27ac_noPEMF_2,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_noPEMF_2/H3K27ac_noPEMF_2_CKDL240022566-1A_22CCGWLT4_L7_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_noPEMF_2/H3K27ac_noPEMF_2_CKDL240022566-1A_22CCGWLT4_L7_2.fq.gz,H3K27ac,input_noPEMF_2
    H3K27ac_noPEMF_3,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_noPEMF_3/H3K27ac_noPEMF_3_CKDL240022566-1A_22CCGWLT4_L7_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_noPEMF_3/H3K27ac_noPEMF_3_CKDL240022566-1A_22CCGWLT4_L7_2.fq.gz,H3K27ac,input_noPEMF_3
    H3K27ac_PEMF_1,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_PEMF_1/H3K27ac_PEMF_1_CKDL240022566-1A_22CCGWLT4_L7_1.fq.gz,/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/01.RawData/H3K27ac_PEMF_1/H3K27ac_PEMF_1_CKDL240022566-1A_22CCGWLT4_L7_2.fq.gz,H3K27ac,input_PEMF_1
       

##### Save this in a directory that you can find again later!!

## Fill out the following variables
- Run parallelized SGE pipeline with nfcore-atac-wynton.sh 
- NOTE: If your dataset is larger than 8 samples you will need to increase the L_RT time variable 
- For more info on parallel job submission, see https://wynton.ucsf.edu/hpc/scheduler/submit-jobs.html

In [1]:
import os, sys

## Run nextflow with parallelized Wynton SGE job 

In [3]:
# Input/output files
SAMPLE_FILE =  "/wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/samples.csv" # <<FULL_PATH_TO_SAMPLE_FILE>>
# e.g. SAMPLE_FILE = "/wynton/group/ahituv/fongsl/projects/EMF/data/ATAC/WTC11NGN2.1/samples.csv"

OUTDIR = "/wynton/group/ahituv/fongsl/projects/EMF/results/ChIP/WTC11NGN2/" # <<FULL_PATH_TO_OUTPUT_DIRECTORY>>
# e.g. OUTDIR = "/wynton/group/ahituv/fongsl/projects/EMF/results/ATAC"


# Sequencing params
READ_LEN = 150  # PE is 150bp


# Wynton params
N_SLOTS = 8  # n slots to run in parallel 
MEM_PER_SLOT = 8  # Gb per slot. If 8 slots at 8G/slot = 64G total
SH_QSUB_SCRIPT = "/wynton/group/ahituv/bin/pipelines/nf-core-chipseq-dev/nfcore-chip-wynton.sh"  # custom script to run nf-core ATAC-seq pipeline
H_RT="08:20:00"   # amount of time, HH:MM:SS


# put the command together. 
cmd = f"qsub -pe smp {N_SLOTS} -l mem_free={MEM_PER_SLOT}G -l h_rt={H_RT} {SH_QSUB_SCRIPT} {SAMPLE_FILE} {OUTDIR} {READ_LEN}"

# the pipeline command
print(cmd)

qsub -pe smp 8 -l mem_free=8G -l h_rt=08:20:00 /wynton/group/ahituv/bin/pipelines/nf-core-chipseq-dev/nfcore-chip-wynton.sh /wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/samples.csv /wynton/group/ahituv/fongsl/projects/EMF/results/ChIP/WTC11NGN2/ 150


### copy the command above and run in command line. 

if increasing time argument

## Alternative - Run nextflow from the command line

In [8]:
# if running in command line - set cachedir 
NXF_CONDA_CACHEDIR="/wynton/group/ahituv/bin/pipelines/singularity-cache/"
set_cachedir = f"NXF_CONDA_CACHEDIR={NXF_CONDA_CACHEDIR}"
print(set_cachedir)


cmd_cmdline = f"nextflow run /wynton/group/ahituv/bin/pipelines/nf-core-chipseq-dev/workflow/ \
                --input {SAMPLE_FILE} \
                --outdir {OUTDIR} \
                --fasta /wynton/group/ahituv/data/dna/hg38/hg38.fa.gz \
                --gtf /wynton/group/ahituv/data/dna/hg38/hg38.knownGene.gtf.gz \
                --genome GRCh38 \
                --read_length {READ_LEN}  \
                -profile singularity \
                --narrow_peak \
                --container-cache-utilisation amend \
                --igenomes_base /wynton/group/ahituv/bin/pipelines/igenomes/references \
                --aligner bowtie2 \
                -resume"
print(cmd_cmdline)

NXF_CONDA_CACHEDIR=/wynton/group/ahituv/bin/pipelines/singularity-cache/
nextflow run /wynton/group/ahituv/bin/pipelines/nf-core-chipseq-dev/workflow/                 --input /wynton/group/ahituv/fongsl/projects/EMF/data/ChIP/samples.csv                 --outdir /wynton/group/ahituv/fongsl/projects/EMF/results/ChIP/WTC11NGN2/                 --fasta /wynton/group/ahituv/data/dna/hg38/hg38.fa.gz                 --gtf /wynton/group/ahituv/data/dna/hg38/hg38.knownGene.gtf.gz                 --genome GRCh38                 --read_length 150                  -profile singularity                 --narrow_peak                 --container-cache-utilisation amend                 --igenomes_base /wynton/group/ahituv/bin/pipelines/igenomes/references                 --aligner bowtie2                 -resume


### copy the command above and run in command line. 

if increasing time argument