# Quality Filtering

In [None]:
nano qual_filter.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=qual_filter
#SBATCH --nodes=1
#SBATCH --time=04:00:00
#SBATCH --ntasks=8
#SBATCH --mem=32GB
#SBATCH --partition=bmm

# qual_filter.sh - adapter/quality trimming followed by PhiX removal.
#
# Usage: sbatch qual_filter.sh <project_dir> <sample_id>
#   <project_dir>  directory that contains reads/raw/
#   <sample_id>    prefix of reads/raw/<sample_id>_R{1,2}_001.fq.gz
#
# Steps:
#   1. trimmomatic PE  -> reads/trimmed/ (pairs) and reads/unpaired/ (orphans)
#   2. bbduk.sh        -> reads/rmphix/ (pairs) and reads/rmphix_unpaired/
#                         (orphans), with per-run stats in reads/stats/
# The job stops at the first failing step so that later steps never run on
# missing or partial files.

# for calculating the amount of time the job takes
begin=$(date +%s)
echo "$HOSTNAME"

# loading modules
module load bbmap
source /home/csantosm/initconda
conda activate TRIMMOMATIC

# positional arguments (abort with a usage message if either is missing)
path=${1:?usage: qual_filter.sh <project_dir> <sample_id>}
sample=${2:?usage: qual_filter.sh <project_dir> <sample_id>}

# all read paths below are relative to the project directory
cd "${path}" || exit 1

# make sure every output folder exists before the tools try to write to it
mkdir -p ./reads/trimmed ./reads/unpaired ./reads/rmphix \
  ./reads/rmphix_unpaired ./reads/stats || exit 1

phix_ref=/home/csantosm/databases/phix174_ill.ref.fa

# Run bbduk.sh with the PhiX settings shared by all three calls below.
# Arguments: the in=/out=/stats= options specific to one call.
remove_phix() {
  bbduk.sh "$@" \
    ref="${phix_ref}" \
    k=31 \
    hdist=1 \
    -Xmx20g
}

# ILLUMINACLIP:<adapters>:2:30:10 - seed mismatches / palindrome / simple clip thresholds
# SLIDINGWINDOW:4:30              - cut once the mean quality in a 4-base window drops below 30
# MINLEN:50                       - drop reads shorter than 50 bases after trimming
trimmomatic PE -threads 8 -phred33 \
  "./reads/raw/${sample}_R1_001.fq.gz" "./reads/raw/${sample}_R2_001.fq.gz" \
  "./reads/trimmed/${sample}_R1_trimmed.fq.gz" "./reads/unpaired/${sample}_R1_unpaired.fq.gz" \
  "./reads/trimmed/${sample}_R2_trimmed.fq.gz" "./reads/unpaired/${sample}_R2_unpaired.fq.gz" \
  ILLUMINACLIP:/home/csantosm/databases/TruSeq3-PE.fa:2:30:10 \
  SLIDINGWINDOW:4:30 MINLEN:50 \
  || { echo "trimmomatic failed for ${sample}" >&2; exit 1; }

# paired reads
remove_phix \
  in1="./reads/trimmed/${sample}_R1_trimmed.fq.gz" \
  in2="./reads/trimmed/${sample}_R2_trimmed.fq.gz" \
  out1="./reads/rmphix/${sample}_R1_rmphix.fq.gz" \
  out2="./reads/rmphix/${sample}_R2_rmphix.fq.gz" \
  stats="./reads/stats/${sample}_stats.txt" \
  || { echo "bbduk (paired) failed for ${sample}" >&2; exit 1; }

# orphaned R1 / R2 reads
for mate in R1 R2; do
  remove_phix \
    in="./reads/unpaired/${sample}_${mate}_unpaired.fq.gz" \
    out="./reads/rmphix_unpaired/${sample}_${mate}_rmphix_unpaired.fq.gz" \
    stats="./reads/stats/${sample}_${mate}_stats.txt" \
    || { echo "bbduk (${mate} unpaired) failed for ${sample}" >&2; exit 1; }
done

# finished commands

# getting end time to calculate time elapsed
end=$(date +%s)
elapsed=$(( end - begin ))
echo "Time taken: $elapsed"
    
----------------------------

# to run commands (heat)

bash
# Submit one qual_filter.sh job per ID listed in sampleIDs.txt; every job
# writes its stdout/stderr to a file named after the sample.
heat=/home/seuge91/heat
for sample in $(<${heat}/sampleIDs.txt); do
  sbatch --output=${heat}/reads/log/${sample}.qf.log --error=${heat}/reads/err/${sample}.qf.err \
    ${heat}/scripts/qual_filter.sh ${heat}/ $sample
done

# Assembly

In [None]:
cd ~/heat/scripts
nano megahit.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=megahit
#SBATCH --nodes=1
#SBATCH -t 24:00:00
#SBATCH --ntasks=8
#SBATCH --partition=bmh

# megahit.sh - assemble the reads of one sample with MEGAHIT.
# Usage: sbatch megahit.sh <sample_id>

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# conda environment that provides megahit
source /home/csantosm/initconda
conda activate MEGAHIT

# every read path below is relative to the reads folder
cd /home/seuge91/heat/reads/
sample=${1}

reads_paired=./rmphix_combined
reads_single=./rmphix_combined_unpaired

# -1/-2: paired reads, -r: comma-separated unpaired reads,
# -o/--out-prefix: per-sample output folder and file prefix,
# --continue: resume from the last checkpoint found in the output folder
megahit \
  -1 ${reads_paired}/${sample}_R1_rmphix_combined.fq.gz \
  -2 ${reads_paired}/${sample}_R2_rmphix_combined.fq.gz \
  -r ${reads_single}/${sample}_R1_rmphix_combined_unpaired.fq.gz,${reads_single}/${sample}_R2_rmphix_combined_unpaired.fq.gz \
  -o ../megahit/${sample} \
  --out-prefix ${sample} \
  --min-contig-len 10000 \
  --presets meta-large \
  -t 8 \
  --continue

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------

# to run commands
bash
# Submit one megahit.sh job per ID listed in sampleIDs.txt.
heat=/home/seuge91/heat
for sample in $(<${heat}/sampleIDs.txt); do
  sbatch --output=${heat}/megahit/log/${sample}.mh.log --error=${heat}/megahit/err/${sample}.mh.err \
    ${heat}/scripts/megahit.sh $sample
done

# to run samples that were suspended/unfinished
# make updated sampleIDs list with samples that need to be resubmitted 
# I made unfinishedIDs.txt 

nano unfinishedIDs.txt

# to run commands

bash
# Resubmit megahit.sh only for the IDs listed in unfinishedIDs.txt; log and
# err files use the same names as the first submission.
heat=/home/seuge91/heat
for sample in $(<${heat}/unfinishedIDs.txt); do
  sbatch --output=${heat}/megahit/log/${sample}.mh.log --error=${heat}/megahit/err/${sample}.mh.err \
    ${heat}/scripts/megahit.sh $sample
done

----------------------------

# to count how many contigs assembled in each sample
# -r searches recursively and -c prints, per file, the number of lines that
# contain "k127"; no path is given, so run it from the folder holding the assemblies
# NOTE(review): presumably "k127" is the prefix of the final contig headers - confirm

grep -rc "k127" 

# Viral Detection

In [None]:
# create the two collection folders inside the megahit directory
cd megahit
mkdir contigs renamed_contigs

# gather every per-sample contig FASTA into contigs/ and continue from there
cd /home/seuge91/heat/megahit
mv -t contigs */*contigs.fa
cd contigs

# rename contig files so that it shows exactly what sample each contig came from
# (run from /home/seuge91/heat/megahit/contigs, so ../../sampleIDs.txt is the
# project-level sample list)
module load bbmap

# One pass over sampleIDs.txt is enough; the earlier duplicate loop read a
# differently named list (sample.txt) and repeated the same command.
# Output goes to ../renamed_contigs because vibrant.sh reads its input from
# megahit/renamed_contigs/<sample>.renamed.contigs.fa.
for sample in $(<../../sampleIDs.txt); do
  rename.sh \
    in="${sample}.contigs.fa" \
    out="../renamed_contigs/${sample}.renamed.contigs.fa" \
    prefix="${sample}_contig_"
done

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=vbrnt
#SBATCH --nodes=1
#SBATCH -t 12:00:00
#SBATCH --ntasks=4
#SBATCH --partition=bmh

# vibrant.sh - run VIBRANT on the renamed contigs of one sample.
# Usage: sbatch vibrant.sh <sample_id>

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# conda environment that provides VIBRANT_run.py
source /home/csantosm/initconda
conda activate VIBRANT

# input and output paths are relative to the project folder
cd /home/seuge91/heat/
sample=${1}

contigs_fa=./megahit/renamed_contigs/${sample}.renamed.contigs.fa
out_dir=./vibrant/${sample}_vibrant

# -i: input FASTA, -folder: output folder, -t: threads,
# -f nucl: nucleotide input, -virome: VIBRANT's virome mode
VIBRANT_run.py \
  -i ${contigs_fa} \
  -folder ${out_dir} \
  -t 4 \
  -f nucl \
  -virome

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------

# to run commands

bash
for sample in $(</home/seuge91/heat/sampleIDs.txt)
do
sbatch \
--output=/home/seuge91/heat/vibrant/log/${sample}.vb.log \
--error=/home/seuge91/heat/vibrant/err/${sample}.vb.err \
/home/seuge91/heat/scripts/vibrant.sh $sample
done

# Dereplication

In [None]:
# prepare files for dereplication
# in vibrant folder, make new directory for vibrant contigs
mkdir updated_vibrant_contigs

# copy every sample's combined phage FASTA into that directory
# (the destination has to be the directory created above, which is also the
# one the later steps cd into)
cp */*/VIBRANT_phages*/*phages_combined.fna ./updated_vibrant_contigs

# build the drep directory, its "all" subset folder and the split_contigs
# folder inside it in one step; an absolute path is used so the result is
# /home/seuge91/heat/updated_drep/all/split_contigs no matter where this is
# run from (three separate mkdir calls without cd in between would create
# the folders side by side instead of nested)
mkdir -p /home/seuge91/heat/updated_drep/all/split_contigs

# go to the folder that holds the per-sample VIBRANT contigs:
# first into vibrant, then into updated_vibrant_contigs
cd vibrant
cd updated_vibrant_contigs

# merge all per-sample viral contigs into one FASTA
cat *phages_combined.fna > all.vib.contigs.fna

# total number of contigs (lines containing "contig")
grep -c contig < all.vib.contigs.fna

# CD-HIT is run first; its script expects the input to be called input.fna
cp all.vib.contigs.fna input.fna

# create CD HIT script

# create CD HIT script
nano cdhit.sh

-----------------
#!/bin/bash
#SBATCH --job-name=cdhit
#SBATCH --nodes=1
#SBATCH -t 24:00:00
#SBATCH --ntasks=10
#SBATCH --output=cdhit%j.out
#SBATCH --error=cdhit%j.err
#SBATCH --partition=bmh

# cdhit.sh - cluster the contigs in input.fna with cd-hit-est.
# input.fna and clustered_heat.fna are relative paths (the script has no cd
# and no -D directive), so submit it from the folder that holds input.fna.

module load cdhit

# -c: sequence identity threshold, -aS: alignment coverage of the shorter
# sequence, -M: memory limit in MB, -T: number of threads
cd-hit-est \
  -i input.fna \
  -o clustered_heat.fna \
  -c 0.95 \
  -aS 0.85 \
  -M 7000 \
  -T 10
-----------------

# run CD HIT script
sbatch cdhit.sh

# number of contigs after CD HIT: 141,638 --> 34,948

# now run dereplication

# move the clustered file into drep/<subset>/split_contigs and work from there
# (the stray "cd" word that used to sit in the mv command made mv look for a
# file named "cd" - that was the "weird error")
split_dir=/home/seuge91/heat/updated_drep/all/split_contigs
mv clustered_heat.fna "${split_dir}"
cd "${split_dir}" || echo "could not enter ${split_dir}" >&2

# separate the concatenated fasta file into one <header>.fa file per contig
# use screen if you have a lot of contigs (this will take awhile)
# close() releases each finished file so awk does not hit the open-file limit
# when there are tens of thousands of contigs (assumes headers are unique,
# as they are after CD-HIT clustering)
awk '/^>/ { if (OUT) close(OUT); OUT = substr($0, 2) ".fa" } OUT { print > OUT }' clustered_heat.fna

    # if using screen...you can monitor how close it is to done by counting file names
        ls -1 | wc -l

# once done, remove the concatenated file from folder (you can move up a folder)
mv clustered_heat.fna ../

# make updated_drep script
nano updated_drep.sh
-----------------------------------------------------------------------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/updated_drep
#SBATCH --job-name=drep
#SBATCH --nodes=1
#SBATCH -t 72:00:00
#SBATCH --ntasks=24
#SBATCH --mem=900GB
#SBATCH --partition=bmh

# updated_drep.sh - dereplicate the single-contig FASTA files of one subset.
# Usage: sbatch updated_drep.sh <subset>
#   <subset> is a folder under /home/seuge91/heat/updated_drep that contains
#   split_contigs/; results are written to <subset>/<subset>_dRep.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# conda environment that provides dRep
source /home/csantosm/initconda
conda activate DREP

treatment=${1}

cd /home/seuge91/heat/updated_drep/${treatment}

# -g: one FASTA per contig, -sa: ANI threshold, -nc: coverage threshold,
# -l: minimum length, -N50W/-sizeW: scoring weights, -p: threads
dRep dereplicate ./${treatment}_dRep \
  -g ./split_contigs/*.fa \
  --S_algorithm ANImf \
  -sa 0.95 -nc 0.85 \
  -l 10000 \
  -N50W 0 -sizeW 1 \
  --ignoreGenomeQuality \
  --clusterAlg single \
  -p 24

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
-----------------------------------------------------------------------------------------

# run the script giving the variable of the subset folder that has the split_contigs directory
# ("all" becomes ${treatment} inside updated_drep.sh, i.e. it works in
# /home/seuge91/heat/updated_drep/all)
sbatch updated_drep.sh all

-----------------------------------------------------------------------------------------

# number of contigs after drep: 34,948 --> 18,869

# to zip all.drep.contigs.fa file and download so that database of dereplicated vOTUs can be uploaded to zenodo
# -k keeps the uncompressed file next to the new .gz
gzip -k all.drep.contigs.fa
# NOTE(review): presumably run from the local machine, since the destination is a local /mnt/c path - confirm
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/updated_bowtie/all.drep.contigs.fa.gz /mnt/c/Users/segeo/Downloads

# Read Mapping

In [None]:
# make the directory structure for bowtie
# (the folder is called updated_bowtie because every later script and
# download command uses /home/seuge91/heat/updated_bowtie)
mkdir updated_bowtie
cd updated_bowtie
mkdir err log alignments ref coverm

# concatenate all the dereplicated vOTUs into a single fasta file and save the new file in the bowtie folder.

# the name of the new database file is all.drep.contigs.fa (updated_heat)

# the redirect target must be on the same logical line as the ">", hence the "\"
cat /home/seuge91/heat/updated_drep/all/all_dRep/dereplicated_genomes/* \
  > /home/seuge91/heat/updated_bowtie/all.drep.contigs.fa

# make bowtie reference script 
cd /home/seuge91/heat/scripts
nano updated_bowtie_ref.sh

# updated_bowtie_ref.sh
----------------------------

#!/bin/bash
#SBATCH --job-name=bt2ref
#SBATCH --nodes=1
#SBATCH -t 12:00:00
#SBATCH --ntasks=1
#SBATCH --partition=bmh

# updated_bowtie_ref.sh - build the bowtie2 index for the dereplicated vOTUs.
# The index files are written to updated_bowtie/ref/ with the prefix
# updated_all_vibrant_drep.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

module load bowtie2

# work inside ref/ so the index lands there; the FASTA sits one level up
cd /home/seuge91/heat/updated_bowtie/ref/

votu_fasta=../all.drep.contigs.fa
index_prefix=updated_all_vibrant_drep
bowtie2-build ${votu_fasta} ${index_prefix}

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------

cd /home/seuge91/heat/scripts
# stdout and stderr both go under updated_bowtie (stderr previously pointed
# at ../bowtie/err, a different folder from the one holding the log)
sbatch --output=../updated_bowtie/log/ref.log --error=../updated_bowtie/err/ref.err updated_bowtie_ref.sh

# map the reads against the database
# make mapping script

cd /home/seuge91/heat/scripts
nano updated_bowtie_map.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=bt2map
#SBATCH --nodes=1
#SBATCH -t 2:00:00
#SBATCH --ntasks=48
#SBATCH --partition=bmh

# updated_bowtie_map.sh - map one sample's paired reads against the vOTU
# index and keep a sorted, indexed BAM of the mapped reads.
#
# Usage: sbatch updated_bowtie_map.sh <sample_id>
#
# Output: updated_bowtie/alignments/<sample_id>.vib.sI.bam (+ .bai)
# The intermediate SAM is deleted only after the BAM was sorted and indexed
# successfully, so a failed conversion never costs the alignment.

# for calculating the amount of time the job takes
begin=$(date +%s)
echo "$HOSTNAME"

#load modules
module load bowtie2
module load samtools

path=/home/seuge91/heat/
sample=${1:?usage: updated_bowtie_map.sh <sample_id>}

# bowtie2 -x takes the index prefix relative to the current directory
cd /home/seuge91/heat/updated_bowtie/ref/ || exit 1

bowtie2 -x updated_all_vibrant_drep -p 48 \
  -1 "${path}reads/rmphix/${sample}_R1_rmphix.fq.gz" \
  -2 "${path}reads/rmphix/${sample}_R2_rmphix.fq.gz" \
  -S "${path}updated_bowtie/alignments/${sample}.vib.sam" \
  --sensitive \
  || { echo "bowtie2 failed for ${sample}" >&2; exit 1; }

cd /home/seuge91/heat/updated_bowtie/alignments || exit 1

# pipefail: a failing "samtools view" must not be hidden by a successful sort
set -o pipefail

# -F 4 drops unmapped reads; -b writes BAM; the result is sorted and indexed
if samtools view -F 4 -bS "${sample}.vib.sam" | samtools sort > "${sample}.vib.sI.bam" \
  && samtools index "${sample}.vib.sI.bam"; then
  rm -- "${sample}.vib.sam"
else
  echo "BAM conversion failed for ${sample}; keeping ${sample}.vib.sam" >&2
  exit 1
fi

# getting end time to calculate time elapsed
end=$(date +%s)
elapsed=$(( end - begin ))
echo "Time taken: $elapsed"

----------------------------

# run mapping script
cd /home/seuge91/heat/scripts

# Submit one mapping job per ID listed in ../sampleIDs.txt.
bt_dir=../updated_bowtie
for sample in $(<../sampleIDs.txt); do
  sbatch \
    --output=${bt_dir}/log/${sample}.map.log \
    --error=${bt_dir}/err/${sample}.map.err \
    updated_bowtie_map.sh $sample
done

# Generate vOTU table

In [None]:
# make coverM script
nano coverm_tmp.sh
nano coverm.sh

nano updated_coverm.sh
----------------------------

#!/bin/bash
#SBATCH --job-name=coverm
#SBATCH --nodes=1
#SBATCH -t 10:00:00
#SBATCH --ntasks=1
#SBATCH --partition=bmh

# updated_coverm.sh - build the vOTU coverage table (trimmed mean, contigs
# need at least 75% of their length covered) from every BAM in
# updated_bowtie/alignments/.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# the coverm binary is called as ./coverm, so run from its install folder
cd /home/csantosm/software/coverm-x86_64-unknown-linux-musl-0.6.1/

bowtie_dir=/home/seuge91/heat/updated_bowtie/

./coverm contig \
  -m trimmed_mean \
  --min-covered-fraction 0.75 \
  -b ${bowtie_dir}/alignments/*.bam \
  > ${bowtie_dir}/coverm/all.75.tmean.tsv

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------

# run script
# (the second option has to be --error; giving --output twice sent everything
# to the .err file and left stderr at its default location)

cd /home/seuge91/heat/scripts
sbatch --output=../updated_bowtie/log/coverm.log --error=../updated_bowtie/err/coverm.err updated_coverm.sh

# download table

scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/updated_bowtie/coverm/*.tsv /mnt/c/Users/segeo/Downloads

# Generating Count Table for Differential Abundance

In [None]:
# make covermcount script
nano updated_covermcount.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=covermcount
#SBATCH --nodes=1
#SBATCH -t 10:00:00
#SBATCH --ntasks=1
#SBATCH --partition=bmh

# updated_covermcount.sh - build the per-contig read-count table from every
# BAM in updated_bowtie/alignments/.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# the coverm binary is called as ./coverm, so run from its install folder
cd /home/csantosm/software/coverm-x86_64-unknown-linux-musl-0.6.1/

bowtie_dir=/home/seuge91/heat/updated_bowtie/

./coverm contig \
  -m count \
  -b ${bowtie_dir}/alignments/*.bam \
  > ${bowtie_dir}/coverm/all.count.tsv

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------

# Download table
    
# only the *.count.tsv table(s) in updated_bowtie/coverm are copied
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/updated_bowtie/coverm/*.count.tsv /mnt/c/Users/segeo/Downloads

# Host Prediction

In [None]:
#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=iphop_split
#SBATCH --nodes=1
#SBATCH -t 1:00:00
#SBATCH --ntasks=1
#SBATCH --partition=bmm

# iphop_split - split the dereplicated vOTU database into batches for
# "iphop predict". Takes no arguments: the input FASTA and the output
# folder are fixed below (the unused positional-argument assignment was
# dropped).

# for calculating the amount of time the job takes
begin=$(date +%s)
echo "$HOSTNAME"

source /home/csantosm/initconda
conda activate IPHOP

full_db=/home/seuge91/heat/updated_bowtie/all.drep.contigs.fa
split_db=/home/seuge91/heat/iphop/split_db

iphop split --input_file "${full_db}" --split_dir "${split_db}"

#getting end time to calculate time elapsed
end=$(date +%s)
elapsed=$(( end - begin ))
echo "Time taken: $elapsed"

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=iphop_predict
#SBATCH --nodes=1
#SBATCH -t 24:00:00
#SBATCH --mem=128GB
#SBATCH --ntasks=24
#SBATCH --partition=bmm

# iphop_predict - run "iphop predict" on one batch of the split database.
# Usage: sbatch iphop_predict.sh <batch>      (e.g. batch_00000)
# Reads  iphop/split_db/<batch>.fna, writes to iphop/results/<batch>.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# conda environment that provides iphop
source /home/csantosm/initconda
conda activate IPHOP

batch=${1}
iphop_db=/home/csantosm/databases/IPHOP_db/Sept_2021_pub

# input and output paths are relative to the iphop folder
cd /home/seuge91/heat/iphop

iphop predict \
  --fa_file ./split_db/${batch}.fna \
  --out_dir ./results/${batch} \
  --db_dir ${iphop_db} \
  --num_threads 24

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------
    
cd /home/seuge91/heat/scripts

# Submit one iphop_predict.sh job per split batch, in the same order as
# before: batch_00001 .. batch_00018 first, then batch_00000.
for batch in batch_000{01..18} batch_00000; do
  sbatch \
    --output=../iphop/log/${batch}.log \
    --error=../iphop/err/${batch}.err \
    iphop_predict.sh $batch
done

# combine results from batches
#
# Merge the per-batch iPHoP tables into single all_* files.
# Arguments: $1 - folder holding the batch_*/ result folders
#                 (default: /home/seuge91/heat/iphop/results)
# Outputs:   all_Host_prediction_to_genome_m90.csv,
#            all_Host_prediction_to_genus_m90.csv and
#            all_Detailed_output_by_tool.csv inside that folder
# Returns:   non-zero if the folder or any expected file is missing
# The body runs in a subshell, so the caller's working directory is untouched.
combine_iphop_results() (
  results_dir=${1:-/home/seuge91/heat/iphop/results}
  cd "${results_dir}" || exit 1

  # one header line (taken from batch_00000), then the body of every batch
  for table in Host_prediction_to_genome_m90.csv Host_prediction_to_genus_m90.csv; do
    head -n 1 "batch_00000/${table}" > "all_${table}" || exit 1
    # -q keeps tail from writing "==> file <==" banners into the table
    tail -q -n +2 batch_*/"${table}" >> "all_${table}" || exit 1
  done

  # the detailed table keeps its first 10 lines as header
  head -n 10 batch_00000/Detailed_output_by_tool.csv > all_Detailed_output_by_tool.csv || exit 1
  tail -q -n +11 batch_*/Detailed_output_by_tool.csv >> all_Detailed_output_by_tool.csv || exit 1
)

combine_iphop_results

# download files
# copies every .csv that sits directly in iphop/results
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/iphop/results/*.csv /mnt/c/Users/segeo/Downloads

# vcontact

In [None]:
# make vcontact directory for vibrant contig proteins, with its err/ and log/
# folders inside it (absolute paths, so it does not matter where this is run)
mkdir -p /home/seuge91/heat/vcontact/err /home/seuge91/heat/vcontact/log

# concatenate all viral contigs from drep into a single file, written
# straight into the vcontact directory (keeping the output out of the input
# folder means a rerun cannot pick up allcontigs.drep.fa through the *.fa glob)
cat /home/seuge91/heat/updated_drep/all/all_dRep/dereplicated_genomes/*.fa \
  > /home/seuge91/heat/vcontact/allcontigs.drep.fa

# everything below works inside the vcontact directory
cd /home/seuge91/heat/vcontact

# count how many total contigs
grep -c contig allcontigs.drep.fa
#18,869

# predict proteins
#-i flag means input file, all contigs from drep
#-a flag means output file that is protein, needs to be named with end of ".faa"

source /home/csantosm/initconda
conda activate PRODIGAL
prodigal -i allcontigs.drep.fa -a votus.prodigal.faa -p meta


# create gene2genome.csv file
# make script (named g2g.sh - that is the file name the sbatch command below submits)
nano g2g.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=g2g
#SBATCH --nodes=1
#SBATCH -t 1:00:00
#SBATCH --partition=bmm

# g2g.sh - build the gene-to-genome mapping that vcontact2 needs from the
# Prodigal protein FASTA.
# Reads  vcontact/votus.prodigal.faa, writes vcontact/votus.gene2genome.csv.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# conda environment that provides vcontact2_gene2genome
source /home/csantosm/initconda
conda activate VCONTACT2

cd /home/seuge91/heat/vcontact/

proteins=./votus.prodigal.faa
mapping=./votus.gene2genome.csv

# -p: protein FASTA, -o: output csv, -s: source format of the proteins
vcontact2_gene2genome -p ${proteins} -o ${mapping} -s Prodigal-FAA

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"

----------------------------

# run script
sbatch --output=../vcontact/log/g2g.log --error=../vcontact/err/g2g.err g2g.sh

# create vcontact script (named vcontact.sh - that is the file name the
# sbatch command further down submits)
cd /home/seuge91/heat/scripts/
nano vcontact.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=vcontact
#SBATCH --nodes=1
#SBATCH -t 48:00:00
#SBATCH --ntasks=16
#SBATCH --partition=bmm

# vcontact.sh - cluster the vOTU proteins with vcontact2.
# Reads  votus.prodigal.faa and votus.gene2genome.csv from the vcontact
# folder and writes the results to vcontact/vcontact_out.

# start the clock and report the node the job is running on
begin=$(date +%s)
echo "${HOSTNAME}"

# conda environment with vcontact2, plus the java module
source /home/csantosm/initconda
conda activate VCONTACT2
module load deprecated/java

cd /home/seuge91/heat/vcontact/

cluster_one_jar=/home/csantosm/miniconda3/bin/cluster_one-1.0.jar

vcontact2 \
  --raw-proteins votus.prodigal.faa \
  --proteins-fp votus.gene2genome.csv \
  --rel-mode 'Diamond' \
  --db 'ProkaryoticViralRefSeq85-Merged' \
  --pcs-mode MCL \
  --vcs-mode ClusterONE \
  --c1-bin ${cluster_one_jar} \
  --threads 16 \
  --output-dir vcontact_out

# report the elapsed wall-clock time in seconds
end=$(date +%s)
echo "Time taken: $(( end - begin ))"
    
----------------------------

# run script
# the ../vcontact paths are relative, so submit from /home/seuge91/heat/scripts
sbatch --output=../vcontact/log/vc2.log --error=../vcontact/err/vc2.err vcontact.sh

# download files
# two files are fetched from vcontact_out: the genome overview table and the c1.ntw network file
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/vcontact/vcontact_out/genome_by_genome_overview.csv /mnt/c/Users/segeo/Downloads
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/vcontact/vcontact_out/c1.ntw /mnt/c/Users/segeo/Downloads