In [None]:
# download 7 runs from 2019 study (Quail Ridge and McLaughlin - Woodland and Chaparral)

# SRA run accession -> sample name
# SRR18906547 -> QR_2_2
# SRR18906548 -> QR_1_2
# SRR18906549 -> QR_1_1
# SRR18906550 -> McL_6_2
# SRR18906551 -> McL_6_1
# SRR18906557 -> McL_1_2
# SRR18906558 -> McL_1_1

module load sratoolkit/
# configure toolkit: https://github.com/ncbi/sra-tools/wiki/03.-Quick-Toolkit-Configuration
vdb-config -i
# download files
prefetch SRR18906558
# extract files
fasterq-dump SRR18906558/

# move all 2019 study files to natrsv directory
mkdir SRA
mkdir reads

# process reads
cd reads

# QUALITY FILTERING
nano qual_filter.sh
 
-----------------

#!/bin/bash
#SBATCH -D /home/seuge91/natrsv/reads
#SBATCH --job-name=qual_filter
#SBATCH --nodes=1
#SBATCH --time=04:00:00
#SBATCH --ntasks=8
#SBATCH --mem=32GB
#SBATCH --partition=bmm

# Quality-filter one paired-end sample:
#   1. Trimmomatic: adapter clipping + sliding-window quality trimming
#   2. bbduk: remove reads matching the PhiX reference, for the paired reads
#      and for each set of unpaired (orphaned) reads
#
# Usage: sbatch qual_filter.sh PROJECT_DIR SAMPLE
#   PROJECT_DIR  directory that contains reads/raw/SAMPLE_1.fastq and SAMPLE_2.fastq
#   SAMPLE       file-name prefix of the sample to process

# fail immediately (with a usage message) if an argument is missing
path=${1:?"usage: qual_filter.sh PROJECT_DIR SAMPLE"}
sample=${2:?"usage: qual_filter.sh PROJECT_DIR SAMPLE"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

# loading modules
module load bbmap
source /home/csantosm/initconda
conda activate TRIMMOMATIC

# Strict mode is enabled only after the environment is set up, so that the
# module/conda activation scripts are not run under -e/-u. From here on any
# failing step aborts the job with a non-zero exit status.
set -euo pipefail

adapters=/home/csantosm/databases/TruSeq3-PE.fa
phix_ref=/home/csantosm/databases/phix174_ill.ref.fa

# running commands
cd "${path}"

# make sure every output directory exists before the tools write into it
mkdir -p reads/trimmed reads/unpaired reads/rmphix reads/rmphix_unpaired reads/stats

trimmomatic PE -threads 8 -phred33 \
  "./reads/raw/${sample}_1.fastq" "./reads/raw/${sample}_2.fastq" \
  "./reads/trimmed/${sample}_R1_trimmed.fq" "./reads/unpaired/${sample}_R1_unpaired.fq" \
  "./reads/trimmed/${sample}_R2_trimmed.fq" "./reads/unpaired/${sample}_R2_unpaired.fq" \
  "ILLUMINACLIP:${adapters}:2:30:10" \
  SLIDINGWINDOW:4:30 MINLEN:50

# PhiX removal on the read pairs that survived trimming
bbduk.sh \
  in1="./reads/trimmed/${sample}_R1_trimmed.fq" \
  in2="./reads/trimmed/${sample}_R2_trimmed.fq" \
  out1="./reads/rmphix/${sample}_R1_rmphix.fq" \
  out2="./reads/rmphix/${sample}_R2_rmphix.fq" \
  ref="${phix_ref}" \
  k=31 \
  hdist=1 \
  stats="./reads/stats/${sample}_stats.txt" \
  -Xmx20g

# PhiX removal on the unpaired reads of each mate (same settings, single-end input)
for mate in R1 R2; do
  bbduk.sh \
    in="./reads/unpaired/${sample}_${mate}_unpaired.fq" \
    out="./reads/rmphix_unpaired/${sample}_${mate}_rmphix_unpaired.fq" \
    ref="${phix_ref}" \
    k=31 \
    hdist=1 \
    stats="./reads/stats/${sample}_${mate}_stats.txt" \
    -Xmx20g
done

# finished commands (elapsed time is printed by the EXIT trap)
    
----------------------------

# to run commands

bash
for sample in $(</home/seuge91/natrsv/sample.txt)
do
sbatch \
--output=/home/seuge91/natrsv/reads/log/${sample}.qf.log \
--error=/home/seuge91/natrsv/reads/err/${sample}.qf.err \
/home/seuge91/natrsv/reads/qual_filter.sh /home/seuge91/natrsv $sample
done

----------------------------------------------------------------------------------------------------------------

# ASSEMBLY
cd ~/natrsv/megahit
nano megahit2.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/natrsv/megahit/
#SBATCH --job-name=megahit2
#SBATCH --nodes=1
#SBATCH -t 24:00:00
#SBATCH --ntasks=8
#SBATCH --partition=high2
#SBATCH --mem=50gb

# Assemble one sample with MEGAHIT from its PhiX-filtered paired reads plus
# the two sets of PhiX-filtered unpaired reads.
#
# Usage: sbatch megahit2.sh SAMPLE
#   SAMPLE  file-name prefix of the sample; output goes to ../megahit/SAMPLE

# fail immediately (with a usage message) if the sample name is missing
sample=${1:?"usage: megahit2.sh SAMPLE"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

# loading modules
source /home/csantosm/initconda
conda activate MEGAHIT

# Strict mode only after conda is active (activation scripts are not run
# under -e/-u); a failed assembly now makes the job exit non-zero.
set -euo pipefail

# move to the reads folder
cd /home/seuge91/natrsv/reads/

# --continue lets a resubmitted job resume from an existing output directory
megahit -1 "./rmphix/${sample}_R1_rmphix.fq" \
  -2 "./rmphix/${sample}_R2_rmphix.fq" \
  -r "./rmphix_unpaired/${sample}_R1_rmphix_unpaired.fq,./rmphix_unpaired/${sample}_R2_rmphix_unpaired.fq" \
  -o "../megahit/${sample}" \
  --out-prefix "${sample}" \
  --min-contig-len 10000 --presets meta-large \
  -t 8 --continue
    
----------------------------

# to run commands

bash
for sample in $(</home/seuge91/natrsv/sample.txt)
do
sbatch \
--output=/home/seuge91/natrsv/megahit/log/${sample}.mh.log \
--error=/home/seuge91/natrsv/megahit/err/${sample}.mh.err \
/home/seuge91/natrsv/megahit/megahit2.sh $sample
done

# to run samples that were suspended/unfinished
# make updated sampleIDs list with samples that need to be resubmitted 
# I made redo.txt 

# to run commands

bash
for sample in $(</home/seuge91/natrsv/redo.txt)
do
sbatch \
--output=/home/seuge91/natrsv/megahit/log/${sample}.mh.log \
--error=/home/seuge91/natrsv/megahit/err/${sample}.mh.err \
/home/seuge91/natrsv/megahit/megahit2.sh $sample
done

----------------------------

# to count how many contigs assembled in each sample

grep -rc "k127"

----------------------------------------------------------------------------------------------------------------

# VIRAL DETECTION

cd megahit
mkdir contigs
mkdir renamed_contigs

# move all contig fasta files into one folder
cd /home/seuge91/natrsv/megahit
mv */*contigs.fa contigs
cd contigs

# rename contig files so that it shows exactly what sample each contig came from
module load bbmap
# rename.sh did not work with bbmap alone because java wasn't installed;
# the deprecated java module had to be loaded, so load it BEFORE the loop
module load deprecated/java

# one sample name per line in sample.txt; blank lines are skipped
while IFS= read -r sample
do
  [[ -n "${sample}" ]] || continue
  rename.sh in="${sample}.contigs.fa" out="${sample}.renamed.contigs.fa" prefix="${sample}_contig_"
done < ../../sample.txt

mv -- *renamed.contigs.fa ../renamed_contigs

cd /home/seuge91/natrsv
mkdir vibrant
cd vibrant
mkdir err log

# make vibrant script
nano vibrant.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/natrsv/vibrant/
#SBATCH --job-name=vbrnt
#SBATCH --nodes=1
#SBATCH -t 12:00:00
#SBATCH --ntasks=4
#SBATCH --partition=high2
#SBATCH --mem=50gb

# Run VIBRANT on the renamed contigs of one sample.
#
# Usage: sbatch vibrant.sh SAMPLE
#   SAMPLE  file-name prefix; reads ./megahit/renamed_contigs/SAMPLE.renamed.contigs.fa
#           and writes into ./vibrant/SAMPLE_vibrant

# fail immediately (with a usage message) if the sample name is missing
sample=${1:?"usage: vibrant.sh SAMPLE"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

#activate personal conda env
source /home/csantosm/initconda
conda activate VIBRANT

# Strict mode only after conda is active; a failed run now makes the job exit non-zero.
set -euo pipefail

# move to main folder
cd /home/seuge91/natrsv/

VIBRANT_run.py -i "./megahit/renamed_contigs/${sample}.renamed.contigs.fa" \
  -folder "./vibrant/${sample}_vibrant" \
  -t 4 -f nucl -virome
        
----------------------------

# to run commands

bash
for sample in $(</home/seuge91/natrsv/sample.txt)
do
sbatch \
--output=/home/seuge91/natrsv/vibrant/log/${sample}.vb.log \
--error=/home/seuge91/natrsv/vibrant/err/${sample}.vb.err \
/home/seuge91/natrsv/vibrant/vibrant.sh $sample
done

# prepare files for dereplication
# in vibrant folder, make new directory for vibrant contigs
mkdir vibrant_contigs

# copy all contigs to directory
cp */*/VIBRANT_phages*/*phages_combined.fna ./vibrant_contigs

# in project directory, make drep directory
mkdir drep

# make subdirectory
mkdir all

# make folder for split contigs in each subdirectory
mkdir split_contigs

# move all viral contigs from each sample from vibrant to the folder called "split_contigs" ...
# ...by first moving into the vibrant folder and then into the vibrant_contigs directory made above
cd vibrant
cd vibrant_contigs

# concatenate all viral contigs into a single file
cat *phages_combined.fna > natrsv.vib.contigs.fna

# count how many total contigs
grep -c contig natrsv.vib.contigs.fna

# natrsv = 436 contigs

# move combined file to LNU directory... continue on to next cell

In [None]:
# Move LNU predicted vOTUs to a new directory
mkdir LNU
cd fire/vibrant
cp lnu.vib.contigs.fna ../../LNU/
cd ~/LNU

# starting input files
lnu.vib.contigs.fna
    # count how many total contigs
    grep -c contig lnu.vib.contigs.fna
    # 77,433 contigs 
    grep -c contig natrsv.vib.contigs.fna
    # 436 contigs
# combined input file
    # count how many total contigs
    grep -c contig all.vib.contigs.fna
    # 77,869 contigs
    
# make cdhit directory
mkdir cdhit

# cluster using CD HIT
# rename file to use for CD Hit (input.fna)
cp all.vib.contigs.fna ./input.fna

# create CD HIT script
nano cdhitlnu.sh

-----------------
#!/bin/bash
#SBATCH --job-name=cdhit
#SBATCH --nodes=1
#SBATCH -t 24:00:00
#SBATCH --ntasks=10
#SBATCH --output=cdhit%j.out
#SBATCH --error=cdhit%j.err
#SBATCH --partition=bmh

# Cluster the combined viral contigs (input.fna) with cd-hit-est and write the
# cluster representatives to clustered_lnu.fna (cd-hit also writes a .clstr file).

module load cdhit

cdhit_args=(
  -i input.fna            # input nucleotide fasta
  -o clustered_lnu.fna    # output fasta of cluster representatives
  -c 0.95                 # sequence identity threshold
  -aS 0.85                # alignment coverage required on the shorter sequence
  -d 0                    # description length in .clstr (0 = up to first space)
  -M 14000                # memory limit in MB
  -T 10                   # threads (matches --ntasks above)
)

cd-hit-est "${cdhit_args[@]}"
-----------------

# run CD HIT script for LNU contigs
sbatch --output=./log/lnu.log --error=./err/lnu.err cdhitlnu.sh

# number of contigs after CD HIT: 77,869 --> 70,135

# now run dereplication

# take the concatenated file and move it to the appropriate split_contig folder within a drep/"subset"/split_contigs
mv clustered_lnu.fna ../drep/all/split_contigs

# separate the concatenated fasta file into multiple ones (one file per contig)
# use screen because this will take awhile...
screen
cd ~/LNU/drep/all/split_contigs
# Each header line (">name ...") starts a new output file "<header text>.fa".
# The previous output file is closed before the next one is opened, so awk
# never holds tens of thousands of files open at once.
awk '/^>/ { if (out) close(out); out = substr($0, 2) ".fa" } out { print > out }' *.fna
    # if using screen...you can monitor how close it is to done by counting file names
        ls -1 | wc -l
# detach from screen: ctrl a + d
# to list the screens that are running
screen -ls
# to reattach to the screen
screen -r
# to end screen
exit

# once done, remove the concatenated file from folder (you can move up a folder)
# (this is the clustered_lnu.fna file that was moved into split_contigs for splitting)
mv clustered_lnu.fna ../

# divide contigs into four batches
# because 70,136 is too many for one run of drep, it will run out of memory
# each batch needs to be less than 50,000, so divide into 4 batches

# make the batch folders, each with its own split_contigs folder
# (the destination folders MUST exist before moving: otherwise mv renames each
#  file to a plain file called "split_contigs", overwriting the previous one)
cd ~/LNU/drep && mkdir -p batch{1..4}/split_contigs

# move the first 17,534 remaining files into each batch in turn
cd ~/LNU/drep/all/split_contigs &&
for n in 1 2 3 4
do
  ls | head -n 17534 | xargs -I {} mv {} "../../batch${n}/split_contigs"
done

# make drep script
nano drep.sh
-----------------------------------------------------------------------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/LNU/drep
#SBATCH --job-name=drep
#SBATCH --nodes=1
#SBATCH -t 72:00:00
#SBATCH --ntasks=24
#SBATCH --mem=900GB
#SBATCH --partition=bmh

# Dereplicate the single-contig fasta files of one subset folder with dRep.
#
# Usage: sbatch drep.sh SUBSET
#   SUBSET  folder under /home/seuge91/LNU/drep that contains split_contigs/*.fa;
#           results are written to SUBSET/SUBSET_dRep

# fail immediately (with a usage message) if the subset name is missing
treatment=${1:?"usage: drep.sh SUBSET"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

# activate personal conda env
source /home/csantosm/initconda
conda activate DREP

# Strict mode only after conda is active; a failed run now makes the job exit non-zero.
set -euo pipefail

cd "/home/seuge91/LNU/drep/${treatment}"

# the glob is intentionally unquoted: every .fa file is passed as a genome
dRep dereplicate "./${treatment}_dRep" \
  -g ./split_contigs/*.fa \
  --S_algorithm ANImf \
  -sa 0.95 \
  -nc 0.85 \
  -l 10000 \
  -N50W 0 \
  -sizeW 1 \
  --ignoreGenomeQuality \
  --clusterAlg single \
  -p 24
    
-----------------------------------------------------------------------------------------
# run the script giving the variable of the subset folder that has the split_contigs directory
sbatch drep.sh batch1
sbatch drep.sh batch2
sbatch drep.sh batch3
sbatch drep.sh batch4

-----------------------------------------------------------------------------------------

# number of contigs after drep: 70,135 --> ?? 
# batch1: 17,534 --> 16,858
# batch2: 17,534 --> 16,968
# batch3: 17,534 --> 16,975
# batch4: 17,534 --> 16,969

# Ok whatever, I'm just going to use the CD HIT output...

# how to download the CD HIT output file
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/blodgett_combined/cdhit/output/clustered_all.fna.clstr /mnt/c/Users/segeo/Downloads

In [None]:
# read mapping

# make the directory structure for bowtie
mkdir bowtie
cd bowtie
mkdir err log alignments ref coverm

# input will be final CD HIT file
drep/all/clustered_lnu.fna

# convert this to a .fa file
cp clustered_lnu.fna clustered_all.fa

# make bowtie reference script
nano bowtie_ref.sh

# I needed to increase memory to 200GB because such a large dataset...
----------------------------

#!/bin/bash
#SBATCH --job-name=bt2ref
#SBATCH --nodes=1
#SBATCH -t 12:00:00
#SBATCH --ntasks=1
#SBATCH --mem=200GB
#SBATCH --partition=bmh

# Build the bowtie2 index "all_vibrant_cdhit" in bowtie/ref from
# bowtie/clustered_all.fa. Takes no arguments.

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

module load bowtie2

# Strict mode only after the module is loaded; a failed cd or build now
# makes the job exit non-zero instead of reporting success.
set -euo pipefail

cd /home/seuge91/LNU/bowtie/ref/

bowtie2-build ../clustered_all.fa all_vibrant_cdhit
    
----------------------------

# run the script to index the reference database
sbatch --output=./log/ref.log --error=./err/ref.err bowtie_ref.sh

# map the reads against the database
# make mapping script
nano bowtie_map.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=bt2map
#SBATCH --nodes=1
#SBATCH -t 6:00:00
#SBATCH --ntasks=48
#SBATCH --partition=bmh
#SBATCH --mem=50gb

# Map one sample's PhiX-filtered read pairs against the all_vibrant_cdhit
# bowtie2 index, keep only mapped reads as a sorted + indexed BAM, and delete
# the intermediate SAM file.
#
# Usage: sbatch bowtie_map.sh SAMPLE
#   SAMPLE  file-name prefix of the reads in /home/seuge91/LNU/reads/rmphix

# fail immediately (with a usage message) if the sample name is missing
sample=${1:?"usage: bowtie_map.sh SAMPLE"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

#load modules
module load bowtie2
module load samtools

# Strict mode only after the modules are loaded. pipefail matters here: without
# it a failing "samtools view" would be masked by "samtools sort", and the SAM
# file would be deleted even though the BAM is empty or truncated.
set -euo pipefail

path=/home/seuge91/LNU/
align_dir=${path}bowtie/alignments

cd /home/seuge91/LNU/bowtie/ref/

bowtie2 -x all_vibrant_cdhit -p 48 \
  -1 "${path}reads/rmphix/${sample}_R1_rmphix.fq.gz" \
  -2 "${path}reads/rmphix/${sample}_R2_rmphix.fq.gz" \
  -S "${align_dir}/${sample}.vib.sam" \
  --sensitive

cd "${align_dir}"
# -F 4 drops unmapped reads; output is BAM, then coordinate-sorted
samtools view -F 4 -bS "${sample}.vib.sam" | samtools sort > "${sample}.vib.sI.bam"
samtools index "${sample}.vib.sI.bam"

# only reached when mapping, sorting and indexing all succeeded
rm -- "${sample}.vib.sam"
    
----------------------------

# edit file names first:
for file in *_rmphix_combined.fq.gz; do mv "$file" "${file/_combined/}"; done

# run mapping script
for sample in $(<../sample.txt)
do
  sbatch --output=./log/${sample}.map.log --error=./err/${sample}.map.err bowtie_map.sh $sample
done


In [None]:
# generate vOTU table

# make coverM script
nano coverm.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=coverm
#SBATCH --nodes=1
#SBATCH -t 10:00:00
#SBATCH --ntasks=1
#SBATCH --partition=high2

# Build the vOTU coverage table: trimmed-mean coverage per contig for every
# BAM in bowtie/alignments, keeping contigs with >= 0.75 covered fraction.
# Takes no arguments.

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

# a failed cd or coverm run now makes the job exit non-zero
set -euo pipefail

## you need to run the command from my coverm folder
cd /home/csantosm/software/coverm-x86_64-unknown-linux-musl-0.6.1/

# no trailing slash, so the joined paths below do not contain "//"
path=/home/seuge91/LNU/bowtie

# the glob is intentionally left outside the quotes so every BAM is passed to -b
./coverm contig -m trimmed_mean --min-covered-fraction 0.75 \
  -b "${path}"/alignments/*.bam > "${path}/coverm/all.good.75.tmean.tsv"
    
----------------------------

# run script
sbatch --output=./bowtie/log/coverm.log --error=./bowtie/err/coverm.err coverm.sh

# download table

scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/LNU/bowtie/coverm/*.tsv /mnt/c/Users/segeo/Downloads

In [None]:
# Generating Count Table for Differential Abundance

# make covermcount script
nano covermcount.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=covermcount
#SBATCH --nodes=1
#SBATCH -t 10:00:00
#SBATCH --ntasks=1
#SBATCH --partition=bmh

# Build the read-count table (coverm "count" method) for every BAM in
# bowtie/alignments, for differential abundance analysis. Takes no arguments.

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

# a failed cd or coverm run now makes the job exit non-zero
set -euo pipefail

## you need to run the command from my coverm folder
cd /home/csantosm/software/coverm-x86_64-unknown-linux-musl-0.6.1/

# NOTE(review): this points at the blodgett_combined project, while the
# trimmed-mean coverm script in these notes uses /home/seuge91/LNU/bowtie/ --
# confirm which project's alignments are intended before running.
# (no trailing slash, so the joined paths below do not contain "//")
path=/home/seuge91/blodgett_combined/bowtie

# the glob is intentionally left outside the quotes so every BAM is passed to -b
./coverm contig -m count -b "${path}"/alignments/*.bam > "${path}/coverm/all.count.tsv"
        
----------------------------
    
# Download table
    
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/bowtie/coverm/*.count.tsv /mnt/c/Users/segeo/Downloads

In [None]:
# host prediction

mkdir iphop
mkdir err log split_db results
nano iphop_split.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/LNU/iphop/
#SBATCH --job-name=iphop_split
#SBATCH --nodes=1
#SBATCH -t 1:00:00
#SBATCH --ntasks=1
#SBATCH --partition=high2

# Split the clustered contig fasta into batch files (iphop split) so host
# prediction can be run one batch per job. Takes no arguments.

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

source /home/csantosm/initconda
conda activate IPHOP

# Strict mode only after conda is active; a failed split now makes the job exit non-zero.
set -euo pipefail

full_db=/home/seuge91/LNU/bowtie/clustered_all.fa
split_db=/home/seuge91/LNU/iphop/split_db

iphop split --input_file "${full_db}" --split_dir "${split_db}"

----------------------------

sbatch --output=./log/iphop_split.log --error=./err/iphop_split.err iphop_split.sh

nano iphop_predict.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/LNU/iphop/
#SBATCH --job-name=iphop_predict
#SBATCH --nodes=1
#SBATCH -t 24:00:00
#SBATCH --mem=128GB
#SBATCH --ntasks=24
#SBATCH --partition=high2

# Run iPHoP host prediction on one batch produced by iphop split.
#
# Usage: sbatch iphop_predict.sh BATCH
#   BATCH  batch name (e.g. batch_00000); reads ./split_db/BATCH.fna and
#          writes into ./results/BATCH

# fail immediately (with a usage message) if the batch name is missing
batch=${1:?"usage: iphop_predict.sh BATCH"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

source /home/csantosm/initconda
conda activate IPHOP

# Strict mode only after conda is active; a failed prediction now makes the job exit non-zero.
set -euo pipefail

iphop_db=/home/csantosm/databases/IPHOP_db/Sept_2021_pub

cd /home/seuge91/LNU/iphop

iphop predict --fa_file "./split_db/${batch}.fna" --out_dir "./results/${batch}" \
  --db_dir "${iphop_db}" --num_threads 24
    
----------------------------

batches="batch_00000 batch_00009 batch_00018 batch_00027 batch_00036 batch_00045 batch_00054 batch_00063 batch_00001 batch_00010 batch_00019 batch_00028 batch_00037 batch_00046 batch_00055 batch_00064 batch_00002 batch_00011 batch_00020 batch_00029 batch_00038 batch_00047 batch_00056 batch_00065 batch_00003 batch_00012 batch_00021 batch_00030 batch_00039 batch_00048 batch_00057 batch_00066 batch_00004 batch_00013 batch_00022 batch_00031 batch_00040 batch_00049 batch_00058 batch_00067 batch_00005 batch_00014 batch_00023 batch_00032 batch_00041 batch_00050 batch_00059 batch_00068 batch_00006 batch_00015 batch_00024 batch_00033 batch_00042 batch_00051 batch_00060 batch_00069 batch_00007 batch_00016 batch_00025 batch_00034 batch_00043 batch_00052 batch_00061 batch_00070 batch_00008 batch_00017 batch_00026 batch_00035 batch_00044 batch_00053 batch_00062"
for batch in $batches
do
sbatch --output=./log/${batch}.log --error=./err/${batch}.err iphop_predict.sh $batch
done


# others that didn't finish or whatever...

batches="batch_00087 batch_00077 batch_00073"
for batch in $batches
do
sbatch --output=./log/${batch}.log --error=./err/${batch}.err iphop_predict.sh $batch
done

# combine results from batches
cd results

# get header from one of the batches
# (1 header line for the two m90 tables; the first 10 lines are taken as the
#  header of Detailed_output_by_tool.csv)
head -n 1 batch_00000/Host_prediction_to_genome_m90.csv > all_Host_prediction_to_genome_m90.csv
head -n 1 batch_00000/Host_prediction_to_genus_m90.csv > all_Host_prediction_to_genus_m90.csv
head -n 10 batch_00000/Detailed_output_by_tool.csv > all_Detailed_output_by_tool.csv

# get body of each batch and concatenate to main file
# -q is required on every tail: with several input files, tail otherwise
# writes "==> file <==" banner lines into the combined csv
tail -n +2 -q batch_*/Host_prediction_to_genome_m90.csv >> all_Host_prediction_to_genome_m90.csv
tail -n +2 -q batch_*/Host_prediction_to_genus_m90.csv >> all_Host_prediction_to_genus_m90.csv
tail -n +11 -q batch_*/Detailed_output_by_tool.csv >> all_Detailed_output_by_tool.csv

# download files
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/blodgett_combined/iphop/results/*.csv /mnt/c/Users/segeo/Downloads

In [None]:
## DID NOT NEED TO DO THIS...

# PIGEON
# download PIGEON database, be sure to be logged out

rsync -r --progress /mnt/c/Users/segeo/Downloads/PIGEON.tar.gz seuge91@farm:~/
    
tar -xvzf PIGEON.tar.gz

# How many contigs in PIGEON.fa?
grep ">" PIGEON.fa | wc -l
    # 515,763 - correct!

# add my contigs from my experiments to PIGEON.fa
cat blodgett_PIGEON.fa >> PIGEON.fa   
# changed PIGEON.fa to PIGEON3.0.fa

# now, I need to decide to do vcontact or read mapping...

# read mapping
# add LNU contigs to PIGEON
cat clustered_all.fa >> ../PIGEON3.0.fa
mv PIGEON3.0.fa PIGEON3.0_LNU.fa
nano cdhitPIGEON.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=cdhit
#SBATCH --nodes=1
#SBATCH -t 48:00:00
#SBATCH --ntasks=10
#SBATCH --output=cdhit%j.out
#SBATCH --error=cdhit%j.err
#SBATCH --partition=high2
#SBATCH --mem=50gb

# Cluster the PIGEON + LNU contigs (PIGEON3.0_LNU.fa) with cd-hit-est and
# write the cluster representatives to clustered_PIGEON_lnu.fna.

module load cdhit

cdhit_args=(
  -i PIGEON3.0_LNU.fa           # input nucleotide fasta
  -o clustered_PIGEON_lnu.fna   # output fasta of cluster representatives
  -c 0.95                       # sequence identity threshold
  -aS 0.85                      # alignment coverage required on the shorter sequence
  -M 14000                      # memory limit in MB
  -T 10                         # threads (matches --ntasks above)
)

cd-hit-est "${cdhit_args[@]}"

----------------------------

sbatch --output=./log/cdhit_pigeon_lnu.log --error=./err/cdhit_pigeon_lnu.err cdhitPIGEON.sh


In [None]:
# vcontact

# make vcontact directory for vibrant contig proteins!!
mkdir vcontact

# within vcontact directory, make more directories
mkdir err log

# copy CD-Hit output file into vcontact directory
# (relative path: assumes the current directory is two levels below the project
#  directory, e.g. drep/all -- with a leading "/" the target would be /vcontact
#  at the filesystem root)
cp clustered_lnu.fna ../../vcontact

# predict proteins 
#-i flag means input file, all contigs from CD-HIT 
#-a flag means output file that is protein, needs to be named with end of ".faa" 
 
source /home/csantosm/initconda 
conda activate PRODIGAL 
prodigal -i clustered_lnu.fna -a votus.prodigal.faa -p meta


# create gene2genome.csv file
# make script
nano g2g.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=g2g
#SBATCH --nodes=1
#SBATCH -t 1:00:00
#SBATCH --partition=bmm

# Create the gene-to-genome mapping file (votus.gene2genome.csv) from the
# Prodigal protein fasta, as input for vConTACT2. Takes no arguments.

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

source /home/csantosm/initconda
conda activate VCONTACT2

# Strict mode only after conda is active; a failed cd or run now makes the job exit non-zero.
set -euo pipefail

cd /home/seuge91/heat/vcontact/

vcontact2_gene2genome -p ./votus.prodigal.faa \
  -o ./votus.gene2genome.csv \
  -s Prodigal-FAA

----------------------------

# run script
sbatch --output=../vcontact/log/g2g.log --error=../vcontact/err/g2g.err g2g.sh

# create vcontact script
cd /home/seuge91/heat/scripts/
nano vcontact.sh

----------------------------

#!/bin/bash
#SBATCH -D /home/seuge91/heat/scripts/
#SBATCH --job-name=vcontact
#SBATCH --nodes=1
#SBATCH -t 48:00:00
#SBATCH --ntasks=16
#SBATCH --partition=bmm

# Run vConTACT2 on the predicted vOTU proteins (votus.prodigal.faa) using the
# gene-to-genome mapping votus.gene2genome.csv; results go to vcontact_out.
# Takes no arguments.

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

source /home/csantosm/initconda
conda activate VCONTACT2
# java is loaded for the ClusterONE jar passed via --c1-bin below
module load deprecated/java

# Strict mode only after the environment is set up; a failed cd or run now
# makes the job exit non-zero.
set -euo pipefail

cd /home/seuge91/heat/vcontact/

vcontact2 --raw-proteins votus.prodigal.faa \
  --rel-mode 'Diamond' \
  --db 'ProkaryoticViralRefSeq85-Merged' \
  --proteins-fp votus.gene2genome.csv \
  --pcs-mode MCL \
  --vcs-mode ClusterONE \
  --threads 16 \
  --c1-bin /home/csantosm/miniconda3/bin/cluster_one-1.0.jar \
  --output-dir vcontact_out
    
----------------------------

# run script
sbatch --output=../vcontact/log/vc2.log --error=../vcontact/err/vc2.err vcontact.sh

# download files
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/vcontact/vcontact_out/genome_by_genome_overview.csv /mnt/c/Users/segeo/Downloads
scp seuge91@farm.cse.ucdavis.edu:/home/seuge91/heat/vcontact/vcontact_out/c1.ntw /mnt/c/Users/segeo/Downloads

In [None]:
# map back all reads to PIGEON database

# bow tie reference database in Annie's folder
/home/amhorst/PIGEON/bowtie/

220711_pigeon

# map the reads against the database
# make mapping script
nano bowtie_map.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=bt2map
#SBATCH --nodes=1
#SBATCH -t 6:00:00
#SBATCH --ntasks=48
#SBATCH --partition=high2
#SBATCH --mem=50gb

# Map one sample's PhiX-filtered read pairs against the 220711_pigeon bowtie2
# index, keep only mapped reads as a sorted + indexed BAM in
# bowtie_PIGEON/alignments, and delete the intermediate SAM file.
#
# Usage: sbatch bowtie_map.sh SAMPLE
#   SAMPLE  file-name prefix of the reads in /home/seuge91/LNU/reads/rmphix

# fail immediately (with a usage message) if the sample name is missing
sample=${1:?"usage: bowtie_map.sh SAMPLE"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

#load modules
module load bowtie2
module load samtools

# Strict mode only after the modules are loaded. pipefail matters here: without
# it a failing "samtools view" would be masked by "samtools sort", and the SAM
# file would be deleted even though the BAM is empty or truncated.
set -euo pipefail

path=/home/seuge91/LNU/
align_dir=${path}bowtie_PIGEON/alignments

# make sure the output directory exists before bowtie2 writes into it
mkdir -p "${align_dir}"

cd /home/amhorst/PIGEON/bowtie/

bowtie2 -x 220711_pigeon -p 48 \
  -1 "${path}reads/rmphix/${sample}_R1_rmphix.fq.gz" \
  -2 "${path}reads/rmphix/${sample}_R2_rmphix.fq.gz" \
  -S "${align_dir}/${sample}.vib.sam" \
  --sensitive

cd "${align_dir}"
# -F 4 drops unmapped reads; output is BAM, then coordinate-sorted
samtools view -F 4 -bS "${sample}.vib.sam" | samtools sort > "${sample}.vib.sI.bam"
samtools index "${sample}.vib.sI.bam"

# only reached when mapping, sorting and indexing all succeeded
rm -- "${sample}.vib.sam"
    
----------------------------

# run mapping script
for sample in $(<../sample.txt)
do
  sbatch --output=./log/${sample}.map.log --error=./err/${sample}.map.err bowtie_map.sh $sample
done

----------------------------------------------------------------------------------------------------------------

# map all reads to blodgett

# bowtie2 reference database is in the blodgett_combined folder:
#   /home/seuge91/blodgett_combined/bowtie/ref
# index name used by the mapping script below:
#   all_vibrant_cdhit

# map the reads against the database
# make mapping script
nano bowtie_map.sh

----------------------------

#!/bin/bash
#SBATCH --job-name=bt2map
#SBATCH --nodes=1
#SBATCH -t 6:00:00
#SBATCH --ntasks=48
#SBATCH --partition=high2
#SBATCH --mem=50gb

# Map one sample's PhiX-filtered read pairs against the all_vibrant_cdhit
# bowtie2 index of the blodgett_combined project, keep only mapped reads as a
# sorted + indexed BAM in bowtie_blodgett/alignments, and delete the
# intermediate SAM file.
#
# Usage: sbatch bowtie_map.sh SAMPLE
#   SAMPLE  file-name prefix of the reads in /home/seuge91/LNU/reads/rmphix

# fail immediately (with a usage message) if the sample name is missing
sample=${1:?"usage: bowtie_map.sh SAMPLE"}

# for calculating the amount of time the job takes
# (printed from an EXIT trap so it is reported on failure as well as success)
begin=$(date +%s)
trap 'echo "Time taken: $(( $(date +%s) - begin ))"' EXIT
echo "$HOSTNAME"

#load modules
module load bowtie2
module load samtools

# Strict mode only after the modules are loaded. pipefail matters here: without
# it a failing "samtools view" would be masked by "samtools sort", and the SAM
# file would be deleted even though the BAM is empty or truncated.
set -euo pipefail

path=/home/seuge91/LNU/
align_dir=${path}bowtie_blodgett/alignments

# make sure the output directory exists before bowtie2 writes into it
mkdir -p "${align_dir}"

cd /home/seuge91/blodgett_combined/bowtie/ref

bowtie2 -x all_vibrant_cdhit -p 48 \
  -1 "${path}reads/rmphix/${sample}_R1_rmphix.fq.gz" \
  -2 "${path}reads/rmphix/${sample}_R2_rmphix.fq.gz" \
  -S "${align_dir}/${sample}.vib.sam" \
  --sensitive

cd "${align_dir}"
# -F 4 drops unmapped reads; output is BAM, then coordinate-sorted
samtools view -F 4 -bS "${sample}.vib.sam" | samtools sort > "${sample}.vib.sI.bam"
samtools index "${sample}.vib.sI.bam"

# only reached when mapping, sorting and indexing all succeeded
rm -- "${sample}.vib.sam"
    
----------------------------

# run mapping script
for sample in $(<../sample.txt)
do
  sbatch --output=./log/${sample}.map.log --error=./err/${sample}.map.err bowtie_map.sh $sample
done

# coverage tables
# fish for contigs that were found in my dataset using bbmap
# anything present - put it all in one fasta file


# then cluster with LNU dataset using CD HIT



