# This notebook provides steps to determine which source metagenomes are suitable for the sourcetracker analysis 
Comparing freshwater, gut, ocean, Pacific Ocean, soil, and wastewater metagenomes 

# Make sure to install parallel and kaiju before continuing 
parallel: 
wget http://ftpmirror.gnu.org/parallel/parallel-20150322.tar.bz2 
bzip2 -dc parallel-20150322.tar.bz2 | tar xvf - 
cd parallel-20150322 
./configure && make && make install

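kaiju:
git clone https://github.com/bioinformatics-centre/kaiju.git
cd kaiju/src
make    # compiles the kaiju programs into kaiju/bin; add that directory to your PATH
cd ../..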

kaijudb: 
mkdir kaijudb 
cd kaijudb 
kaiju-makedb -s nr_euk

In [None]:
# Download the raw reads for every candidate source environment.
# Each source has its own directory for organization purposes.
# Before running, place <source>.txt and wget_<source>.sh (both available in
# Metagenome_Seqs_Input) inside the matching <source>/ directory.
for source in freshwater gut ocean pacific soil wastewater; do
    # -p keeps this cell re-runnable when the directory already exists
    mkdir -p "$source"
    # The subshell restores the working directory after every source (including
    # the last one), and && skips the download if the cd itself fails.
    ( cd "$source" && ./wget_"$source".sh )
done

In [None]:
# In each source directory, classify the paired-end reads with kaiju.
# ls prints the read files in sorted order and parallel passes them to kaiju two
# at a time ({1} -> -i, {2} -> -j). This assumes the two mates of a sample sort
# next to each other (e.g. X_1.fastq.gz, X_2.fastq.gz) -- verify your file names.
# -N2 (--max-replace-args) is the parallel option documented to provide the
# positional replacement strings {1} and {2}.
# {1.} is the first file name without its last extension, so X_1.fastq.gz is
# written to kaiju_out/X_1.fastq.out.
# Make sure the output directory exists before kaiju writes into it.
mkdir -p ~/[path_to_project]/kaiju_out
ls *.fastq.gz | parallel -j15 -N2 kaiju -t ~/[path_to_kaijudb]/nodes.dmp -f ~/[path_to_kaijudb]/kaiju_db_nr_euk.fmi -i {1} -j {2} -o ~/[path_to_project]/kaiju_out/{1.}.out

In [None]:
# In each directory, build one summary table per kaiju .out file.
# NOTE(review): the classification step writes its results to
# ~/[path_to_project]/kaiju_out, so the .out files may live there rather than
# in the source directory -- confirm where to run this.
#   -r species : summarise at species rank
#   -l ...     : ranks included in the printed taxon path
#   {.}        : input name without its extension, so X.out -> X.tsv
ls *.out \
  | parallel -j15 \
      kaiju2table \
        -t ~/[path_to_kaijudb]/nodes.dmp \
        -n ~/[path_to_kaijudb]/names.dmp \
        -e \
        -r species \
        -l domain,superkingdom,phylum,class,order,family,genus,species \
        -o {.}.tsv \
        {}

In [None]:
# In each directory, run the following to convert the kaiju .tsv tables to an OTU table.
# Before running, edit lines 47, 50, 52, 54, and 56 (presumably of
# kaiju_table_to_OTU.py -- confirm) so that each output csv is named after the source,
# e.g. "freshwater_OTU_metagenome.csv"
python3 kaiju_table_to_OTU.py

In [None]:
# Run the following python script to merge all metagenomes into one table.
# To merge an individual domain instead, change "metagenome" to that domain
# (presumably inside Combine_All_Metagenomes.py -- confirm).
python3 Combine_All_Metagenomes.py 

# To decide which metagenomes to keep, use NMDS to assess the dissimilarity between samples:
# 1. Run ST_NMDS_All.Rmd
#    This outputs the initial NMDS with all metagenomes plotted.
# 2. Run ST_NMDS_Sub.Rmd
#    This outputs a subplot of the bottom right corner of the initial plot.
# Based on these plots, we will continue with only "pure" metagenomes.

In [None]:
# Run the following python script to merge the metagenomes using the selected
# columns only. To merge an individual domain instead, change "metagenome" to
# that domain if desired.
python3 Merge_Selected_Metagenomes.py

# Next move on to SourceTracker.
# First install SourceTracker:
# (this line was uncommented prose before; left as-is the shell would try to
# run a command named "First")
pip3 install sourcetracker

In [None]:
# SourceTracker takes 3 arguments: 
# input: OTU table in biom format 
# map: samples assigned to "source" or "sink"
# output: directory for output files 

In [None]:
# First convert the OTU table from csv to tsv.
# NOTE(review): no command is given here for the csv -> tsv step; All_final.txt
# below is presumably the resulting tab-separated table -- confirm.
# Then convert the tsv to biom (--to-json requests the JSON flavour of the format).
biom convert -i All_final.txt -o All_final.biom --table-type="OTU table" --to-json

In [None]:
# Run SourceTracker (gibbs subcommand) with its three arguments:
#   -i All_final.biom   input OTU table in biom format
#   -m map.txt          map assigning each sample to "source" or "sink"
#   -o metagenome_out   directory for the output files
sourcetracker2 gibbs -i All_final.biom -m map.txt -o metagenome_out