# Import libraries and metadata

In [2]:
import pandas as pd

In [3]:
# Working directory: holds the sample metadata table and receives the generated workflow scripts
wd = "/Users/rosekantor/data/awtp2_metagenomics"

In [4]:
# Load the tab-separated per-sample metadata table from the working directory
sample_table_path = f'{wd}/metagenomics_sample_table.tsv'
tab = pd.read_csv(sample_table_path, sep='\t')

In [5]:
# Preview the first five rows of the sample table
tab.head(n=5)

Unnamed: 0,sample_id,read_names,sample_16S,order,band_in_gel_amplicons,sample_code_partial,Sequal_prep_elute_pooled,Duplicate.,need_optimization.,Has_Other_pos_sample,...,sample_or_control,location_code,sample_type,sampler_name,Ext_batch,Extraction_Date,Ext_num_unique,Microconcentrated.,DNA_Ext_conc_ngperuL,kit
0,KNLK_1,KNLK_1_S87,KNLK_SD176,,+,AWTP_2_RO2_biofilm_control_1,N,False,N,N,...,control,RO_BF_control,biofilm,Rose_Kantor,RO1,9/13/2018,A58,Y,0.059,Powersoil Pro
1,KNLK_10,KNLK_10_S73,KNLK_SD021,73.0,+,DWDS_F_282B_bulk_1,N,False,N,,...,sample,DWDS_282B,DEUF,Lauren_Kennedy,7,7/5/2018,42,Y,24.6,PowerSoil Pro
2,KNLK_11,KNLK_11_S81,KNLK_SD028,81.0,+,AWTP_2_BAC_bulk_5,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,2,6/16/2018,15,N,169.0,PowerSoil Pro
3,KNLK_12,KNLK_12_S2,KNLK_SD178,,+,SDS_F_3_bulk_3,N,False,N,,...,sample,Loop_3,DEUF,Scott_Miller,unk,,W7,supernatant,0.154,
4,KNLK_13,KNLK_13_S10,KNLK_SD177,,+,AWTP_2_RO2_biofilm_sep_2,N,False,N,,...,sample,RO_BF_separator,biofilm,Rose_Kantor,19,9/27/2018,A69,N,0.071,


In [6]:
# Re-label the 'Loop_MWTP_inf' location as project_type 'Full_Scale' so that, when we
# group by project_type later, the MWTP samples end up together with DWDS
is_mwtp_influent = tab['location_code'] == 'Loop_MWTP_inf'
tab.loc[is_mwtp_influent, ['project_type']] = 'Full_Scale'

In [7]:
# Distinct project types present after the re-labelling above
set(tab['project_type'])

{'DPWF', 'ECAWPC', 'Experimental', 'Full_Scale', 'Pipe_Loop'}

# Read trimming and QC

## Preliminary FastQC

In [8]:
# Preliminary FastQC was run on the AWTP samples only (project types DPWF and ECAWPC),
# not on everything; all samples are checked after trimming rather than before.
awtp = tab[tab.project_type.isin(['DPWF', 'ECAWPC'])]

# Write one FastQC command per sample (forward + reverse raw reads, 2 threads).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{wd}/read_fastqc.sh', 'w') as f:
    for i in awtp.read_names:
        f.write(f'/opt/bin/bio/FastQC/fastqc -t 2 {i}.1.fastq.gz {i}.2.fastq.gz\n')

Manual examination of the FastQC reports showed that forward reads have adapter contamination at the 3' end. Reverse reads have small amounts of low-quality sequence, as indicated by over-represented sequences that were strings of G's. Trimming dealt with both of these successfully.

A few samples appear to have failed sequencing (high adapter contamination, irregular per-base sequence content, and abnormal quality profiles).

## Trim with BBDuk (bbmap) and sickle

In [9]:
# Tool locations and shared BBDuk settings used to build the trimming commands
bb = '/opt/bin/bio/bbmap/bbduk.sh'
sickle = '/opt/bin/bio/sickle'
fastqc = '/opt/bin/bio/FastQC/fastqc'
adapters = '/opt/bin/bio/bbmap/resources/adapters.fa'
phiX = '/opt/bin/bio/bbmap/resources/phix174_ill.ref.fa.gz'  # only referenced by the disabled phiX filter below
settings = '-Xmx5g k=23 mink=11 hdist=1 tbo tpe t=2'

# group by project type, 'group' will be the name of the project type, 'df' will be a filtered pandas dataframe for just that project
# iterate through each filtered dataframe and make all the reads processing commands
for group, df in tab.groupby('project_type'):
    # one script per project type; the `with` block closes the file, so no explicit close() is needed
    with open(f'{wd}/workflows/read_trim_{group}.sh', 'w') as f:
        for row in df.itertuples():
            r = row.read_names   # raw read file prefix
            s = row.sample_id    # prefix used for all processed outputs

            # adapter trimming of the raw read pair with BBDuk
            trimR = f'{bb} ref={adapters} {settings} ktrim=r ftm=5 -in1={r}.1.fastq.gz -in2={r}.2.fastq.gz -out1={s}.1.noadapt.fastq.gz -out2={s}.2.noadapt.fastq.gz'
            #filtphiX = f'{bb} -Xmx5g t=2 k=31 hdist=1 -in1={s}.1.noadapt.fastq.gz -in2={s}.2.noadapt.fastq.gz out1={s}.1.nophix.fastq.gz out2={s}.2.nophix.fastq.gz ref={phiX} stats={s}_stats_phix.txt'
            # based on testing, reads had no phiX, so this command isn't necessary

            # paired-end quality trimming (minimum length 75); uses the configured sickle path
            # like every other tool here instead of relying on `sickle` being on PATH
            # NOTE(review): confirm /opt/bin/bio/sickle is the executable and not a directory
            qtrim = f'{sickle} pe -l 75 -f {s}.1.noadapt.fastq.gz -r {s}.2.noadapt.fastq.gz -t sanger -o {s}.PE.1.fastq -p {s}.PE.2.fastq -s {s}.SR.fastq'

            # remove the adapter-trimmed intermediates and the singleton reads
            clean = f'rm {s}.1.noadapt.fastq.gz {s}.2.noadapt.fastq.gz {s}.SR.fastq'

            # FastQC on the trimmed pair, then compress each trimmed file
            qc = f'{fastqc} -t 2 {s}.PE.1.fastq {s}.PE.2.fastq'
            gz1 = f'pigz -p 2 {s}.PE.1.fastq'
            gz2 = f'pigz -p 2 {s}.PE.2.fastq'

            cmd = [trimR, qtrim, clean, qc, gz1, gz2]
            f.write('\n'.join(cmd) + '\n')

In [32]:
# Rows whose sample_code_partial repeats an earlier row
# (evaluated but not rendered: only the last expression of a cell is displayed)
repeated_code = tab['sample_code_partial'].duplicated()
tab[repeated_code]
# Rows that carry the 'EXP_0_field_bulk' code
tab[tab['sample_code_partial'] == 'EXP_0_field_bulk']

Unnamed: 0,sample_id,read_names,sample_16S,order,band_in_gel_amplicons,sample_code_partial,Sequal_prep_elute_pooled,Duplicate.,need_optimization.,Has_Other_pos_sample,...,sample_or_control,location_code,sample_type,sampler_name,Ext_batch,Extraction_Date,Ext_num_unique,Microconcentrated.,DNA_Ext_conc_ngperuL,kit
28,KNLK_35,KNLK_35_S12,KNLK_SD001,12.0,+,EXP_0_field_bulk,N,False,N,N,...,control,field_blank,DEUF,Lauren_Kennedy,19.0,9/8/2018,183.0,Y,0.126,PowerSoil Pro
33,KNLK_4,KNLK_4_S25,,25.0,-,EXP_0_field_bulk,,False,N,N,...,,,,,,,,,,


# MASH within-project all-vs-all beta diversity

In [19]:
#using MASH v2.0 (no -I option)
# NOTE: this cell writes to the same read_mash_{group}.sh files as the MASH v2.2 cell
# further down, so whichever of the two cells runs last determines the script contents.
mash = '/opt/bin/bio/mash'

for group, df in tab.groupby('project_type'):
    # one script per project type; the `with` block closes the file, so no explicit close() is needed
    with open(f'{wd}/workflows/read_mash_{group}.sh', 'w') as f:
        for row in df.itertuples():
            s = row.sample_id
            # concatenate forward and reverse trimmed reads into a temporary file named after the sample
            concat = f'cat {s}.PE.1.fastq.gz {s}.PE.2.fastq.gz > {s}'
            # sketch the temporary file as reads (-r), min k-mer copies 2 (-m), sketch size 10000 (-s)
            sketch = f'{mash} sketch -m 2 -r {s} -s 10000'
            # delete the temporary concatenated reads
            clean = f'rm {s}'
            cmd = [concat, sketch, clean]
            f.write('\n'.join(cmd) + '\n')

In [40]:
# using MASH v2.2 (with -I option that allows you to provide a sample name with stdin)
# using stdin allows you to concatenate fwd and rev reads and pipe directly into mash without making intermediate files.
# this is denoted with a dash ("-") after the -r parameter instead of a reads name

mash = '/opt/bin/bio/mash'

for group, df in tab.groupby('project_type'):
    # one script per project type; the `with` block closes the file, so no explicit close() is needed
    with open(f'{wd}/workflows/read_mash_{group}.sh', 'w') as f:
        for row in df.itertuples():
            s = row.sample_id
            # -o gives every sample its own output prefix ({s}.msh). Without it the output
            # name is derived from the input, which is stdin for every sample here, so
            # successive sketches would land on the same file.
            # NOTE(review): confirm the default stdin output name against the installed mash version
            cmd = f'cat {s}.PE.1.fastq.gz {s}.PE.2.fastq.gz | {mash} sketch -m 2 -r - -I {s} -o {s} -s 10000'
            f.write(cmd + '\n')

## mash dist commands:
`mash paste` combines multiple sketches into a single sketch.  The first arg is output name, followed by a list of all the sketch files you want to combine

Command: `mash paste awtp2.msh *msh`

`mash dist` can sketch on the fly or take a sketch as input.  Because we are doing all-vs-all we use the same msh file as the query and reference.

Command: `mash dist awtp2.msh awtp2.msh`

# Assembly

In [31]:
# Build one qsub submission script per project type that assembles each sample with megahit
megahit = '/opt/bin/bio/megahit'
assem_dir = '/data2/other/knelson/assembly'
reads_dir = '/data2/other/knelson/raw.d/trimmed_reads'
for group, df in tab.groupby('project_type'):
    # the `with` block closes the file, so no explicit close() is needed
    with open(f'{wd}/workflows/megahit_{group}.sh', 'w') as f:
        for row in df.itertuples():
            s = row.sample_id
            # the assembly output directory is named after sample_code_partial
            # NOTE(review): sample_code_partial is not unique in the table (e.g. 'EXP_0_field_bulk'
            # appears for two sample_ids), so those samples would target the same output directory
            assem_name = row.sample_code_partial
            # forward reads go to -1 and reverse reads to -2
            # (previously PE.1 was passed for both mates, so the reverse reads were never used)
            assem_cmd = f'{megahit} -t 48 ' \
                        f'-1 {reads_dir}/{s}.PE.1.fastq.gz ' \
                        f'-2 {reads_dir}/{s}.PE.2.fastq.gz ' \
                        f'-o {assem_dir}/{assem_name}'
            # submit through qsub with 48 slots, matching megahit's -t 48
            qcmd = f'echo "{assem_cmd}" | qsub -V -pe smp 48 -N {s}megahit'
            f.write(qcmd + '\n')

## Testing

In [30]:
# ASSEMBLY TIME:
# each recorded time is divided by 60 to report minutes
# first value: KNLK_16 megahit assembly, second value: KNLK_76 assembly
for elapsed in (2523.008783, 15362.135303):
    print(elapsed/60)

42.050146383333335
256.03558838333333


In [35]:
# ASSEMBLY QUALITY:
# remaining percentage after subtracting the listed fractions from 100

# megahit bps on contigs > 1kb
megahit_pct = 100 - 22.37 - 17.85
# idba_ud bp on contigs > 1kb
idba_ud_pct = 100 -  0.02 - 15.88 - 20.93

print(megahit_pct)
print(idba_ud_pct)

59.779999999999994
63.17000000000001


In [61]:
# DPWF samples ordered by their sample_code_partial
is_dpwf = tab['project_type'] == 'DPWF'
tab[is_dpwf].sort_values(by='sample_code_partial')

Unnamed: 0,sample_id,read_names,sample_16S,order,band_in_gel_amplicons,sample_code_partial,Sequal_prep_elute_pooled,Duplicate.,need_optimization.,Has_Other_pos_sample,...,sample_or_control,location_code,sample_type,sampler_name,Ext_batch,Extraction_Date,Ext_num_unique,Microconcentrated.,DNA_Ext_conc_ngperuL,kit
85,KNLK_9,KNLK_9_S65,KNLK_SD147,,+,AWTP_2_BAC_bulk_2,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,2,6/16/2018,13,N,17.5,PowerSoil Pro
25,KNLK_32,KNLK_32_S75,KNLK_SD023,75.0,+,AWTP_2_BAC_bulk_3,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,Test_March11,3/11/2018,W5,supernatant,27.0,
73,KNLK_76,KNLK_76_S79,KNLK_SD160,,+,AWTP_2_BAC_bulk_4,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,2,6/16/2018,14,N,85.6,PowerSoil Pro
2,KNLK_11,KNLK_11_S81,KNLK_SD028,81.0,+,AWTP_2_BAC_bulk_5,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,2,6/16/2018,15,N,169.0,PowerSoil Pro
7,KNLK_16,KNLK_16_S34,KNLK_SD132,,+,AWTP_2_MF_comb_bulk_1,N,False,N,,...,sample,MF_combined,DEUF,Scott_Miller,18,9/8/2018,171,Y,1.07,PowerSoil Pro
20,KNLK_28,KNLK_28_S43,KNLK_SD009,43.0,+,AWTP_2_MF_comb_bulk_2,N,False,N,,...,sample,MF_combined,DEUF,Scott_Miller,18,9/8/2018,172,Y,1.91,PowerSoil Pro
46,KNLK_51,KNLK_51_S53,KNLK_SD142,,+,AWTP_2_MF_comb_bulk_3,N,False,N,,...,sample,MF_combined,DEUF,Scott_Miller,18,9/8/2018,173,Y,4.85,PowerSoil Pro
50,KNLK_55,KNLK_55_S85,KNLK_SD032,85.0,+,AWTP_2_RO2_biofilm_ret_scrape_1,N,False,N,,...,sample,RO_BF_retentate_scrapings,biofilm,Rose_Kantor,2,6/16/2018,9,N,600.0,PowerSoil Pro
37,KNLK_43,KNLK_43_S76,KNLK_SD024,76.0,+,AWTP_2_RO2_biofilm_ret_scrape_2,N,False,N,,...,sample,RO_BF_retentate_scrapings,biofilm,Rose_Kantor,19,9/27/2018,199,N,30.5,PowerSoil Pro
31,KNLK_38,KNLK_38_S36,KNLK_SD006,36.0,+,AWTP_2_RO2_biofilm_sep_1,N,False,N,,...,sample,RO_BF_separator,biofilm,Rose_Kantor,RO1,9/13/2018,A57,Y,1.17,Powersoil Pro


## Assembly post-processing

In [74]:
# Build one post-assembly script per project type: rename contigs, length-filter,
# remove megahit leftovers, and build a bowtie2 index for each sample's assembly.
assem_dir = '/data2/other/knelson/assembly'
for group, df in tab.groupby('project_type'):
    # the `with` block closes the file, so no explicit close() is needed
    with open(f'{wd}/workflows/postAssem_{group}.sh', 'w') as f:
        for row in df.itertuples():
            s = row.sample_id
            assem_name = row.sample_code_partial
            # per-sample assembly directory, shared by every command below
            out_dir = f'{assem_dir}/{assem_name}'

            # replace the fasta headers that start "k141" with "sample_id" and rename file to sample_id_contigs.fa
            rehead = f"sed 's/k141/{s}/g' {out_dir}/final.contigs.fa > {out_dir}/{s}_contigs.fa"

            # filter for only contigs ≥1000 bp
            min1000 = f'pullseq -i {out_dir}/{s}_contigs.fa --min 1000 > {out_dir}/{s}_contigs_min1000.fa'

            # delete extra files from assembly
            clean = f'rm -r {out_dir}/intermediate_contigs/ '\
                    f'{out_dir}/checkpoints.txt '\
                    f'{out_dir}/final.contigs.fa '\
                    f'{out_dir}/done '\
                    f'{out_dir}/options.json'

            # make directory to store bowtie2 indices in
            # (-p so re-running the script does not error when the directory already exists)
            mdbt2 = f'mkdir -p {out_dir}/bt2/'

            # index in prep for bowtie2 mapping
            ind = f'bowtie2-build {out_dir}/{s}_contigs_min1000.fa {out_dir}/bt2/{s}_contigs_min1000.fa'

            cmd = [rehead, min1000, clean, mdbt2, ind]
            f.write('\n'.join(cmd) + '\n')

# Read-mapping across samples of same location

In [67]:
# Build one qsub submission script per project type that maps every sample's reads
# against every assembly from the same location.
bt2 = '/opt/bin/bio/bowtie2'
ssam = '/opt/bin/bio/shrinksam'
assem_dir = '/data2/other/knelson/assembly'
reads_dir = '/data2/other/knelson/raw.d/trimmed_reads'

for group, df in tab.groupby('project_type'):

    # commands for each project saved separately
    # (the `with` block closes the file, so no explicit close() is needed)
    with open(f'{wd}/workflows/mapping_{group}.sh', 'w') as f:

        # group by location and cross-map within location (these reads are most likely to map and will help with binning)
        # NOTE: groupby skips rows whose location_code is missing by default, so such samples get no mapping commands
        for location, dfl in df.groupby('location_code'):

            #commands for each sample within the project
            for row in dfl.itertuples():
                s = row.sample_id                     # sample whose assembly is the mapping reference
                assem_name = row.sample_code_partial  # directory name of that assembly

                # r iterates over every sample at this location, including s itself
                for r in dfl.sample_id:
                    out = f'{assem_dir}/{assem_name}/{s}_contigs_min1000.fa-vs-{r}.sam'
                    bt2ind = f'{assem_dir}/{assem_name}/bt2/{s}_contigs_min1000.fa'
                    r1 = f'{reads_dir}/{r}.PE.1.fastq.gz'
                    r2 = f'{reads_dir}/{r}.PE.2.fastq.gz'
                    # bowtie2 output is piped through shrinksam before being written to the .sam file
                    map_cmd = f'{bt2} -p 48 -x {bt2ind} -1 {r1} -2 {r2} --reorder | {ssam} -v > {out}'
                    qcmd = f'echo "{map_cmd}" | qsub -V -pe smp 48 -N {s}_vs_{r}'
                    f.write(qcmd + '\n')

## Filter, compress, and index mapping files

In [None]:
#### DRAFT — completed from the original unfinished samtools template; review before running ####
# For every SAM file written by the cross-mapping workflow
# ({assem_dir}/{assem_name}/{s}_contigs_min1000.fa-vs-{r}.sam), emit commands that:
#   1. drop unmapped reads (-F 4) and convert to BAM
#   2. sort the BAM (5G memory setting, as in the template)
#   3. index the sorted BAM
#   4. delete the SAM and the unsorted BAM
assem_dir = '/data2/other/knelson/assembly'

for group, df in tab.groupby('project_type'):

    # commands for each project saved separately; written to processMapping_{group}.sh so the
    # bowtie2 submission script mapping_{group}.sh is not overwritten
    with open(f'{wd}/workflows/processMapping_{group}.sh', 'w') as f:

        # same location-wise pairing as the mapping commands, so each (assembly, reads) SAM is covered
        for location, dfl in df.groupby('location_code'):

            #commands for each sample within the project
            for row in dfl.itertuples():
                s = row.sample_id
                assem_name = row.sample_code_partial

                for r in dfl.sample_id:
                    # shared path prefix of the SAM/BAM files for this assembly-vs-reads pair
                    prefix = f'{assem_dir}/{assem_name}/{s}_contigs_min1000.fa-vs-{r}'
                    to_bam = f'samtools view -F 4 -bS {prefix}.sam > {prefix}.bam'
                    sort_bam = f'samtools sort -m 5G {prefix}.bam > {prefix}.sorted.bam'
                    index_bam = f'samtools index {prefix}.sorted.bam'
                    clean = f'rm {prefix}.sam {prefix}.bam'

                    cmd = [to_bam, sort_bam, index_bam, clean]
                    f.write('\n'.join(cmd) + '\n')

# Anvi'o profiling