# Import libraries and metadata

In [2]:
import pandas as pd
import numpy as np
import csv as csv

In [3]:
# paths to data
# wd: local data directory (the sample metadata table is read from here)
# wf: local directory that generated command scripts (*.sh) are written to
# assem_dir / reads_dir / mash_dir: cluster-side locations used when building per-sample paths
wd = '/Users/rosekantor/data/awtp2_metagenomics'
wf = '/Users/rosekantor/work/awtp2/workflows'
assem_dir = '/groups/banfield/projects/human/drinkingwater/assembly'
reads_dir = '/groups/banfield/projects/human/drinkingwater/raw.d/trimmed_reads'
mash_dir = '/groups/banfield/projects/human/drinkingwater/raw.d/mash_analysis'

# paths to software
# bb is a command prefix (executable plus shared bbduk options), not just a path
bb = '/shared/software/bin/bbduk.sh -Xmx5g k=23 mink=11 hdist=1 tbo tpe t=2'
sickle = '/shared/software/bin/sickle'
fastqc = '/shared/software/bin/fastqc'
# reference sequences shipped with bbmap: adapters to trim and the phiX genome
adapters = '/shared/software/bbmap/v38.78/resources/adapters.fa'
phiX = '/shared/software/bbmap/v38.78/resources/phix174_ill.ref.fa.gz'

mash = '/shared/software/bin/mash'

# assemblers
megahit = '/shared/software/bin/megahit'
metaspades = '/shared/software/bin/spades/bin/metaspades.py'

# read mapping: bowtie2 and shrinksam
bt2 = '/shared/software/bin/bowtie2'
ssam = '/shared/software/bin/shrinksam'

# anvi-profile command prefix with the shared profiling options
anvip = 'anvi-profile --min-contig-length 1000 --skip-SNV-profiling -T 48'

In [4]:
# load the tab-separated sample metadata table
tab = pd.read_csv(f'{wd}/metagenome_info_tables/metagenomics_sample_table.tsv', sep='\t')

# per-sample file locations: assembly files live under
# <assem_dir>/<sample_code_partial>/<sample_id>_*, trimmed reads under <reads_dir>/<sample_id>.PE.N.fastq.gz
sample_keys = tab[['sample_code_partial', 'sample_id']]
tab['path_prefix'] = sample_keys.apply(lambda parts: f'{assem_dir}/' + '/'.join(parts) + '_', axis=1)
tab['assem'] = sample_keys.apply(lambda parts: f'{assem_dir}/' + '/'.join(parts) + '_contigs_min1000.fa', axis=1)
for mate in ('1', '2'):
    tab[f'r{mate}'] = tab['sample_id'].apply(lambda sid, mate=mate: f'{reads_dir}/' + sid + f'.PE.{mate}.fastq.gz')

# relabel 'Loop_MWTP_inf' as Full_Scale so that grouping by project_type later keeps MWTP with DWDS
is_mwtp_influent = tab['location_code'] == 'Loop_MWTP_inf'
tab.loc[is_mwtp_influent, 'project_type'] = 'Full_Scale'

# sample_code_partial has no duplicates except for two controls, so the controls get
# dedicated short names; real samples fall back to their sample_code_partial below
control_short_names = {
    'KNLK_1': 'neg_RO_bf',
    'KNLK_2': 'neg_field_bf',
    'KNLK_24': 'pos_ex_1',
    'KNLK_3': 'neg_ex_1',
    'KNLK_35': 'neg_field_1',
    'KNLK_4': 'neg_field_2',
    'KNLK_42': 'pos_ex_3',
    'KNLK_57': 'pos_ex_2',
}
controls_names = pd.DataFrame({'sample_id': list(control_short_names.keys()),
                               'sample_name': list(control_short_names.values())})
tab = tab.merge(controls_names, on='sample_id', how='left')

# non-control rows have no short name after the left merge: use sample_code_partial instead
tab['sample_name'] = tab['sample_name'].fillna(tab['sample_code_partial'])

# what projects do we have?
set(tab.project_type)

{'DPWF', 'ECAWPC', 'Experimental', 'Full_Scale', 'Pipe_Loop'}

In [5]:
# copy the sample_name column to the system clipboard (interactive convenience; nothing is stored)
tab.sample_name.to_clipboard()

  obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)


In [6]:
# sanity check: display the row that received the control short name 'neg_field_bf'
tab[tab.sample_name=='neg_field_bf']

Unnamed: 0,sample_id,read_names,sample_16S,order,band_in_gel_amplicons,sample_code_partial,Sequal_prep_elute_pooled,Duplicate.,need_optimization.,Has_Other_pos_sample,...,Extraction_Date,Ext_num_unique,Microconcentrated.,DNA_Ext_conc_ngperuL,kit,path_prefix,assem,r1,r2,sample_name
11,KNLK_2,KNLK_2_S9,,9.0,-,EXP_0_field_biofilm,,False,N,N,...,,,,,,/groups/banfield/projects/human/drinkingwater/...,/groups/banfield/projects/human/drinkingwater/...,/groups/banfield/projects/human/drinkingwater/...,/groups/banfield/projects/human/drinkingwater/...,neg_field_bf


# Reads: trimming, QC, mash

## Preliminary FastQC

In [7]:
# did this on the awtp samples, not everything.  Will check everything after trimming rather than before.
# awtp =  tab[tab.project_type.isin(['DPWF', 'ECAWPC'])]

# with open(f'{wd}/read_fastqc.sh', 'w') as f:
#     for i in awtp.read_names:
#         f.write(f'/opt/bin/bio/FastQC/fastqc -t 2 {i}.1.fastq.gz {i}.2.fastq.gz\n')
# f.close()

Manual examination of the FastQC reports showed that forward reads have adapter contamination at the 3' end. Reverse reads have small amounts of low-quality sequence, as indicated by over-represented sequences that were strings of G's. Trimming dealt with both of these successfully.

A few samples look like they failed sequencing (high adapter contamination, weird per-base content and weird quality profiles).

## Trim bbmap, sickle, MASH

In [8]:
# Build two shell commands per sample and store them as columns of tab:
#   trim_cmd: adapter trimming (bbduk) -> quality trimming (sickle) -> cleanup -> FastQC -> compression
#   mash_cmd: mash sketch of the trimmed, compressed read pair
rawreads = '/groups/banfield/projects/human/drinkingwater/raw.d/KNKL'  # NOTE(review): 'KNKL' vs the 'KNLK' sample prefix - confirm directory name
trimmedreads = '/groups/banfield/projects/human/drinkingwater/raw.d/trimmed_reads'
trim_cmd = []
mash_cmd = []
for row in tab.itertuples():
    r = row.read_names
    s = row.sample_id

    # intermediate (adapter-trimmed) and final (quality-trimmed) file names
    noadapt1 = f'{rawreads}/{s}.1.noadapt.fastq.gz'
    noadapt2 = f'{rawreads}/{s}.2.noadapt.fastq.gz'
    pe1 = f'{trimmedreads}/{s}.PE.1.fastq'
    pe2 = f'{trimmedreads}/{s}.PE.2.fastq'
    singles = f'{trimmedreads}/{s}.SR.fastq'

    trimR = f'{bb} ref={adapters} ktrim=r ftm=5 -in1={rawreads}/{r}.1.fastq.gz -in2={rawreads}/{r}.2.fastq.gz -out1={noadapt1} -out2={noadapt2}'
    #filtphiX = f'{bb} -Xmx5g t=2 k=31 hdist=1 -in1={s}.1.noadapt.fastq.gz -in2={s}.2.noadapt.fastq.gz out1={s}.1.nophix.fastq.gz out2={s}.2.nophix.fastq.gz ref={phiX} stats={s}_stats_phix.txt'
    # based on testing, reads had no phiX, so this command isn't necessary

    # use the configured sickle executable, like the other tools, instead of whatever is on PATH
    qtrim = f'{sickle} pe -l 75 -f {noadapt1} -r {noadapt2} -t sanger -o {pe1} -p {pe2} -s {singles}'
    clean = f'rm {noadapt1} {noadapt2} {singles}'
    # FastQC gets the full path of the trimmed reads so the command does not depend on
    # the directory it is launched from (every other step already uses full paths)
    qc = f'{fastqc} -t 2 {pe1} {pe2}'
    gz1 = f'pigz -p 2 {pe1}'
    gz2 = f'pigz -p 2 {pe2}'
    trim_cmd.append('; '.join([trimR, qtrim, clean, qc, gz1, gz2]))

    # mash all v. all reads: sketch both mates of the trimmed reads together
    cmd2 = f'cat {row.r1} {row.r2} | {mash} sketch -m 2 -r - -I {s} -s 10000 -o {mash_dir}/{s}'
    mash_cmd.append(cmd2)

# add to tab; plain lists are assigned by position, so the result does not depend on tab's index labels
tab['trim_cmd'] = trim_cmd
tab['mash_cmd'] = mash_cmd

## mash dist commands
`mash paste` combines multiple sketches into a single sketch.  The first arg is output name, followed by a list of all the sketch files you want to combine

Command: `mash paste awtp2.msh *msh`

`mash dist` can sketch on the fly or take a sketch as input.  Because we are doing all-vs-all we use the same msh file as the query and reference.

Command: `mash dist awtp2.msh awtp2.msh`

# Assembly

## Megahit

In [9]:
# One sbatch-wrapped megahit job per sample: assemble the trimmed read pair into
# <assem_dir>/<sample_code_partial>.
megahit_cmd = [
    f'sbatch --wrap "{megahit} -t 48 -1 {row.r1} -2 {row.r2} -o {assem_dir}/{row.sample_code_partial}"'
    for row in tab.itertuples()
]
# assign the list directly (by position) so the column does not depend on tab's index labels
tab['megahit_cmd'] = megahit_cmd

## Assembly post-processing

In [10]:
# Per-sample post-processing of a megahit assembly, stored as one shell line in tab['postassem_cmd']:
# rename/simplify contig headers, filter to >=1000 bp, delete megahit leftovers,
# report contig stats and build the bowtie2 index.
awk = "awk '{print $1}'"

postassem_cmd = [] # create empty list
for row in tab.itertuples():
    s = row.sample_id
    assem_path = f'{assem_dir}/{row.sample_code_partial}'
    contigs = f'{assem_path}/{s}_contigs.fa'
    contigs_min1000 = f'{assem_path}/{s}_contigs_min1000.fa'

    # replace the fasta headers that start "k141" with "sample_id" and rename file to sample_id_contigs.fa, simplify headers to just the first field
    rehead = f"sed 's/k141/{s}/g' {assem_path}/final.contigs.fa | {awk} > {contigs}"

    # get contig stats
    cstats = f'contig_stats.pl -i {contigs}'

    # filter for only contigs ≥1000 bp
    min1000 = f'pullseq -i {contigs} --min 1000 > {contigs_min1000}'

    # delete extra files from assembly
    clean = f'rm -r {assem_path}/intermediate_contigs/ '\
            f'{assem_path}/checkpoints.txt '\
            f'{assem_path}/final.contigs.fa '\
            f'{assem_path}/done '\
            f'{assem_path}/options.json'

    # make directory to store bowtie2 indices in (-p: no error if it already exists on a re-run)
    mdbt2 = f'mkdir -p {assem_path}/bt2/'

    # index in prep for bowtie2 mapping
    ind = f'bowtie2-build {contigs_min1000} {assem_path}/bt2/{s}_contigs_min1000.fa'

    # The cleanup deletes final.contigs.fa, the only copy of the raw assembly, so it is chained
    # with && and only runs when both the renaming and the length filter succeeded.
    # The remaining steps are independent of the cleanup and stay ';'-separated.
    protected = ' && '.join([rehead, min1000, clean])
    all_cmd = '; '.join([protected, cstats, mdbt2, ind])
    postassem_cmd.append(all_cmd) # append command to list

# add as a column to the dataframe (assigned by position)
tab['postassem_cmd'] = postassem_cmd

## Anvi'o process contigs

In [11]:
## update to add this to tab rather than separate table
# kaiju installation and taxonomy database files
kaiju_path = '/groups/banfield/projects/human/drinkingwater/kaiju/bin'
kaiju_nodes = '/groups/banfield/projects/human/drinkingwater/kaiju/nodes.dmp'
kaiju_names = '/groups/banfield/projects/human/drinkingwater/kaiju/names.dmp'

kaiju_nr = '/groups/banfield/projects/human/drinkingwater/kaiju/kaiju_db_nr_euk.fmi'
kaiju = f'{kaiju_path}/kaiju -t {kaiju_nodes} -f {kaiju_nr}'
kaiju_addtaxnames = f'{kaiju_path}/kaiju-addTaxonNames -t {kaiju_nodes} -n {kaiju_names} -r superkingdom,phylum,order,class,family,genus,species'

# three command strings per sample, in the order they have to be run:
#   makecdb_cmd (local) -> analyzecdbCluster_cmd (sbatch) -> cdbAddTaxonomy_cmd (local)
rownames = ['sample_id', 'makecdb_cmd', 'analyzecdbCluster_cmd', 'cdbAddTaxonomy_cmd']
records = []
for row in tab.itertuples():
    prefix = row.path_prefix
    contigsDB = f'{prefix}contigs.db'
    gene_calls = f'{prefix}gene_calls.fa'
    kaiju_out = f'{prefix}kaiju.out'
    kaiju_processed = f'{prefix}genes_kaiju.txt'

    # local: create the contigs database, then export its gene calls
    local_init = '; '.join([
        f'anvi-gen-contigs-database -f {row.assem} -o {contigsDB} -n {row.sample_code_partial}',
        f'anvi-get-sequences-for-gene-calls -c {contigsDB} -o {gene_calls}',
    ])

    # cluster: HMMs, SCG taxonomy and kaiju on the exported gene calls, as one sbatch job
    #anvcogs = f'anvi-run-ncbi-cogs -c {contigsDB} -T 48'
    cluster_steps = '; '.join([
        f'anvi-run-hmms -c {contigsDB} -T 48',
        f'anvi-run-scg-taxonomy -c {contigsDB} -T 48',
        f'{kaiju} -i {gene_calls} -v -z 48 > {kaiju_out}',
    ])
    cluster_job = f'sbatch --wrap "{cluster_steps}"'

    # local, after the cluster job: name the kaiju taxa and import them into the contigs database
    add_taxonomy = '; '.join([
        f'{kaiju_addtaxnames} -i {kaiju_out} -o {kaiju_processed}',
        f'anvi-import-taxonomy-for-genes -i {kaiju_processed} -c {contigsDB} -p kaiju --just-do-it',
    ])

    records.append({
        'sample_id': row.sample_id,
        'makecdb_cmd': local_init,
        'analyzecdbCluster_cmd': cluster_job,
        'cdbAddTaxonomy_cmd': add_taxonomy,
    })

contigs_df = pd.DataFrame(records, columns=rownames)
tab = tab.merge(contigs_df, on='sample_id')

# Later annotations
# blast vs CARD # do this later and use USEARCH
# PFAM and TIGRFAM
#anvi-import-functions

In [12]:
# write files
# Selects which project's command columns get exported; the alternative value is kept in the
# trailing comment. The three to_csv exports below are currently commented out, so running
# this cell only sets `project`.
project = 'DPWF' # 'Full_Scale'
#tab[tab.project_type==project].makecdb_cmd.to_csv(f'{wf}/makecdb_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
#tab[tab.project_type==project].analyzecdbCluster_cmd.to_csv(f'{wf}/analyzecdbCluster_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
#tab[tab.project_type==project].cdbAddTaxonomy_cmd.to_csv(f'{wf}/cdbAddTaxonomy_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

## Contigs summary
Command: `anvi-display-contigs-stats --report-as-text -o contig_stats.txt *contigs.db`

# Mapping

## Generate decontamination mapping commands df

In [25]:
# within-project mapping
project = 'DPWF' # 'Full_Scale'

# assemblies to map against: samples of the chosen project
df = tab[tab.project_type==project]

# reads to map: samples of the chosen project AND the controls ('Experimental')
df_cont = tab[tab.project_type.isin([project, 'Experimental'])]

rownames = ['sample_id', 'project_type', 'sample_code', 'read_id', 'read_code', 'map_qcmd', 'filter_mapping', 'process_mapping', 'anvip_qcmd']
records = []
for srow in df.itertuples():
    s = srow.sample_id
    assem_name = srow.sample_name
    contigs_db = f'{srow.path_prefix}contigs.db'
    bt2ind = f'{assem_dir}/{assem_name}/bt2/{s}_contigs_min1000.fa'

    # one record per (assembly, read set) pair: other samples and controls
    for rrow in df_cont.itertuples():
        r = rrow.sample_id
        rcode = rrow.sample_name
        stem = f'{srow.assem}-vs-{r}'
        bam = f'{stem}.bam'
        filtered_bam_raw = f'{stem}.filtered.raw.bam'
        filtered_bam = f'{stem}.filtered.bam'

        # mapping directly to bam, submitted as a cluster job
        map_cmd = f'{bt2} -p 48 -x {bt2ind} -1 {rrow.r1} -2 {rrow.r2} --reorder | {ssam} -v | sambam > {bam}'
        map_qcmd = f'sbatch --wrap "{map_cmd}"'

        # filter mapping - for decontamination, we want to use filtered mappings
        filter_mapping = f'sbatch --wrap "reformat.sh in={bam} out={filtered_bam_raw} editfilter=2 threads=48"'

        # process mapping: sort, index, drop the unsorted filtered bam (single command line)
        process_mapping = '; '.join([
            f'samtools sort -m 5G {filtered_bam_raw} > {filtered_bam}',
            f'samtools index {filtered_bam}',
            f'rm {filtered_bam_raw}',
        ])

        # anvi-profile on the filtered mapping
        profile_out = f'{assem_dir}/{assem_name}/anvio_data/{r}_profile' # check this
        anvip_cmd = f'anvi-profile --min-contig-length 1000 --skip-SNV-profiling -T 48 -i {filtered_bam} -c {contigs_db} -o {profile_out} -S {rcode}'
        anvip_qcmd = f'sbatch --wrap "{anvip_cmd}"'

        ## append all commands to table
        values = [s, project, assem_name, r, rcode, map_qcmd, filter_mapping, process_mapping, anvip_qcmd]
        records.append(dict(zip(rownames, values)))

mapping_df = pd.DataFrame(records, columns=rownames)

## Get mapping commands for decontamination 

In [502]:
# Keep the mappings needed for decontamination: reads from negative/positive controls
# (excluding pos_ex_3 because 2 pos controls is enough, and excluding neg_RO_bf),
# plus each assembly's mapping of its own reads.
read_codes = mapping_df.read_code
is_control = read_codes.str.contains('neg|pos')
is_excluded = read_codes.isin(['pos_ex_3', 'neg_RO_bf'])
is_self = read_codes == mapping_df.sample_code # self mapping
decontam_mappings = mapping_df[(is_control & ~is_excluded) | is_self]
#decontam_mappings.map_qcmd.to_csv(f'{wf}/decontam_map_qcmd_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
#decontam_mappings.filter_mapping.to_csv(f'{wf}/decontam_filter_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
#decontam_mappings.process_mapping.to_csv(f'{wf}/decontam_process_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
#decontam_mappings.anvip_qcmd.to_csv(f'{wf}/decontam_anvip_qcmd_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

## Merge and find contaminants

In [41]:
# For every sample: merge its decontamination profiles, then run find_contams.py on the merged profile.
anvi_merge = []
find_contams = []
for row in tab.itertuples():
    prefix = row.path_prefix
    code = row.sample_code_partial
    contigsDB = f'{prefix}contigs.db'
    merged_dir = f'{prefix}merged_decontam_profiles'

    anvi_merge.append(
        f'anvi-merge {assem_dir}/{code}/anvio_data/*_profile/PROFILE.db -c {contigsDB} -o {merged_dir}'
    )
    find_contams.append(
        f'/home/rkantor/scripts/find_contams.py -p {merged_dir}/PROFILE.db -c {contigsDB} -o contam -n {code}'
    )

tab['anvi_merge_decontam'] = pd.Series(anvi_merge)
tab['find_contams'] = pd.Series(find_contams)

# write comands: one script per command type, restricted to the chosen project
project = 'Full_Scale'
in_project = tab.project_type == project
anvim = tab.loc[in_project, 'anvi_merge_decontam']
find_contams = tab.loc[in_project, 'find_contams']
csv_opts = dict(sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
anvim.to_csv(f'{wf}/decontam_anvim_{project}.sh', **csv_opts)
find_contams.to_csv(f'{wf}/decontam_find_contams_{project}.sh', **csv_opts)

In [37]:
# delete bam files (except self-mappings)
# Reads a listing of bam paths named '<...>/<sample>_contigs_min1000.fa-vs-<reads>.<suffix>'
# and writes a script that removes every bam (and its .bai) whose reads come from a
# different sample than the assembly.
bams_list = '/Users/rosekantor/work/awtp2/workflows/bams_list_DWDS.txt' #'/Users/rosekantor/work/awtp2/workflows/bams_list_AWTP2.txt'
listing = pd.read_csv(bams_list, names=['file'])

# left of the separator: path ending in the assembly's sample id; right: '<reads id>.<suffixes>'
halves = listing.file.str.split(pat='_contigs_min1000.fa-vs-', expand=True)
bams = pd.DataFrame({
    # take the LAST path component so the sample id is found whatever the depth of the
    # listed paths ('dir/KNLK_1', './dir/KNLK_1', absolute paths, ...); a fixed component
    # index would misread deeper paths and flag self-mappings for deletion
    'samp': halves[0].str.rsplit('/', n=1).str[-1],
    'reads': halves[1].str.split('.', expand=True)[0],
    'file_name': listing.file,
})

# the with-block closes the file; no explicit close() is needed
with open('/Users/rosekantor/work/awtp2/workflows/remove_bams_dwds.sh', 'w') as f:
    for row in bams.itertuples():
        if row.samp != row.reads:
            f.write(f'rm {row.file_name}\n')
            f.write(f'rm {row.file_name}.bai\n')

In [32]:
# split all contig dbs to visualize all contams removed and decide if it's okay
# Per sample: import the 'contam' collection into the merged decontam profile, then split it out.
collections_dir = '/groups/banfield/projects/human/drinkingwater/assembly/decontam_work/results'
cmds = []
for row in tab.itertuples():
    path_prefix = row.path_prefix
    contigsDB = f'{path_prefix}contigs.db'
    profileDB = f'{path_prefix}merged_decontam_profiles/PROFILE.db'
    collection = f'{collections_dir}/{row.sample_code_partial}_contam_collection.txt'

    anviimp = f'anvi-import-collection -c {contigsDB} -p {profileDB} -C contam {collection}'
    anvisplit = f'anvi-split -c {contigsDB} -p {profileDB} -C contam -o {path_prefix}merged_decontam_split'
    cmds.append(f'{anviimp}; {anvisplit}')

tab['split_contams'] = pd.Series(cmds)

# export the commands of one project
project = 'DPWF'
split_contams = tab.loc[tab.project_type == project, 'split_contams']
split_contams.to_csv(f'{wf}/decontam_split_contams_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

In [56]:
## Testing
## import the contams bin as a collection and then split it out as a new profile and cluster it to prep for anvi-interactive
# print(f'anvi-import-collection -c KNLK_58_contigs.db -p KNLK_58_merged_profiles_filtered/PROFILE.db -C contam_f contam_f_collection.txt')
## split out as new anvio db
# print(f'anvi-split -c KNLK_58_contigs.db -p KNLK_58_merged_profiles_filtered/PROFILE.db -C contam_f -o KNLK_58_contam_f/')
## add layers from the output of find_contams.py
# print(f'anvi-import-misc-data -p KNLK_58_contam_f/contam/PROFILE.db -t items contam_f_scores.txt')
## anvi-interactive to manually inspect - does it look different with filtered mapping?
# print('anvi-interactive -p KNLK_58_contam_f/contam/PROFILE.db -c KNLK_58_contam_f/contam/CONTIGS.db')

## Cross-mapping for binning

In [68]:
# within-project mapping
# Cross-mapping for binning: every assembly of the project gets the reads of every sample
# of the same project mapped to it, followed by sorting/indexing and anvi-profile.
project = 'DPWF' # 'Full_Scale'
df = tab[tab.project_type==project]

# reads to map: all rows of tab from the project of interest (controls are NOT included here)
df_cont = tab[tab.project_type.isin([project])]

xmapping_df = []
for srow in df.itertuples():
    s = srow.sample_id
    assem_name = srow.sample_name
    assem = srow.assem
    path_prefix = srow.path_prefix
    bt2ind = f'{assem_dir}/{assem_name}/bt2/{s}_contigs_min1000.fa'

    # vs all samples of the project
    for rrow in df_cont.itertuples():
        r = rrow.sample_id
        rcode = rrow.sample_name
        r1 = rrow.r1
        r2 = rrow.r2
        bam = f'{assem}-vs-{r}.bam'
        sorted_bam = f'{assem}-vs-{r}.sorted.bam'

        # mapping directly to bam
        map_cmd = f'{bt2} -p 48 -x {bt2ind} -1 {r1} -2 {r2} --reorder | {ssam} -v | sambam > {bam}'
        map_qcmd = f'sbatch --wrap "{map_cmd}"'

        # process mapping
        sort = f'samtools sort -m 5G {bam} > {sorted_bam}'
        index = f'samtools index {sorted_bam}'
        clean = f'rm {bam}'
        process_mapping = [sort, index, clean]
        process_mapping = '; '.join(process_mapping) # make into a single command line

        # anvi-profile
        profile_out = f'{assem_dir}/{assem_name}/anvio_data/{r}_profile' # check this
        anvip_cmd = f'anvi-profile --min-contig-length 1000 -T 48 -i {sorted_bam} -c {path_prefix}contigs.db -o {profile_out} -S {rcode}'
        anvip_qcmd = f'sbatch --wrap "{anvip_cmd}"'

        ## append all commands to table
        # Binning mappings are not filtered, so this cell builds no filter command. The
        # 'filter_mapping' column is kept (as None) to preserve the table layout; previously the
        # value came from a variable left behind by another cell (NameError on a fresh kernel,
        # an unrelated stale command otherwise).
        all_cmds = [s, project, assem_name, r, rcode, map_qcmd, None, process_mapping, anvip_qcmd]
        xmapping_df.append(all_cmds)

rownames = ['sample_id', 'project_type', 'sample_code', 'read_id', 'read_code', 'map_qcmd', 'filter_mapping', 'process_mapping', 'anvip_qcmd']
xmapping_df = pd.DataFrame.from_records(xmapping_df, columns=rownames)

In [70]:
# group by sample_code, save individually for ease of running jobs on cluster
# (three scripts per assembly: m1 = mapping jobs, m2 = sort/index, m3 = anvi-profile jobs)
for location, group in xmapping_df.groupby('sample_code', as_index=False):
    # self-mapping is already complete for DPWF and DWDS, so only the mapping step skips it
    cross_only = group[group.read_code != group.sample_code]
    scripts = [
        ('m1.map_qcmd', cross_only.map_qcmd),
        ('m2.process', group.process_mapping),
        ('m3.anvip_qcmd', group.anvip_qcmd),
    ]
    for stem, commands in scripts:
        commands.to_csv(f'{wf}/{stem}_{project}_{location}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

In [71]:
# one anvi-merge command per sample, combining all of its per-read-set profiles into <path_prefix>merged
anvi_merge = [
    f'anvi-merge {assem_dir}/{row.sample_code_partial}/anvio_data/*_profile/PROFILE.db '
    f'-c {row.path_prefix}contigs.db -o {row.path_prefix}merged'
    for row in tab.itertuples()
]
tab['anvi_merge_binning'] = pd.Series(anvi_merge)

# write comands for the chosen project
project = 'DPWF'
anvim = tab.loc[tab.project_type == project, 'anvi_merge_binning']
anvim.to_csv(f'{wf}/anvim_{project}.sh',sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

# Binning

In [77]:
# CONCOCT test run on a single assembly (KNLK_58): build the anvi'o export command and the
# concoct command, and print the latter.
s = 'KNLK_58'
# Kept in its own variable: reusing the name `assem_dir` here would silently redirect every
# other cell that builds paths from the shared assembly root.
# The trailing slash is retained so the printed command is unchanged ('//' in a path is harmless).
binning_dir = '/groups/banfield/projects/human/drinkingwater/assembly/AWTP_2_RO2_bulk_3/'
profileDB = '/groups/banfield/projects/human/drinkingwater/assembly/AWTP_2_RO2_bulk_3/KNLK_58_merged/PROFILE.db'
contigsDB = '/groups/banfield/projects/human/drinkingwater/assembly/AWTP_2_RO2_bulk_3/KNLK_58_contigs.db'
# export command whose -o/-O arguments match the <s>-COVs.txt and <s>-SPLITS.fa paths read by
# concoct below; built for reference, not printed
get_concoct_input = f'anvi-export-splits-and-coverages -p {profileDB} -c {contigsDB} -o {binning_dir} -O {s} --splits-mode'
run_concoct = f'concoct --coverage_file {binning_dir}/{s}-COVs.txt --composition_file {binning_dir}/{s}-SPLITS.fa -b {binning_dir}/{s}_concoct -r 150 -c 24 -t 10'
print(run_concoct)

## process the output with 1-liner to make it importable into anvio
# ruby -F',' -lane 'puts $F.join("\tbin_")' KNLK_58_concoct_clustering_gt1000.csv > KNLK_58_concoct_clustering_gt1000.tsv
# output collection file: KNLK_58_concoct_clustering_gt1000.csv

concoct --coverage_file /groups/banfield/projects/human/drinkingwater/assembly/AWTP_2_RO2_bulk_3//KNLK_58-COVs.txt --composition_file /groups/banfield/projects/human/drinkingwater/assembly/AWTP_2_RO2_bulk_3//KNLK_58-SPLITS.fa -b /groups/banfield/projects/human/drinkingwater/assembly/AWTP_2_RO2_bulk_3//KNLK_58_concoct -r 150 -c 24 -t 10


In [73]:
# list the columns currently present in tab (metadata plus the derived path/command columns)
tab.columns

Index(['sample_id', 'read_names', 'sample_16S', 'order',
       'band_in_gel_amplicons', 'sample_code_partial',
       'Sequal_prep_elute_pooled', 'Duplicate.', 'need_optimization.',
       'Has_Other_pos_sample', 'Pool_for_16S', 'F_primer', 'R_primer',
       'F_R_primer', 'PCR_rxn_DNA_Total_ng', 'template_dilution_factor',
       'DNA_Ext_conc_ng.uL', 'column', 'row', 'Genomic_DNA_size_1_bp',
       'Genomic_DNA_RLU_1', 'Genomic_DNA_size_2_bp', 'Genomic_DNA_RLU_2',
       'Genomic_DNA_2_plus_peaks', 'Genomic_DNA_3_plus_peaks', 'Notes_prep',
       'count_raw_reads', 'barcode_forward', 'barcode_reverse', 'plate',
       'well', 'metagenomic_sample', 'sample_code_full_and_batch',
       'sample_date', 'batch_sample_date', 'project_type', 'sample_or_control',
       'location_code', 'sample_type', 'sampler_name', 'Ext_batch',
       'Extraction_Date', 'Ext_num_unique', 'Microconcentrated.',
       'DNA_Ext_conc_ngperuL', 'kit', 'path_prefix', 'assem', 'r1', 'r2',
       'sample_name', '

In [None]:
# NOT DONE
# remove contams with anvi-split (figure out how to invert it first)
# split all contig dbs to visualize all contams removed and decide if it's okay
# NOTE(review): this draft writes to the same tab column ('split_contams') and the same script
# file as the finished split-contams cell, so running it replaces that cell's output.
cmds = []
for row in tab.itertuples():
    assem_name = row.sample_code_partial
    path_prefix = row.path_prefix
    contigsDB = f'{path_prefix}contigs.db'
    profileDB = f'{path_prefix}merged_decontam_profiles/PROFILE.db'
    collection = f'/groups/banfield/projects/human/drinkingwater/assembly/decontam_work/results/{assem_name}_contam_collection.txt'
    new_anvi_data = f'{path_prefix}merged_decontam_split'
    anviimp = f'anvi-import-collection -c {contigsDB} -p {profileDB} -C contam {collection}'

    # FIGURE OUT HOW TO INVERT THE COLLECTION, SPLIT THE UNBINNED
    #anvisplit = f'anvi-split -c {contigsDB} -p {profileDB} --skip-hierarchical-clustering -C contam -o {new_anvi_data}'

    # Only the import step exists so far. The split command above is still commented out, so it
    # must not be joined in: doing so either raised NameError or reused the command string left
    # over from the last sample of another cell.
    cmds.append(anviimp)

# assigned by position
tab['split_contams'] = cmds

project = 'DPWF'
split_contams = tab[tab.project_type==project].split_contams
split_contams.to_csv(f'{wf}/decontam_split_contams_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)


# Summarize metagenomes

In [61]:
# DONE - both proj # trimmed reads: seqkit stats *fastq.gz -T > trimmed_reads_stats.txt
# DONE - both proj # megahit assembly, log file: grep 'contigs, total' */log > analysis_assemblies/assembly_stats.megahit.042720.txt
# DONE - both proj # read mapping: samtools view -c -F 260
# DONE - both proj # decontam data: compiled reports from find_contams.py
# assembly min1000 after decontam: anvio
# read-mapping after decontam: maybe there's a fancy samtools view -c that lets you exclude a list of contigs 
                            # (or use anvio after anvi-split- check that they are the same?)

# % reads mapping to min1000 after decontam (# reads mapped to non contam contigs) / (total reads - reads mapped to contam contigs)

#summarize commands:
#anvi-display-contigs-stats --report-as-text -o contig_stats.txt *contigs.db
# Count reads mapped in each sample's self-mapping bam (samtools view -c -F 260) and append
# the counts to one per-project text file.
project = 'Full_Scale'
counts_file = f'/groups/banfield/projects/human/drinkingwater/assembly/analysis_assemblies/reads_mapped_{project}.txt'
count_mapped = [
    f'samtools view -c -F 260 {row.assem}-vs-{row.sample_id}.bam >> {counts_file}'
    for row in tab.itertuples()
]
tab['count_mapped'] = count_mapped

# export only the commands of the chosen project
project_cmds = tab.loc[tab.project_type == project, 'count_mapped']
project_cmds.to_csv(f'{wf}/count_mapped_{project}.sh', sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

# Resfam HMMsearches

In [None]:
# Shell loop (not Python): Resfams search for a fixed list of awtp1 samples.
# For each sample $i, three files are produced in the current directory:
#   resfams.vs.$i.out          hmmsearch table output
#   resfams.vs.$i.renamed.out  output of rename.py
#   resfams.vs.$i.binned.out   output of add_bins.py
for i in sample_1 sample_9 sample_33 sample_37 sample_38 sample_39 sample_65 sample_66 sample_67 sample_185 sample_187 sample_193
do

# search the sample's prokka proteins against the Resfams HMMs, using the models' gathering cutoffs (--cut_ga)
hmmsearch --tblout resfams.vs.$i.out --cut_ga --cpu 4 ~/ref_databases/resfams/Resfams.hmm /data4/other/awtp1/assembly/$i/idba_ud/prokka_annotation/prokka.faa
# presumably translates prokka ids to gene ids using the mapping file given with -d - confirm against rename.py
rename.py -f resfams.vs.$i.out -d /data4/other/awtp1/assembly/$i/idba_ud/prokka_annotation/$i.prokka_ids-to-geneids.txt > resfams.vs.$i.renamed.out
# presumably annotates each hit with its bin from all_scaf2bin.txt - confirm against add_bins.py
add_bins.py -s all_scaf2bin.txt -t resfams.vs.$i.renamed.out -g gene > resfams.vs.$i.binned.out

done

In [92]:
# just running this for DPWF for now
# Writes one hmmsearch command per DPWF sample: Resfams HMMs vs the sample's predicted proteins.
hmmsearch = '/usr/bin/hmmsearch'
assem_dir = '/data2/other/knelson/assembly'
resfams = '/home/rkantor/ref_databases/functional_genes_hmms/resfams/Resfams.hmm'
with open(f'{wd}/workflows/resfams_DPWF.sh', 'w') as f:
    for row in tab[tab.project_type=='DPWF'].itertuples():
        sample_dir = f'{assem_dir}/{row.sample_code_partial}'
        hits_table = f'{sample_dir}/resfams.vs.{row.sample_id}.out'
        proteins = f'{sample_dir}/{row.sample_id}_contigs_min1000.fa.genes.faa'
        # print appends the newline that terminates each command
        print(f'{hmmsearch} --tblout {hits_table} --cut_ga --cpu 4 {resfams} {proteins}', file=f)

In [101]:
#collecting resfams hits and running uclust on them to identify identical ones:
# Writes one pipeline per DPWF sample that pulls the protein sequences of all Resfams hits.
assem_dir = '/data2/other/knelson/assembly'
first_field = "awk '{print $1}'"
with open(f'{wd}/workflows/resfams_collect_DPWF.sh', 'w') as f:
    for row in tab[tab.project_type=='DPWF'].itertuples():
        sample_dir = f'{assem_dir}/{row.sample_code_partial}'
        proteins = f'{sample_dir}/{row.sample_id}_contigs_min1000.fa.genes.faa'
        resout = f'{sample_dir}/resfams.vs.{row.sample_id}.out'
        hits_faa = f'{sample_dir}/{row.sample_id}.resfams_hits.faa'

        # drop the comment lines of the hmmsearch table, keep the first column and
        # feed those names to pullseq
        collect_hits = f"grep -v '^#' {resout} | {first_field} | pullseq -N -i {proteins} > {hits_faa}"
        print(collect_hits, file=f)

In [None]:
# cat AWTP_2_*/*resfam*faa > resfam_awtp2_hits_all.faa
# usearch -sortbylength resfam_awtp2_hits_all.faa -fastaout resfam_awtp2_hits_all.sorted.faa
# usearch -cluster_fast resfam_awtp2_hits_all.sorted.faa -id 0.99 -uc resfam_awtp2_hits_all.uc -threads 2

In [88]:
# display all rows of tab that belong to the DPWF project
tab[tab.project_type=='DPWF']

Unnamed: 0,sample_id,read_names,sample_16S,order,band_in_gel_amplicons,sample_code_partial,Sequal_prep_elute_pooled,Duplicate.,need_optimization.,Has_Other_pos_sample,...,sample_or_control,location_code,sample_type,sampler_name,Ext_batch,Extraction_Date,Ext_num_unique,Microconcentrated.,DNA_Ext_conc_ngperuL,kit
2,KNLK_11,KNLK_11_S81,KNLK_SD028,81.0,+,AWTP_2_BAC_bulk_5,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,2,6/16/2018,15,N,169.0,PowerSoil Pro
4,KNLK_13,KNLK_13_S10,KNLK_SD177,,+,AWTP_2_RO2_biofilm_sep_2,N,False,N,,...,sample,RO_BF_separator,biofilm,Rose_Kantor,19,9/27/2018,A69,N,0.071,
7,KNLK_16,KNLK_16_S34,KNLK_SD132,,+,AWTP_2_MF_comb_bulk_1,N,False,N,,...,sample,MF_combined,DEUF,Scott_Miller,18,9/8/2018,171,Y,1.07,PowerSoil Pro
15,KNLK_23,KNLK_23_S3,KNLK_SD179,,+,AWTP_2_RO2_bulk_5,N,False,N,,...,sample,RO_2stage,DEUF,Scott_Miller,RO1,9/13/2018,A53,Y,0.399,Powersoil Pro
20,KNLK_28,KNLK_28_S43,KNLK_SD009,43.0,+,AWTP_2_MF_comb_bulk_2,N,False,N,,...,sample,MF_combined,DEUF,Scott_Miller,18,9/8/2018,172,Y,1.91,PowerSoil Pro
25,KNLK_32,KNLK_32_S75,KNLK_SD023,75.0,+,AWTP_2_BAC_bulk_3,N,False,N,,...,sample,BAC,DEUF,Scott_Miller,Test_March11,3/11/2018,W5,supernatant,27.0,
26,KNLK_33,KNLK_33_S83,KNLK_SD030,83.0,+,AWTP_2_inf_bulk_4,N,False,N,,...,sample,WW_3ary,DEUF,Scott_Miller,1,6/15/2018,4,N,518.0,PowerSoil Pro
31,KNLK_38,KNLK_38_S36,KNLK_SD006,36.0,+,AWTP_2_RO2_biofilm_sep_1,N,False,N,,...,sample,RO_BF_separator,biofilm,Rose_Kantor,RO1,9/13/2018,A57,Y,1.17,Powersoil Pro
37,KNLK_43,KNLK_43_S76,KNLK_SD024,76.0,+,AWTP_2_RO2_biofilm_ret_scrape_2,N,False,N,,...,sample,RO_BF_retentate_scrapings,biofilm,Rose_Kantor,19,9/27/2018,199,N,30.5,PowerSoil Pro
38,KNLK_44,KNLK_44_S84,KNLK_SD031,84.0,+,AWTP_2_inf_bulk_5,N,False,N,,...,sample,WW_3ary,DEUF,Scott_Miller,1,6/15/2018,5,N,542.0,PowerSoil Pro


# BLAST of ARGs

In [102]:
# Writes one usearch -ublast command per DPWF sample: predicted proteins vs the sul1 reference.
ref = '~/ref_databases/args/sul1_model_from_card.fasta'

with open(f'{wd}/workflows/args_blast_DPWF.sh', 'w') as f:
    for row in tab[tab.project_type=='DPWF'].itertuples():
        proteins = f'{assem_dir}/{row.sample_code_partial}/{row.sample_id}_contigs_min1000.fa.genes.faa'
        # results are written next to the query file, with a '-vs-sul1.out' suffix
        blastout = f'{proteins}-vs-sul1.out'
        print(f'usearch -ublast {proteins} -db {ref} -evalue 1e-5 -blast6out {blastout} -threads 2', file=f)
