# Binning contigs into MAGs for follow-up simulations

Samuel Barnett

### Introduction

Now that I have contigs from the follow-up simulations, I want to bin them into metagenome-assembled genomes (MAGs). I'll use MetaBAT 2 (`metabat2`) for this.

## 1) Initialization

First I need to import the Python modules I'll use, set some variables, and move into the working directory (the cell below only checks that the directory exists; it does not create it).

In [2]:
import os
# Root directory of the follow-up simulations; every path used below is built from it.
workDir = '/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims'

# Thread count passed to the external tools (bbmap.sh `threads=`, metabat2 `-t`).
nprocs = 20

In [3]:
# Warn if the working directory is missing. This only prints a message;
# the %cd below is still attempted either way.
if not os.path.isdir(workDir):
    print("Working directory does not exist!!!")
# IPython magic: make workDir the notebook's current directory.
%cd $workDir

/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims


## 2) Mapping reads to contigs

Before binning, I need the coverage of each contig in each read set within the group defined by the co-assembly. Here, each group is defined by the factor being tested, so the reads of every library in a group are mapped back to that group's co-assembled contigs.

In [None]:
binDir = os.path.join(workDir, 'binning')
if not os.path.exists(binDir):
    os.makedirs(binDir)
    
for followup_set in ['incorp25_lowGC', 'incorp100_lowGC', 
                     'lightwindow_highGC', 'mediumwindow_highGC', 'heavywindow_highGC']:
    fastqDir = os.path.join(workDir, followup_set)

    for exp_type in ['SIP', 'nonSIP']:
        subbinDir = '_'.join([followup_set, exp_type])
        subbinDir = os.path.join(binDir, subbinDir)
        if not os.path.exists(subbinDir):
            os.makedirs(subbinDir)
        %cd $subbinDir

        contigFile = '_'.join([followup_set, exp_type])
        contigFile = os.path.join(workDir, 'coassembly', contigFile, 'final.contigs.fa')

        for lib in [1, 2, 3, 4, 5, 6]:
            print(' '.join(['Mapping reads for', followup_set, 
                            exp_type, 'experiment library', str(lib)]))
            if exp_type == 'SIP':
                F_fastq = [f for f in os.listdir(fastqDir) if '_'.join(['library', str(lib), 'window']) in f if 'f.fastq.gz' in f][0]
                R_fastq = [f for f in os.listdir(fastqDir) if '_'.join(['library', str(lib), 'window']) in f if 'r.fastq.gz' in f][0]

            elif exp_type == 'nonSIP':
                F_fastq = '_'.join(['nonSIP_library', str(lib), 'reads_f.fastq.gz'])
                R_fastq = '_'.join(['nonSIP_library', str(lib), 'reads_r.fastq.gz'])
            else:
                print("Error with selecting files")
            F_fastq = os.path.join(fastqDir, F_fastq)
            R_fastq = os.path.join(fastqDir, R_fastq)

            if not os.path.isfile(F_fastq):
                print(' '.join([F_fastq, 'does not exist']))
            if not os.path.isfile(R_fastq):
                print(' '.join([R_fastq, 'does not exist']))

            # Reorder reads so that they are paired
            print('Reordering reads')
            cmd = ''.join(['repair.sh in1=', F_fastq, ' in2=', R_fastq, 
                           ' out1=repaired1.fastq.gz out2=repaired2.fastq.gz outs=repairedS.fastq.gz'])
            print(cmd)
            os.system(cmd)

            # Map reads
            print('Mapping reads')
            samname = ''.join(['library_', str(lib), '.sam'])
            cmd = ''.join(['bbmap.sh ref=', contigFile,
                           ' in=repaired1.fastq.gz in2=repaired2.fastq.gz', 
                           ' out=', samname, ' threads=', str(nprocs), 
                           ' bamscript=bs.sh; sh bs.sh'])
            print(cmd)
            os.system(cmd)

            # Remove "repaired" reads
            print('Removing reordered reads')
            cmd = 'rm repaired1.fastq.gz repaired2.fastq.gz repairedS.fastq.gz'
            print(cmd)
            os.system(cmd)

            print('\n')

        print('\n')



/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/binning/incorp25_lowGC_SIP
Mapping reads for incorp25_lowGC SIP experiment library 1
Reordering reads
repair.sh in1=/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/incorp25_lowGC/library_1_window_1.72-1.77_reads_f.fastq.gz in2=/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/incorp25_lowGC/library_1_window_1.72-1.77_reads_r.fastq.gz out1=repaired1.fastq.gz out2=repaired2.fastq.gz outs=repairedS.fastq.gz
Mapping reads
bbmap.sh ref=/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/coassembly/incorp25_lowGC_SIP/final.contigs.fa in=repaired1.fastq.gz in2=repaired2.fastq.gz out=library_1.sam threads=20 bamscript=bs.sh; sh bs.sh
Removing reordered reads
rm repaired1.fastq.gz repaired2.fastq.gz repairedS.fastq.gz


Mapping reads for incorp25_lowGC SIP experiment library 2
Reordering reads
repair.sh in1=/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/incorp25_lowGC/library_2_window_1.72-1.77_rea

In [7]:
# Completion marker: prints "Done" when this cell runs (presumably executed after the mapping cell above has finished).
print("Done")

Done


## 3) Binning with MetaBAT

Now I'll do the binning within each group. Again, each group is defined by the factor being tested.

In [None]:
exp_dict = {'SIP': 'window', 'nonSIP': 'nonSIP'}

binDir = os.path.join(workDir, 'binning')
#if not os.path.exists(binDir):
#    os.makedirs(binDir)
    
for followup_set in ['incorp25_lowGC', 'incorp100_lowGC', 
                     'lightwindow_highGC', 'mediumwindow_highGC', 'heavywindow_highGC']:
    for exp_type in ['SIP', 'nonSIP']:
        subbinDir = '_'.join([followup_set, exp_type])
        subbinDir = os.path.join(binDir, subbinDir)
        if not os.path.exists(subbinDir):
            os.makedirs(subbinDir)
        %cd $subbinDir

        contigFile = '_'.join([followup_set, exp_type])
        contigFile = os.path.join(workDir, 'coassembly', contigFile, 'final.contigs.fa')

        print('Summarizing contig depths')
        cmd = 'jgi_summarize_bam_contig_depths --outputDepth depth.txt --minContigLength 1000 --minContigDepth 2 library_1_sorted.bam library_2_sorted.bam library_3_sorted.bam library_4_sorted.bam library_5_sorted.bam library_6_sorted.bam'
        print(cmd)
        os.system(cmd)

        print('Binning')
        cmd = ' '.join(['metabat2 -i', contigFile, 
                        '-a depth.txt', 
                        '-o init_bins/bin',
                        '--saveCls',
                        '--unbinned',
                        '-t', str(nprocs)])
        print(cmd)
        os.system(cmd)

        print('---\n')

/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/binning/incorp25_lowGC_SIP
Summarizing contig depths
jgi_summarize_bam_contig_depths --outputDepth depth.txt --minContigLength 1000 --minContigDepth 2 library_1_sorted.bam library_2_sorted.bam library_3_sorted.bam library_4_sorted.bam library_5_sorted.bam library_6_sorted.bam
Binning
metabat2 -i /home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/coassembly/incorp25_lowGC_SIP/final.contigs.fa -a depth.txt -o init_bins/bin --saveCls --unbinned -t 20
---

/home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/binning/incorp25_lowGC_nonSIP
Summarizing contig depths
jgi_summarize_bam_contig_depths --outputDepth depth.txt --minContigLength 1000 --minContigDepth 2 library_1_sorted.bam library_2_sorted.bam library_3_sorted.bam library_4_sorted.bam library_5_sorted.bam library_6_sorted.bam
Binning
metabat2 -i /home/sam/data/SIPSim2_data/RealWorld_study3/followup_sims/coassembly/incorp25_lowGC_nonSIP/final.contigs.fa -a depth

In [11]:
# Completion marker: prints "Done" when this cell runs (presumably executed after the binning cell above has finished).
print('Done')

Done
