# Mapping simulated reads to reference genomes

Samuel Barnett

### Introduction

Here I'll map reads from metagenomic-SIP and shotgun metagenomic simulations to the reference genomes that were used to generate them.



## 1) Initialization

First I need to import the Python modules I'll use, set some variables, and create and move into the working directory.

In [1]:
import os
# --- Input locations ---
# Root directory of the simulation study; the genome index files and the
# simulated read libraries are looked up underneath it.
mainDir = '/home/sam/data/SIPSim2_data/RealWorld_study3/'
# Directory holding the reference genome files.
genomeDir = '/home/sam/databases/ncbi_genomes/ncbi-genomes-2019-01-25/'

# --- Output location ---
# Working directory for everything produced by this notebook.
mappingDir = os.path.join(mainDir, 'read_mapping')

# --- Resources ---
# Processor budget for the external tools.
# NOTE(review): confirm which cells actually consume this value.
nprocs = 15

In [2]:
import sys
import pandas as pd
import numpy as np
import re

In [3]:
# making directories
## working directory
if not os.path.isdir(mappingDir):
    os.makedirs(mappingDir)
%cd $mappingDir

## genome directory
if not os.path.isdir(genomeDir):
    print("Genome directory does not exist!!!")
else:
    print(genomeDir)

/home/sam/data/SIPSim2_data/RealWorld_study3/read_mapping
/home/sam/databases/ncbi_genomes/ncbi-genomes-2019-01-25/


## 2) Generating master genome multifasta files

For this to work I need all reference genomes from each set in a single multi-fasta file. I'll then map reads to this master file.

In [4]:
# Short labels used when naming outputs for each genome set, read depth and
# experiment type. Only genset_dict is consumed in this cell; the others are
# kept so that any later cell relying on them still finds them defined.
genset_dict = {'low_GC_skew': 'lowGC', 
               'medium_GC': 'medGC', 
               'high_GC_skew': 'highGC'}
depth_dict = {'depth5MM': '5MM', 
              'depth10MM': '10MM'}
exp_dict = {'SIP': 'window', 'nonSIP': 'nonSIP'}

cmd_dict = {}

# Number of bytes read per step while streaming a genome file into the master
# file, so that whole genomes are never held in memory at once.
copy_chunk_bytes = 1024 * 1024

for genome_set in ['low_GC_skew', 'medium_GC', 'high_GC_skew']:
    # The index table is read once; its second column ('file') holds the
    # genome file names, which are resolved against genomeDir.
    index_file = os.path.join(mainDir, '_'.join([genome_set, 'genome_index.txt']))
    genome_files = list(pd.read_table(index_file, names = ['genome', 'file'])['file'])
    ref_paths = [os.path.join(genomeDir, x) for x in genome_files]

    cat_fasta = os.path.join(mappingDir, '_'.join([genset_dict[genome_set], 'genomes.fasta']))
    print(' '.join(['Concatinating', 
                    str(len(ref_paths)), 
                    'genomes for', genome_set, 'into', cat_fasta]))

    # Byte-for-byte concatenation (same result as `cat a b ... > out`), done in
    # Python so that no shell command line has to carry every path and so that
    # a missing or unreadable genome file raises instead of silently leaving an
    # incomplete master file behind.
    with open(cat_fasta, 'wb') as master_handle:
        for ref_path in ref_paths:
            with open(ref_path, 'rb') as ref_handle:
                while True:
                    chunk = ref_handle.read(copy_chunk_bytes)
                    if not chunk:
                        break
                    master_handle.write(chunk)

Concatinating 500 genomes for low_GC_skew into /home/sam/data/SIPSim2_data/RealWorld_study3/read_mapping/lowGC_genomes.fasta
Concatinating 500 genomes for medium_GC into /home/sam/data/SIPSim2_data/RealWorld_study3/read_mapping/medGC_genomes.fasta
Concatinating 500 genomes for high_GC_skew into /home/sam/data/SIPSim2_data/RealWorld_study3/read_mapping/highGC_genomes.fasta


## 3) Mapping reads with bbmap

I want coverage stats for each reference genome in each read library generated from that set.

In [None]:
genset_dict = {'low_GC_skew': 'lowGC', 
               'medium_GC': 'medGC', 
               'high_GC_skew': 'highGC'}
depth_dict = {'depth5MM': '5MM', 
              'depth10MM': '10MM'}
exp_dict = {'SIP': 'window', 'nonSIP': 'nonSIP'}
    
for genome_set in ['low_GC_skew', 'medium_GC', 'high_GC_skew']:
    
    cat_fasta = '_'.join([genset_dict[genome_set], 'genomes.fasta'])
    cat_fasta = os.path.join(mappingDir, cat_fasta)
    
    for depth in ['depth5MM', 'depth10MM']:
        fastqDir = os.path.join(mainDir, genome_set, depth)

        for exp_type in ['SIP', 'nonSIP']:
            submappingDir = '_'.join([genset_dict[genome_set], depth_dict[depth], exp_type])
            submappingDir = os.path.join(mappingDir, submappingDir)
            if not os.path.exists(submappingDir):
                os.makedirs(submappingDir)
            %cd $submappingDir

            for lib in [1, 2, 3, 4, 5, 6]:
                print(' '.join(['Mapping reads for', genome_set, 
                                exp_type, 'experiment', 
                                depth, 'read depth library', str(lib)]))
                if exp_type == 'SIP':
                    F_fastq = '_'.join(['library', str(lib), 'window_1.72-1.77_reads_f.fastq.gz'])
                    R_fastq = '_'.join(['library', str(lib), 'window_1.72-1.77_reads_r.fastq.gz'])
                elif exp_type == 'nonSIP':
                    F_fastq = '_'.join(['nonSIP_library', str(lib), 'reads_f.fastq.gz'])
                    R_fastq = '_'.join(['nonSIP_library', str(lib), 'reads_r.fastq.gz'])
                else:
                    print("Error with selecting files")
                F_fastq = os.path.join(fastqDir, F_fastq)
                R_fastq = os.path.join(fastqDir, R_fastq)
                    
                if not os.path.isfile(F_fastq):
                    print(' '.join([F_fastq, 'does not exist']))
                if not os.path.isfile(R_fastq):
                    print(' '.join([R_fastq, 'does not exist']))
                
                # Reorder reads so that they are paired
                print('Reordering reads')
                cmd = ''.join(['repair.sh in1=', F_fastq, ' in2=', R_fastq, 
                               ' out1=repaired1.fastq.gz out2=repaired2.fastq.gz outs=repairedS.fastq.gz -Xmx20g'])
                os.system(cmd)
                #print(cmd)
                
                # Map reads
                mapStats = '_'.join(['lib', str(lib), 'mapping_stats.txt'])
                mapHist = '_'.join(['lib', str(lib), 'mapping_hist.txt'])
                binCov = '_'.join(['lib', str(lib), 'binned_coverage.txt'])

                print('Mapping reads')         
                cmd = ''.join(['bbmap.sh ',
                               'in1=repaired1.fastq.gz ',
                               'in2=repaired2.fastq.gz ',
                               'ref=', cat_fasta, 
                               ' t=15 nodisk ',
                               'covstats=', mapStats,
                               ' covhist=', mapHist,
                               ' bincov=', binCov, 
                               ' -Xmx20g'])
                os.system(cmd)
                #print(cmd)
                
                # Remove "repaired" reads
                print('Removing reordered reads')
                cmd = 'rm repaired1.fastq.gz repaired2.fastq.gz repairedS.fastq.gz'
                os.system(cmd)
                #print(cmd)
                
                print('\n')

            print('\n')



/home/sam/data/SIPSim2_data/RealWorld_study3/read_mapping/lowGC_5MM_SIP
Mapping reads for low_GC_skew SIP experiment depth5MM read depth library 1
Reordering reads
Mapping reads


In [6]:
# Completion marker: printed once every cell above has finished running.
print("Done!")

Done!
