In [1]:
import pandas as pd
import os,sys,subprocess
from os.path import join

# Setup

In [2]:
# Base directory used to build the example paths in later cells. It is taken from
# the current working directory, so the notebook must be started from that folder.
ROOT_DIR = os.getcwd() # Location of github folder

In [3]:
# sys.path entries must be directories: register the folder that holds
# align_reads.py (not the path of the file itself) so the import can find it.
# The membership test keeps re-running this cell from adding duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)
from align_reads import align_reads

IndentationError: unexpected indent (align_reads.py, line 204)

### Choose bowtie or bowtie2 as your aligner:

In [4]:
# Name of the aligner executable used by the rest of the notebook.
ALIGNER = 'bowtie' # 'bowtie2'
try:
    # Asking for the version doubles as a check that the executable can be launched.
    # universal_newlines=True returns text rather than bytes on Python 3.
    print(subprocess.check_output([ALIGNER, '--version'], universal_newlines=True))
except (OSError, subprocess.CalledProcessError):
    # OSError: the executable could not be started (e.g. not on PATH).
    # CalledProcessError: it started but exited with a non-zero status.
    # Anything else (including KeyboardInterrupt) is deliberately not swallowed.
    print('Aligner not installed correctly.')

bowtie version 1.1.2
64-bit
Built on localhost.localdomain
Tue Jun 23 13:28:18 EDT 2015
Compiler: gcc version 4.1.2 20080704 (Red Hat 4.1.2-54)
Options: -O3 -m64  -Wl,--hash-style=both -DPOPCNT_CAPABILITY  
Sizeof {int, long, long long, void*, size_t, off_t}: {4, 8, 8, 8, 8, 8}



### Build bowtie index and format GFF file (optional)
This only needs to be done once per organism. Make sure you get your GenBank file and FASTA file from the same source.

`build_index` creates the bowtie index for alignment using the following:
* `sequence`: Organism sequence as FASTA file
* `bt_index`: Location and basename of index
* `aligner` (optional): `'bowtie'` or `'bowtie2'` (default `'bowtie'`)
    
`gb2gff` creates a GFF file for downstream processing using the following:
* `sequence`: Organism sequence as a FASTA file
* `genbank`: Full genbank file for your organism

`gb2gff` also requires the following packages: [Biopython](http://biopython.org/) and [BCBio](https://github.com/chapmanb/bcbb/tree/master/gff)

In [8]:
from align_reads import build_index, gb2gff

# Reference files for the example organism, plus the basename of its bowtie index.
FASTA = join(ROOT_DIR, 'example/ref/NC_000913.3.fasta')
GB_FULL = join(ROOT_DIR, 'example/ref/NC_000913.3.gb')
BT_INDEX = join(ROOT_DIR, 'example/ref/NC_000913.3')

# Index construction is left disabled here; uncomment the call to build it.
#build_index(FASTA,BT_INDEX)
# Run the GenBank -> GFF conversion and keep whatever it returns in `linse`.
linse = gb2gff(FASTA, GB_FULL)

gi|556503834|ref|NC_000913.3|
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 189, 255, '.', '+', '.', 'gene_id "b0001"; transcript_id "b0001"; gene_name "thrL";']
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 336, 2799, '.', '+', '.', 'gene_id "b0002"; transcript_id "b0002"; gene_name "thrA";']
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 2800, 3733, '.', '+', '.', 'gene_id "b0003"; transcript_id "b0003"; gene_name "thrB";']
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 3733, 5020, '.', '+', '.', 'gene_id "b0004"; transcript_id "b0004"; gene_name "thrC";']
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 5233, 5530, '.', '+', '.', 'gene_id "b0005"; transcript_id "b0005"; gene_name "yaaX";']
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 5682, 6459, '.', '-', '.', 'gene_id "b0006"; transcript_id "b0006"; gene_name "yaaA";']
['gi|556503834|ref|NC_000913.3|', 'feature', 'exon', 6528, 7959, '.', '-', '.', 'gene_id "b0007"; transcript_id "b0007"; gene_n

In [7]:
# NOTE(review): appends the placeholder string 'aaa' to the value returned by gb2gff.
# This looks like leftover scratch work rather than a pipeline step -- confirm
# before relying on `linse` afterwards.
linse.append('aaa')

In [8]:
# Display the current contents of `linse`.
linse

[['gi|556503834|ref|NC_000913.3|',
  'feature',
  'exon',
  4534790,
  4536031,
  '.',
  '+',
  '.',
  'gene_id "b4308"; transcript_id "b4308"; gene_name "yjhR";'],
 'aaa']

### Enter bowtie index location

In [5]:
# Location and basename of the bowtie index passed to align_reads. This is the same
# value assigned in the optional index-building cell, repeated here so that cell can be skipped.
BT_INDEX = join(ROOT_DIR,'example/ref/NC_000913.3')

### Choose directory for aligned files

In [12]:
# Output directory handed to align_reads as its `out_dir` argument.
OUT_DIR = join(ROOT_DIR,'example/bam/')

### Enter files into a csv with 4 columns:
1. Unique sample identifier
1. Metadata file location (optional)
1. R1 file location
1. R2 file location

In [7]:
# Load the table of samples; later cells read its sample_id, metadata, R1 and R2 columns.
# The path is relative, so it is resolved against the current working directory.
DF_files = pd.read_csv('example/raw_files.csv')
# Show the first rows as a quick check of what was loaded.
DF_files.head()

Unnamed: 0,sample_id,metadata,R1,R2
0,wt_fe2_1,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
1,wt_fe2_2,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
2,wt_dpd_1,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
3,wt_dpd_2,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
4,delfur_fe2_1,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...


# QC

**Before alignment, run FastQC on your samples to assess the quality of the raw reads.**

In [8]:
# Report how many distinct sample IDs and metadata entries the sample table lists.
# The single-argument print(...) form runs on both Python 2 and Python 3.
print('Number of unique sample IDs: %d' % len(DF_files.sample_id.unique()))
print('Number of unique metadata files: %d' % len(DF_files.metadata.unique()))

Number of unique sample IDs: 8
Number of unique metadata files: 4


In [9]:
# Split every comma-separated R1/R2 entry into its individual parts.
all_R1 = [entry.split(',') for entry in DF_files.R1.values]
all_R2 = [entry.split(',') for entry in DF_files.R2.values]
# NOTE(review): the counts below are taken over the raw column strings, not over
# all_R1/all_R2, so they count distinct entries (one per row) rather than distinct
# individual files; the two labels also differ ("files" vs "lists") -- confirm intent.
# The single-argument print(...) form runs on both Python 2 and Python 3.
print('Number of unique R1 files: %d' % len(DF_files.R1.unique()))
print('Number of unique R2 lists: %d' % len(DF_files.R2.unique()))

Number of unique R1 files: 8
Number of unique R2 lists: 8


# Align Reads

The `align_reads` function takes the following required arguments:
* `name`: The unique sample name used to name the output files
* `R1`: Location of the R1 file
* `R2`: Location of the R2 file
* `bt_index`: Location of bowtie index to use for alignment
* `out_dir`: Output directory

Optional arguments:
* `aligner`: 'bowtie' or 'bowtie2' (default 'bowtie')
* `insertsize`: Maximum distance between paired ends (default 1000)
* `cores`: Number of cores to use (default 1)
* `force`: Re-runs alignment even if BAM file already exists
* `verbose`: Update user with current process

`align_reads` performs the following:
1. Unzips .gz files into a temporary folder (if necessary)
2. Uses the bowtie aligner to align reads to a bowtie index:
    * Bowtie: `bowtie -X 1000 -n 2 -p 1 -3 3 -S -1 <R1_files> -2 <R2_files> <bt_index>`
    * Bowtie2: `bowtie2 -X 1000 -N 1 -p 1 -3 3 -1 <R1_files> -2 <R2_files> -x <bt_index>`
    * For information about these options, see the docs for [bowtie](http://bowtie-bio.sourceforge.net/manual.shtml) and [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)
3. Converts the SAM output of bowtie to BAM
    * `samtools view -bS <bowtie_out> -o <unsorted_bam>`
4. Sorts the resulting BAM file
    * `samtools sort <unsorted_bam> -o <sorted_bam>`
5. Cleans up intermediate files

The final output is the location of the final BAM file and the alignment score (%), returned in that order.

In [13]:
# Align every sample in turn and record the results on that sample's row of DF_files.
for idx, sample in DF_files.iterrows():
    bam, score = align_reads(
        sample['sample_id'], sample['R1'], sample['R2'],
        BT_INDEX, OUT_DIR,
        aligner=ALIGNER, cores=4, verbose=True)
    # Written back per iteration, so earlier results are kept if a later sample fails.
    DF_files.loc[idx, 'BAM'] = bam
    DF_files.loc[idx, 'alignment'] = score

Processing wt_fe2_1
Processing wt_fe2_2
Processing wt_dpd_1
Processing wt_dpd_2
Processing delfur_fe2_1
Processing delfur_fe2_2
Processing delfur_dpd_1
Processing delfur_dpd_2


In [14]:
# Save the sample table, now including the BAM and alignment columns added by the alignment loop.
# The path is relative to the current working directory.
DF_files.to_csv('example/aligned_files.csv')

# Merge metadata (optional)

In [15]:
# Build one metadata row per entry in DF_files.metadata, then combine them once.
metadata_rows = []
for metadata in DF_files.metadata:
    # Each file is read without a header, using its first column as the index.
    df = pd.read_csv(metadata,index_col=0,header=None)
    # Record which file the values came from as an extra entry.
    df.loc['metadata'] = metadata
    # Transpose so the file's index labels become the column names of the row.
    metadata_rows.append(df.transpose())
# A single concat replaces growing DF_metadata inside the loop, which re-copied all
# previous rows on every iteration; the guard keeps an empty sample table working.
DF_metadata = pd.concat(metadata_rows) if metadata_rows else pd.DataFrame()
# Label the rows by sample ID, relying on the rows being in DF_files order.
DF_metadata.index = DF_files.sample_id
DF_metadata.head()

Unnamed: 0_level_0,creator,creator-email,project,data-type,run-date,taxonomy-id,strain-description,growth-stage,antibody,base-media,...,supplement,antibiotic,biological-replicates,technical-replicates,machine,illumina-kit,read-type,read-length,experiment-details,metadata
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wt_fe2_1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-14,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
wt_fe2_2,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-14,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
wt_dpd_1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-13,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,DPD",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
wt_dpd_2,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-13,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,DPD",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
delfur_fe2_1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-11,511145,Escherichia coli K-12 MG1655 del_fur,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",Kanamycin,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...


In [16]:
# Save the merged metadata table, indexed by sample ID.
# The path is relative to the current working directory.
DF_metadata.to_csv('example/metadata.csv')