# Setup

In [2]:
from rnaseq import mount_bucket,align_reads
import pandas as pd
import os

In [2]:
# Name of the bucket that holds the raw sequencing reads
DATA_BUCKET = "2018-20-06-test-data"

## Mount data bucket

In [3]:
# Mount the raw-data bucket. The returned value is later passed to align_reads;
# presumably it is the local mount path (compare the unzip paths in the alignment log).
IN_DIR = mount_bucket(DATA_BUCKET)

### Enter files into a comma-separated (CSV) file with 4 columns:
1. Unique sample identifier
  * Make these easy to read and understand
  * Biological replicates should end with _1, _2, etc.
1. R1 file location
  * If a sample's reads are split across more than one FASTQ file, separate the file locations with semicolons
1. R2 file location
1. Organism ID (from 0_setup_organism)

In [17]:
# Load the sample sheet (one row per sample) and show it for a visual check
DF_files = pd.read_csv(filepath_or_buffer='example/raw_files.csv')
DF_files

Unnamed: 0,sample_id,R1,R2,organism
0,wt_fe2_1,WT-Fe2-1_S1_L001_R1_001.fastq.gz,WT-Fe2-1_S1_L001_R2_001.fastq.gz,MG1655
1,wt_fe2_2,WT-FE2-2_S2_L001_R1_001.fastq.gz,WT-FE2-2_S2_L001_R2_001.fastq.gz,MG1655
2,wt_dpd_1,WTDPD1_S1_L001_R1_001.fastq.gz,WTDPD1_S1_L001_R2_001.fastq.gz,MG1655
3,wt_dpd_2,WTDPD2_S1_L001_R1_001.fastq.gz,WTDPD2_S1_L001_R2_001.fastq.gz,MG1655
4,delfur_fe2_1,del-fur-Fe2-1_S1_L001_R1_001.fastq.gz,del-fur-Fe2-1_S1_L001_R2_001.fastq.gz,MG1655
5,delfur_fe2_2,del-fur-Fe2-2_S2_L001_R1_001.fastq.gz,del-fur-Fe2-2_S2_L001_R2_001.fastq.gz,MG1655
6,delfur_dpd_1,delfurDPD1_S2_L001_R1_001.fastq.gz,delfurDPD1_S2_L001_R2_001.fastq.gz,MG1655
7,delfur_dpd_2,delfurDPD2_S2_L001_R1_001.fastq.gz,delfurDPD2_S2_L001_R2_001.fastq.gz,MG1655


# QC

**Before alignment, run FastQC on your samples to assess the quality of the raw reads.**

In [18]:
# Sanity check: every row of the sample sheet should have its own sample ID.
# The parenthesised single-argument print runs on both Python 2 and Python 3.
print('Number of unique sample IDs: %d' % len(DF_files.sample_id.unique()))

Number of unique sample IDs: 8


In [19]:
# Sanity check: no FASTQ file should be listed twice in the sample sheet.
# An R1/R2 entry may hold several files separated by semicolons, so each entry
# is split first and the individual files are counted rather than the raw entries.
all_R1 = [r1.split(';') for r1 in DF_files.R1.values]
all_R2 = [r2.split(';') for r2 in DF_files.R2.values]

# Flatten the per-sample lists into sets of distinct file names
unique_R1 = set(fastq for files in all_R1 for fastq in files)
unique_R2 = set(fastq for files in all_R2 for fastq in files)

print('Number of unique R1 files: %d' % len(unique_R1))
print('Number of unique R2 files: %d' % len(unique_R2))

Number of unique R1 files: 8
Number of unique R2 lists: 8


# Align Reads

The `align_reads` function takes the following required arguments:
* `name`: The unique sample name used to name the output files
* `R1`: Location of the R1 file
* `R2`: Location of the R2 file
* `bt_index`: Location of bowtie index to use for alignment
* `out_dir`: Output directory

Optional arguments:
* `aligner`: 'bowtie' or 'bowtie2' (default 'bowtie')
* `insertsize`: Maximum distance between paired ends (default 1000)
* `cores`: Number of cores to use (default 1)
* `force`: Re-runs alignment even if BAM file already exists
* `verbose`: Update user with current process

`align_reads` performs the following:
1. Unzips .gz files into a temporary folder (if necessary)
2. Uses the bowtie aligner to align reads to a bowtie index:
    * Bowtie: `bowtie -X 1000 -n 2 -p <cores> -3 3 -S -1 <R1_files> -2 <R2_files> <bt_index>`
    * Bowtie2: `bowtie2 -X 1000 -N 1 -p <cores> -3 3 -1 <R1_files> -2 <R2_files> -x <bt_index>`
    * For information about these options, see docs for [bowtie](http://bowtie-bio.sourceforge.net/manual.shtml) and [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)
3. Converts the SAM output of bowtie to BAM
    * `samtools view -b <bowtie_out> -@ <cores> -o <unsorted_bam>`
4. Sorts the resulting BAM file
    * `samtools sort <unsorted_bam> -@ <cores> -o <sorted_bam>`
5. Cleans up intermediate files

`align_reads` returns the location of the final BAM file and the alignment score (%), in that order.

In [36]:
# Directory passed to align_reads as the output location for the BAM files
OUT_DIR = "../processed_data/bam_files/"

In [23]:
# Align each sample in turn and record the BAM location and alignment score
# that align_reads returns in two new columns of the sample sheet.
for idx, sample in DF_files.iterrows():
    result = align_reads(sample.sample_id, sample.R1, sample.R2, sample.organism,
                         IN_DIR, OUT_DIR, cores=12, verbose=True)
    bam_path, align_score = result

    DF_files.loc[idx, 'BAM'] = bam_path
    DF_files.loc[idx, 'alignment'] = align_score

Processing wt_fe2_1
Unzipping file: /home/avsastry/2018-20-06-test-data/WT-Fe2-1_S1_L001_R1_001.fastq.gz
Unzipping file: /home/avsastry/2018-20-06-test-data/WT-Fe2-1_S1_L001_R2_001.fastq.gz
Running bowtie: bowtie -X 1000 -n 2 -p 12 -3 3 -S -1 ../processed_data/bam_files/tmp/WT-Fe2-1_S1_L001_R1_001.fastq -2 ../processed_data/bam_files/tmp/WT-Fe2-1_S1_L001_R2_001.fastq /home/avsastry/ref/MG1655/MG1655
Converting to BAM: samtools view -b ../processed_data/bam_files/tmp/wt_fe2_1.sam -@ 12 -o ../processed_data/bam_files/tmp/wt_fe2_1.unsorted.bam
Sorting BAM file: samtools sort ../processed_data/bam_files/tmp/wt_fe2_1.unsorted.bam -@ 12 -o ../processed_data/bam_files/wt_fe2_1.bam
Cleaning up...
Processing wt_fe2_2
Unzipping file: /home/avsastry/2018-20-06-test-data/WT-FE2-2_S2_L001_R1_001.fastq.gz
Unzipping file: /home/avsastry/2018-20-06-test-data/WT-FE2-2_S2_L001_R2_001.fastq.gz
Running bowtie: bowtie -X 1000 -n 2 -p 12 -3 3 -S -1 ../processed_data/bam_files/tmp/WT-FE2-2_S2_L001_R1_001.fas

In [6]:
# Write the sample sheet to disk (the row index is written as the first column)
DF_files.to_csv(path_or_buf='../processed_data/aligned_files.csv')