# Setup

In [1]:
from rnaseq import mount_bucket,align_reads
import pandas as pd
import os

In [2]:
# Name of the bucket that holds the raw data (handed to mount_bucket below)
DATA_BUCKET = "sbrg-precise-db"

## Mount data bucket

In [3]:
# Mount the raw-data bucket. The returned value is later extended with a
# sub-directory and passed to align_reads as the input location
# (presumably a local mount path -- confirm against rnaseq.mount_bucket).
IN_DIR = mount_bucket(DATA_BUCKET)

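### Enter files into a CSV with the following columns:
1. `sample_id`: Unique sample identifier
1. `metadata`: Metadata file location (optional; only needed for the metadata merge step)
1. `R1`: R1 file location
1. `R2`: R2 file location
1. `organism`: Organism name, passed to `align_reads` for each sample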

In [4]:
# Sample sheet: one row per sample with its ID, read files and organism
SAMPLE_SHEET = '/home/avsastry/saugat/D712_saugat.csv'
DF_files = pd.read_csv(SAMPLE_SHEET)
DF_files

Unnamed: 0,sample_id,R1,R2,organism
0,camhb__1,ARM_HRS_D712_CAMHB_0_ugmL_Naf_Rep_1_S37_L003_R...,ARM_HRS_D712_CAMHB_0_ugmL_Naf_Rep_1_S37_L003_R...,D712
1,camhb__2,ARM_HRS_D712_CAMHB_0_ugmL_Naf_Rep_2_S38_L003_R...,ARM_HRS_D712_CAMHB_0_ugmL_Naf_Rep_2_S38_L003_R...,D712
2,rpmi_25__1,ARM_HRS_D712_RPMI_0_25_ugmL_Rep_1_S4_L003_R1_0...,ARM_HRS_D712_RPMI_0_25_ugmL_Rep_1_S4_L003_R2_0...,D712
3,rpmi_25__2,ARM_HRS_D712_RPMI_0_25_ugmL_Rep_2_S5_L003_R1_0...,ARM_HRS_D712_RPMI_0_25_ugmL_Rep_2_S5_L003_R2_0...,D712
4,rpmi_25__3,ARM_HRS_D712_RPMI_0_25_ugmL_Rep_3_S6_L003_R1_0...,ARM_HRS_D712_RPMI_0_25_ugmL_Rep_3_S6_L003_R2_0...,D712
5,rpmi_5__1,ARM_HRS_D712_RPMI_0_5_ugmL_Rep_1_S7_L003_R1_00...,ARM_HRS_D712_RPMI_0_5_ugmL_Rep_1_S7_L003_R2_00...,D712
6,rpmi_5__2,ARM_HRS_D712_RPMI_0_5_ugmL_Rep_2_S8_L003_R1_00...,ARM_HRS_D712_RPMI_0_5_ugmL_Rep_2_S8_L003_R2_00...,D712
7,rpmi_5__3,ARM_HRS_D712_RPMI_0_5_ugmL_Rep_3_S9_L003_R1_00...,ARM_HRS_D712_RPMI_0_5_ugmL_Rep_3_S9_L003_R2_00...,D712
8,rpmi_0__1,ARM_HRS_D712_RPMI_0_ugmL_Naf_Rep_1_S1_L003_R1_...,ARM_HRS_D712_RPMI_0_ugmL_Naf_Rep_1_S1_L003_R2_...,D712
9,rpmi_0__2,ARM_HRS_D712_RPMI_0_ugmL_Naf_Rep_2_S2_L003_R1_...,ARM_HRS_D712_RPMI_0_ugmL_Naf_Rep_2_S2_L003_R2_...,D712


# QC

**Before alignment, run FastQC on your samples to assess the quality of the raw reads.**

In [5]:
# Sanity check: every row of the sample sheet should carry its own sample ID.
# The parenthesised single-argument print runs under both Python 2 and Python 3.
n_sample_ids = len(DF_files.sample_id.unique())
print('Number of unique sample IDs: %d' % n_sample_ids)

Number of unique sample IDs: 12


In [6]:
# Each R1/R2 entry is split on commas, giving one list of file names per sample.
all_R1 = [r1.split(',') for r1 in DF_files.R1.values]
all_R2 = [r2.split(',') for r2 in DF_files.R2.values]

# The counts below are of distinct column entries (the unsplit strings),
# not of the individual file names inside all_R1 / all_R2.
# The parenthesised single-argument print runs under both Python 2 and Python 3.
print('Number of unique R1 files: %d' % len(DF_files.R1.unique()))
print('Number of unique R2 lists: %d' % len(DF_files.R2.unique()))

Number of unique R1 files: 12
Number of unique R2 lists: 12


# Align Reads

The `align_reads` function takes the following required arguments:
* `name`: The unique sample name used to name the output files
* `R1`: Location of the R1 file
* `R2`: Location of the R2 file
* `bt_index`: Location of bowtie index to use for alignment
* `out_dir`: Output directory

Optional arguments:
* `aligner`: 'bowtie' or 'bowtie2' (default 'bowtie')
* `insertsize`: Maximum distance between paired ends (default 1000)
* `cores`: Number of cores to use (default 1)
* `force`: Re-runs alignment even if BAM file already exists
* `verbose`: Update user with current process

`align_reads` performs the following:
1. Unzips .gz files into a temporary folder (if necessary)
2. Uses the bowtie aligner to align reads to a bowtie index:
    * Bowtie: `bowtie -X 1000 -n 2 -p 1 -3 3 -S -1 <R1_files> -2 <R2_files> <bt_index>`
    * Bowtie2: `bowtie2 -X 1000 -N 1 -p 1 -3 3 -1 <R1_files> -2 <R2_files> -x <bt_index>`
    * For information about these options, see docs for [bowtie](http://bowtie-bio.sourceforge.net/manual.shtml) and [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)
3. Converts the SAM output of bowtie to BAM
    * `samtools view -bS <bowtie_out> -o <unsorted_bam>`
4. Sorts the resulting BAM file
    * `samtools sort <unsorted_bam> -o <sorted_bam>`
5. Cleans up intermediate files

The function returns the location of the final BAM file and the alignment score (%).

In [7]:
# Point IN_DIR at this project's sub-directory inside the mounted bucket.
# The guard keeps the cell idempotent: re-running it without re-mounting
# no longer appends the sub-directory a second time.
PROJECT_SUBDIR = '/saugat/'
if not IN_DIR.endswith(PROJECT_SUBDIR):
    IN_DIR = IN_DIR + PROJECT_SUBDIR

# Directory handed to align_reads as the output location
OUT_DIR = '../saugat/processed_data/'

In [None]:
# Align every sample in the sheet and store the two returned values
# (BAM location, alignment score) on that sample's row as it completes.
for idx, sample in DF_files.iterrows():
    result = align_reads(sample['sample_id'], sample['R1'], sample['R2'],
                         sample['organism'], IN_DIR, OUT_DIR,
                         cores=8, verbose=True)
    bam_path, align_score = result

    DF_files.loc[idx, 'BAM'] = bam_path
    DF_files.loc[idx, 'alignment'] = align_score

Processing camhb__1
Running bowtie: bowtie2 -X 1000 -N 1 -p 8 -3 3 -1 /home/avsastry/sbrg-precise-db/saugat/ARM_HRS_D712_CAMHB_0_ugmL_Naf_Rep_1_S37_L003_R1_001.fastq.gz -2 /home/avsastry/sbrg-precise-db/saugat/ARM_HRS_D712_CAMHB_0_ugmL_Naf_Rep_1_S37_L003_R2_001.fastq.gz -x /home/avsastry/ref/D712/D712


In [None]:
# Persist the sample sheet, including the BAM and alignment columns added above
DF_files.to_csv(path_or_buf='../saugat/aligned_files.csv')

# Merge metadata (optional)

In [15]:
# Build one metadata table with a row per sample.
# NOTE(review): this step requires a `metadata` column in DF_files holding the
# location of each sample's metadata file.
metadata_rows = []
for metadata in DF_files.metadata:
    # Each file is read without a header, using its first column as the index;
    # transposing below turns that index into the column labels of a single row.
    df = pd.read_csv(metadata, index_col=0, header=None)
    # Record which file the row came from
    df.loc['metadata'] = metadata
    metadata_rows.append(df.transpose())

# Concatenate once instead of growing the frame inside the loop, which
# re-copies all accumulated rows on every iteration.
# pd.concat rejects an empty list, so fall back to an empty frame.
DF_metadata = pd.concat(metadata_rows) if metadata_rows else pd.DataFrame()
DF_metadata.index = DF_files.sample_id
DF_metadata.head()

Unnamed: 0_level_0,creator,creator-email,project,data-type,run-date,taxonomy-id,strain-description,growth-stage,antibody,base-media,...,supplement,antibiotic,biological-replicates,technical-replicates,machine,illumina-kit,read-type,read-length,experiment-details,metadata
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wt_fe2_1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-14,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
wt_fe2_2,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-14,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
wt_dpd_1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-13,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,DPD",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
wt_dpd_2,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-13,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,DPD",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
delfur_fe2_1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-11,511145,Escherichia coli K-12 MG1655 del_fur,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",Kanamycin,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...


In [16]:
# Persist the merged metadata table.
# NOTE(review): this writes under 'example/' while the other outputs go to
# '../saugat/' -- confirm the intended destination.
DF_metadata.to_csv(path_or_buf='example/metadata.csv')