In [1]:
import pandas as pd
import os,sys,subprocess

# Setup

In [2]:
ROOT_DIR = os.getcwd() # Location of github folder

In [3]:
sys.path.append(os.path.join(ROOT_DIR,'align_reads.py'))
from align_reads import align_reads

### Choose bowtie or bowtie2 as your aligner:

In [4]:
ALIGNER = 'bowtie' # 'bowtie2'
try:
    subprocess.call(ALIGNER)
except:
    print 'Aligner not installed correctly.'

### Enter bowtie index location

In [5]:
BT_INDEX = 'example/ref/NC_000913.3'

### Choose directory for aligned files

In [6]:
OUT_DIR = 'example/'

### Enter files into a csv with 4 columns:
1. Unique sample identifier
1. Metadata file location (optional)
1. R1 file location
1. R2 file location

Note: Use absolute file locations, not relative locations

In [7]:
DF_files = pd.read_csv('example/raw_files.csv')
DF_files.head()

Unnamed: 0,sample_id,metadata,R1,R2
0,wt_fe2_1,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
1,wt_fe2_2,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
2,wt_dpd_1,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
3,wt_dpd_2,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
4,delfur_fe2_1,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...


# QC

**Before alignment, run FastQC on your samples to assess the quality of the raw reads**

In [8]:
print 'Number of unique metadata files: %d'%len(DF_files.metadata.unique())

Number of unique metadata files: 4


In [9]:
all_R1 = [r1.split(',') for r1 in DF_files.R1.values]
all_R2 = [r1.split(',') for r1 in DF_files.R2.values]
print 'Number of unique R1 files: %d'%len(DF_files.R1.unique())
print 'Number of unique R2 lists: %d'%len(DF_files.R2.unique())

Number of unique R1 files: 8
Number of unique R2 lists: 8


# Process Reads

In [None]:
for i,row in DF_files.iterrows():
    bam,score = align_reads(row.sample_id,row.R1,row.R2,BT_INDEX,OUT_DIR,
                            aligner=ALIGNER,cores=4,verbose=True)
    DF_files.loc[i,'BAM'] = bam
    DF_files.loc[i,'alignment'] = score

Processing wt_fe2_1
Processing wt_fe2_2
Unzipping file: /media/nucleoid/raw_data/dhkim/fur/2016-02-14_RNA-seq/WT-FE2-2_S2_L001_R1_001.fastq.gz
Unzipping file: /media/nucleoid/raw_data/dhkim/fur/2016-02-14_RNA-seq/WT-FE2-2_S2_L001_R2_001.fastq.gz
Running bowtie aligner...
Converting to BAM...
Sorting BAM file...
Cleaning up...
Processing wt_dpd_1
Unzipping file: /media/nucleoid/raw_data/dhkim/fur/2016-02-13_RNA-seq/WTDPD1_S1_L001_R1_001.fastq.gz
Unzipping file: /media/nucleoid/raw_data/dhkim/fur/2016-02-13_RNA-seq/WTDPD1_S1_L001_R2_001.fastq.gz
Running bowtie aligner...
Converting to BAM...
Sorting BAM file...
Cleaning up...
Processing wt_dpd_2
Unzipping file: /media/nucleoid/raw_data/dhkim/fur/2016-02-13_RNA-seq/WTDPD2_S1_L001_R1_001.fastq.gz
Unzipping file: /media/nucleoid/raw_data/dhkim/fur/2016-02-13_RNA-seq/WTDPD2_S1_L001_R2_001.fastq.gz
Running bowtie aligner...
Converting to BAM...
Sorting BAM file...
Cleaning up...
Processing delfur_fe2_1
Unzipping file: /media/nucleoid/raw_data

In [12]:
os.rmdir('example/tmp/')

OSError: [Errno 39] Directory not empty: 'example/tmp/'

In [17]:
DF_files

Unnamed: 0,project,condition,rep,metadata,R1,R2,BAM,alignment
10,gadewx,delgadx_ph5,1,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/home/anand/Dropbox/Projects/biggdata_analysis...,46.76
11,gadewx,delgadx_ph5,2,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/home/anand/Dropbox/Projects/biggdata_analysis...,78.31
45,me_param,bw25113_gly,2,/media/nucleoid/raw_data/dhkim/me_param/2016-0...,/media/nucleoid/raw_data/dhkim/me_param/2016-0...,/media/nucleoid/raw_data/dhkim/me_param/2016-0...,/home/anand/Dropbox/Projects/biggdata_analysis...,82.97
79,minspan,bw25113_ade,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,78.38
80,minspan,bw25113_trp,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,78.6
82,minspan,bw25113_delnac_ade,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,81.84
83,minspan,bw25113_delcra_glc,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,84.03
84,minspan,bw25113_delcra_trp,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,76.33
86,minspan,bw25113_delmntr_anaero,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,67.99
90,minspan,bw25113_glc_anaero,1,/media/nucleoid/raw_data/avsastry/minspan/wt_a...,/media/nucleoid/raw_data/avsastry/minspan/wt_a...,/media/nucleoid/raw_data/avsastry/minspan/wt_a...,/home/anand/Dropbox/Projects/biggdata_analysis...,65.51


# Merge metadata (optional)

In [12]:
DF_metadata = pd.DataFrame()
for metadata in DF_files.metadata:
    df = pd.read_csv(metadata,index_col=0,header=None)
    df.loc['metadata'] = metadata
    DF_metadata = pd.concat([DF_metadata,df.transpose()])
DF_metadata = DF_metadata.reset_index(drop=True)
DF_metadata.head()

Unnamed: 0,creator,creator-email,project,data-type,run-date,taxonomy-id,strain-description,growth-stage,antibody,base-media,...,supplement,antibiotic,biological-replicates,technical-replicates,machine,illumina-kit,read-type,read-length,experiment-details,metadata
0,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-14,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
1,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-14,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-14_...
2,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-13,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,DPD",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
3,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-13,511145,Escherichia coli K-12 MG1655,mid-log,,M9,...,"non-sauer trace element mixture,DPD",,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-13_...
4,Donghyuk Kim,dok023@ucsd.edu,fur,RNA-seq,2016-02-11,511145,Escherichia coli K-12 MG1655 del_fur,mid-log,,M9,...,"non-sauer trace element mixture,FeCl2",Kanamycin,2,1,MiSeq,50 Cycle,Paired-end reads,31,,/media/nucleoid/raw_data/dhkim/fur/2016-02-11_...
