In [1]:
import pandas as pd
import os, subprocess, gzip, re

# Setup

In [10]:
# Load the table of raw sequencing files. Later cells read its project,
# condition, rep, metadata, R1 and R2 columns.
DF_files = pd.read_csv(
    'raw_data_info.csv',
    index_col=0,
)

In [11]:
def gunzip(gz,out_dir):
    """Decompress a gzip file into ``out_dir`` and return the new file's path.

    Parameters
    ----------
    gz : str
        Path of the gzip-compressed input. The output file name is the
        input's base name with its last three characters removed, so the
        name is expected to end in ``.gz``.
    out_dir : str
        Existing directory that receives the decompressed file.

    Returns
    -------
    str
        Path of the decompressed file inside ``out_dir``.
    """
    import shutil  # local import: the notebook's import cell does not load shutil

    basename = os.path.split(gz)[1][:-3]
    result = os.path.join(out_dir,basename)
    with gzip.open(gz,'rb') as compressed:
        # Write in binary mode so the bytes are copied verbatim, and stream
        # the copy so a large file is never held in memory in one piece.
        with open(result,'wb') as plain:
            shutil.copyfileobj(compressed,plain)
    return result

def get_alignment_score(out_dir):
    """Return the alignment percentage recorded in ``out_dir/bowtie_align.txt``.

    The file holds bowtie's stderr as written by ``process_data``. The value
    is taken from the first line mentioning ``reported alignment`` that
    carries a ``(NN.NN%)`` figure; if no such line exists, the second line
    of the file is tried instead.

    Parameters
    ----------
    out_dir : str
        Directory containing ``bowtie_align.txt``.

    Returns
    -------
    float
        The percentage, e.g. ``46.76``.

    Raises
    ------
    ValueError
        If no percentage can be found in the report.
    """
    filename = os.path.join(out_dir,'bowtie_align.txt')
    with open(filename,'r') as f:
        lines = f.readlines()

    percent = re.compile(r'\(([\d.]+)%\)')

    # Warnings printed before the summary shift its position, so look for
    # the summary line by content rather than by line number.
    candidates = [line for line in lines if 'reported alignment' in line]
    if not candidates and len(lines) > 1:
        candidates = [lines[1]]

    for line in candidates:
        match = percent.search(line)
        if match:
            return float(match.group(1))

    raise ValueError('No alignment percentage found in %s'%filename)

def process_data(project,condition,rep,R1,R2,cores=8,verbose=False,
                 root_dir='/home/anand/Dropbox/Projects/biggdata_analysis/data/processed_data/',
                 bowtie_ref='/home/anand/bin/bowtie-1.1.2/indexes/NC_000913.3'):
    """Align one replicate's paired-end reads with bowtie and build a sorted BAM.

    Output is written to ``root_dir/project/condition/rep`` and files are
    named ``project__condition__rep``. If the sorted BAM already exists the
    alignment is skipped and the stored result is returned.

    Parameters
    ----------
    project, condition : str
        Labels identifying the sample.
    rep
        Replicate identifier; converted with ``str`` for paths and names.
    R1, R2 : str
        Comma-separated fastq paths for mate 1 and mate 2. Entries ending in
        ``.gz`` are decompressed into the output directory first.
    cores : int
        Number of threads passed to bowtie (``-p``).
    verbose : bool
        Print progress messages.
    root_dir : str
        Base directory for processed data.
    bowtie_ref : str
        Bowtie index prefix to align against.

    Returns
    -------
    tuple
        ``(sorted_bam_path, alignment_percentage)``.

    Raises
    ------
    subprocess.CalledProcessError
        If bowtie or samtools exits with a non-zero status. Intermediate
        files and any partial BAM are removed; ``bowtie_align.txt`` is kept
        for diagnosis.
    """
    out_dir = os.path.join(root_dir,project,condition,str(rep))
    basename = '__'.join([project,condition,str(rep)])
    sorted_bam = os.path.join(out_dir,basename+'.bam')

    # Quit if the final BAM already exists
    if os.path.isfile(sorted_bam):
        return sorted_bam,get_alignment_score(out_dir)

    if verbose:
        print('Processing %s -> %s -> %s'%(project,condition,rep))

    # Create directories if they don't already exist
    if not os.path.isdir(out_dir):
        if verbose:
            print('Creating directories %s'%out_dir)
        os.makedirs(out_dir)

    bowtie_out = os.path.join(out_dir,basename+'.sam')
    bowtie_err = os.path.join(out_dir,'bowtie_align.txt')
    unsorted_bam = os.path.join(out_dir,basename+'.unsorted.bam')

    finished = False
    try:
        ### Unzip fastq files ###
        r1_files = _stage_fastqs(R1,out_dir,verbose)
        r2_files = _stage_fastqs(R2,out_dir,verbose)

        ### Run Bowtie Aligner ###
        # -X max insert size, -n max seed mismatches, -p threads,
        # -3 bases trimmed from the 3' end, -S SAM output
        options = ['-X','1000','-n','2','-p',str(cores),'-3','3','-S']
        if verbose:
            print('Running bowtie aligner...')
        with open(bowtie_out,'w') as out:
            with open(bowtie_err,'w') as err:
                # check_call: a failed alignment must stop the pipeline
                # instead of feeding a broken SAM file to samtools.
                subprocess.check_call(['bowtie']+options+['-1',','.join(r1_files),
                                       '-2',','.join(r2_files),bowtie_ref],
                                      stdout=out,stderr=err)

        ### Post-process files ###
        if verbose:
            print('Converting to BAM...')
        subprocess.check_call(['samtools','view','-bS',bowtie_out,'-o',unsorted_bam])
        if verbose:
            print('Sorting BAM file...')
        subprocess.check_call(['samtools','sort',unsorted_bam,'-o',sorted_bam])
        finished = True
    finally:
        ### Clear all non-bam files ###
        if verbose:
            print('Cleaning up...')
        leftovers = [os.path.join(out_dir,f) for f in os.listdir(out_dir)
                     if f.endswith('fastq')]
        leftovers += [bowtie_out,unsorted_bam]
        if not finished:
            # A partial BAM would make the early-exit check above treat
            # this replicate as done on the next run.
            leftovers.append(sorted_bam)
        for path in leftovers:
            if os.path.isfile(path):
                os.remove(path)

    ### Find alignment score and return it with the BAM path ###
    return sorted_bam,get_alignment_score(out_dir)


def _stage_fastqs(fastq_list,out_dir,verbose):
    """Split a comma-separated fastq list, unzipping ``.gz`` entries into ``out_dir``."""
    staged = []
    for fastq in fastq_list.split(','):
        if fastq.endswith('.gz'):
            if verbose:
                print('Unzipping file: %s'%fastq)
            staged.append(gunzip(fastq,out_dir))
        else:
            staged.append(fastq)
    return staged

# QC

In [12]:
# Distinct (project, condition) pairs.
print('Number of conditions: %d' % DF_files[['project','condition']].drop_duplicates().shape[0])
# Distinct (project, condition, metadata) triples.
print('Number of unique conditions: %d' % DF_files[['project','condition','metadata']].drop_duplicates().shape[0])
# Distinct metadata file paths.
print('Number of unique metadata files: %d' % len(DF_files['metadata'].unique()))

Number of conditions: 114
Number of unique conditions: 114
Number of unique metadata files: 87


In [13]:
# Distinct (project, condition, rep) combinations.
print('Number of Replicates: %d' % DF_files[['project','condition','rep']].drop_duplicates().shape[0])

# Split every comma-separated R1/R2 entry into its individual fastq paths.
all_R1 = [r1.split(',') for r1 in DF_files['R1'].values]
all_R2 = [r1.split(',') for r1 in DF_files['R2'].values]

# NOTE(review): both counts below are over whole comma-joined entries (one per
# row), not over individual files; all_R1/all_R2 are not used by them.
print('Number of unique R1 files: %d' % len(DF_files['R1'].unique()))
print('Number of unique R2 lists: %d' % len(DF_files['R2'].unique()))

Number of Replicates: 174
Number of unique R1 files: 174
Number of unique R2 lists: 174


### Assertions

In [14]:
# No fastq path may be listed twice inside a single R1 or R2 entry.
for mate in ('R1','R2'):
    for entry in DF_files[mate].unique():
        paths = entry.split(',')
        assert len(paths) == len(set(paths))

# Every R1 path must contain 'R1' or '_1.fastq'; every R2 path must contain
# 'R2' or '_2.fastq'.
for mate,suffix in (('R1','_1.fastq'),('R2','_2.fastq')):
    for entry in DF_files[mate].unique():
        for path in entry.split(','):
            assert (mate in path) or (suffix in path)

# Process Reads

In [24]:
# Align every row of the file table and record the BAM path and alignment
# percentage on that row. Rows whose BAM already exists return immediately
# from process_data.
for i,row in DF_files.iterrows():
    bam,score = process_data(
        project=row['project'],
        condition=row['condition'],
        rep=row['rep'],
        R1=row['R1'],
        R2=row['R2'],
        cores=4,
        verbose=True,
    )
    DF_files.loc[i,'BAM'] = bam
    DF_files.loc[i,'alignment'] = score

In [25]:
# Write the file table back over the CSV it was loaded from, including any
# BAM and alignment columns filled in above.
DF_files.to_csv(path_or_buf='raw_data_info.csv')

In [17]:
# Show the rows whose alignment percentage is below 90.
DF_files.loc[DF_files['alignment'] < 90]

Unnamed: 0,project,condition,rep,metadata,R1,R2,BAM,alignment
10,gadewx,delgadx_ph5,1,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/home/anand/Dropbox/Projects/biggdata_analysis...,46.76
11,gadewx,delgadx_ph5,2,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/media/nucleoid/raw_data/dhkim/gadewx/2016-02-...,/home/anand/Dropbox/Projects/biggdata_analysis...,78.31
45,me_param,bw25113_gly,2,/media/nucleoid/raw_data/dhkim/me_param/2016-0...,/media/nucleoid/raw_data/dhkim/me_param/2016-0...,/media/nucleoid/raw_data/dhkim/me_param/2016-0...,/home/anand/Dropbox/Projects/biggdata_analysis...,82.97
79,minspan,bw25113_ade,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,78.38
80,minspan,bw25113_trp,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,78.6
82,minspan,bw25113_delnac_ade,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,81.84
83,minspan,bw25113_delcra_glc,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,84.03
84,minspan,bw25113_delcra_trp,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,76.33
86,minspan,bw25113_delmntr_anaero,1,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/media/nucleoid/raw_data/avsastry/minspan/SRR9...,/home/anand/Dropbox/Projects/biggdata_analysis...,67.99
90,minspan,bw25113_glc_anaero,1,/media/nucleoid/raw_data/avsastry/minspan/wt_a...,/media/nucleoid/raw_data/avsastry/minspan/wt_a...,/media/nucleoid/raw_data/avsastry/minspan/wt_a...,/home/anand/Dropbox/Projects/biggdata_analysis...,65.51


## STOP HERE - Load Metadata

In [18]:
# Read the metadata file referenced by every row of DF_files and stack the
# results into one table. Each file is read without a header and with its
# first column as the index, then transposed so that column's entries become
# the table's columns (presumably key/value rows - confirm against a sample file).
# Rows are kept one per DF_files row, duplicates included, because a later
# cell addresses rows by position.
metadata_frames = []
for metadata in DF_files.metadata:
    df = pd.read_csv(metadata,index_col=0,header=None)
    # Remember which file the record came from; used later as the merge key.
    df.loc['metadata'] = metadata
    metadata_frames.append(df.transpose())

# Concatenate once instead of growing the frame inside the loop.
if metadata_frames:
    DF_metadata = pd.concat(metadata_frames).reset_index(drop=True)
else:
    DF_metadata = pd.DataFrame()

IOError: File /media/nucleoid/raw_data/troy_sandberg/SSW_RNAseq/1_133_S28/2016-10-01_RNA-seq.csv does not exist

In [11]:
# Strip parenthesised annotations from the source columns. Carbon sources are
# additionally lower-cased and have spaces removed. Non-string values (such
# as NaN for a missing entry) are passed through unchanged in all three
# columns.
DF_metadata['carbon-source'] = [re.sub(r' |\(.*\)','',x.lower()) if isinstance(x,str) else x
                                for x in DF_metadata['carbon-source']]
DF_metadata['nitrogen-source'] = [re.sub(r'\(.*\)','',x) if isinstance(x,str) else x
                                  for x in DF_metadata['nitrogen-source']]
DF_metadata['electron-acceptor'] = [re.sub(r'\(.*\)','',x) if isinstance(x,str) else x
                                    for x in DF_metadata['electron-acceptor']]

In [12]:
# Set the project label of the rows with index 0 and 1.
# NOTE(review): this depends on the row order produced by the load step -
# confirm these are the intended records.
for row_label in (0,1):
    DF_metadata.loc[row_label,'project'] = 'wt_dhk'

In [13]:
# Drop repeated metadata records, then attach condition and replicate from
# the file table by joining on the metadata file path.
DF_metadata = DF_metadata.drop_duplicates().merge(
    DF_files[['condition','rep','metadata']],
    on='metadata',
)

In [14]:
# Name each row <project>__<condition>__<rep>, with the project lower-cased
# and its spaces turned into underscores.
for i,row in DF_metadata.iterrows():
    DF_metadata.loc[i,'name'] = '__'.join([
        row['project'].lower().replace(' ','_'),
        row['condition'],
        str(row['rep']),
    ])

In [15]:
# Save the assembled metadata table.
DF_metadata.to_csv(path_or_buf='metadata.csv')