In [7]:
from math import log
unit_list = zip(['bytes', 'kB', 'MB', 'GB', 'TB', 'PB'], [0, 0, 1, 2, 2, 2])
def sizeof_fmt(num):
    """Human friendly file size"""
    if num > 1:
        exponent = min(int(log(num, 1024)), len(unit_list) - 1)
        quotient = float(num) / 1024**exponent
        unit, num_decimals = unit_list[exponent]
        format_string = '{:.%sf} {}' % (num_decimals)
        return format_string.format(quotient, unit)
    if num == 0:
        return '0 bytes'
    if num == 1:
        return '1 byte'

In [6]:
class ListTable(list):
    """ Overridden list class which takes a 2-dimensional list of 
        the form [[1,2,3],[4,5,6]], and renders an HTML Table in 
        IPython Notebook. """
    
    def _repr_html_(self):
        html = ["<table>"]
        count = 0
        for row in self:
            html.append("<tr>")
            
            for col in row:
                if count == 0:
                    html.append("<td><strong>{0}</strong></td>".format(col))
                else:
                    html.append("<td>{0}</td>".format(col))
            
            html.append("</tr>")
            count += 1
        html.append("</table>")
        return ''.join(html)

In [5]:
# Imports / style (run this first always)

%matplotlib inline
from IPython.display import FileLink, FileLinks
from IPython.core import display
from collections import defaultdict
import json
import sys
import time

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

class AwesomeError(Exception):
     def __init__(self, value):
         self.value = value
         pass
     def __str__(self):
         return repr(self.value)
         pass

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

import Bio as bp
from Bio.Sequencing.Applications import BwaAlignCommandline as bwa_aln
from Bio.Sequencing.Applications import BwaSamseCommandline as bwa_samse
from Bio.Sequencing.Applications import BwaSampeCommandline as bwa_sampe
from Bio.Sequencing.Applications import BwaIndexCommandline as bwa_index
from Bio.Sequencing.Applications import BwaBwaswCommandline as bwa_bwasw
import HTSeq as ht
import subprocess

In [4]:
!pip install subprocess

[33mYou are using pip version 6.0.8, however version 7.1.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting subprocess
[31m  Could not find any downloads that satisfy the requirement subprocess[0m
[31m  No distributions at all found for subprocess[0m


In [39]:
import os
import struct

def getuncompressedsize(filename):
    with open(filename) as f:
        f.seek(-4, 2)
        return struct.unpack('I', f.read(4))[0]
    
print "root prints out directories only from what you specified"
print "dirs prints out sub-directories from root"
print "files prints out all files from root and directories"
print "*" * 20
master_file_array = []
prefix = "/home/neffr01/minerva/short_reads/"
for root, dirs, files in os.walk(prefix):
    #print root
    #print dirs
    holdit = []
    for file_ in files:
        tmp = os.path.join(root, file_)
        try:
            size = os.path.getsize(tmp)
        except:
            continue
        ext = tmp.split('/')[-1].split('.')[-1]
        if ext == "gz":
            ext = '.'.join([tmp.split('/')[-1].split('.')[-2], 'gz'])
            #size = getuncompressedsize(tmp)
        holdit.append((tmp.split('/')[5], file_, ext, size, tmp))
    master_file_array.append(holdit)
    #print files
len(master_file_array)
master_file_array2 = [item for sublist in master_file_array for item in sublist] # only do this once!
master_file_array2.sort()
pdarr = pd.DataFrame(master_file_array2)
pdarr_pretty = pd.DataFrame(master_file_array2, columns=["folder", "name", "extension", "size", "path"])[['.' not in i[0] for i in pdarr[1]]]
pdarr = pdarr[['.' not in i[0] for i in pdarr[1]]] # remove hidden files
grouped = pdarr.groupby(pdarr[0])

root prints out directories only from what you specified
dirs prints out sub-directories from root
files prints out all files from root and directories
********************


In [None]:
import os
import struct
def getlastchars(filename):
    with open(filename) as f:
        f.seek(-16, 2)
        return f.read(16)

In [40]:
fastq_files = pdarr_pretty[(pdarr_pretty['extension'] == 'fastq.gz')]

In [41]:
fastq_files['parent'] = fastq_files['path'].map(lambda x: '/'.join(x.split('/')[-4:-3]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [42]:
fastq_files

Unnamed: 0,folder,name,extension,size,path,parent
82,hg002,2A1_CGATGT_L001_R1_001.fastq.gz,fastq.gz,428893454,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140528_D00360_0019_BH8VDAADXX
83,hg002,2A1_CGATGT_L001_R1_001.fastq.gz,fastq.gz,509951892,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140528_D00360_0018_AH8VC6ADXX
84,hg002,2A1_CGATGT_L001_R1_001.fastq.gz,fastq.gz,513979404,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140605_D00360_0021_BH9V1VADXX
85,hg002,2A1_CGATGT_L001_R1_001.fastq.gz,fastq.gz,514453620,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140605_D00360_0020_AH9V1RADXX
86,hg002,2A1_CGATGT_L001_R1_001.fastq.gz,fastq.gz,523477905,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140609_D00360_0022_AH9UJNADXX
87,hg002,2A1_CGATGT_L001_R1_001.fastq.gz,fastq.gz,524462927,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140609_D00360_0023_BH9UD5ADXX
88,hg002,2A1_CGATGT_L001_R1_002.fastq.gz,fastq.gz,167071275,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140605_D00360_0020_AH9V1RADXX
89,hg002,2A1_CGATGT_L001_R1_002.fastq.gz,fastq.gz,180252467,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140605_D00360_0021_BH9V1VADXX
90,hg002,2A1_CGATGT_L001_R1_002.fastq.gz,fastq.gz,203174069,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140528_D00360_0018_AH8VC6ADXX
91,hg002,2A1_CGATGT_L001_R1_002.fastq.gz,fastq.gz,206957195,/home/neffr01/minerva/short_reads/hg002/ftp.nc...,140609_D00360_0023_BH9UD5ADXX


In [43]:
study="HG002-GIAB"
left=""
right=""
sample="HG002"
lane=""
outfile = open('/home/neffr01/minerva/short_reads.index', 'wb')
for i,n in fastq_files.iterrows():
    basepath = '/'.join(n['path'].split('/')[0:-1]) + '/'
    if "R1" not in n['name']:
        continue
    sample = n['parent'] + "-" + "_".join(n['name'].split("_")[0:3])
    left = n['path']
    right = n['path'].replace('_R1_','_R2_')
    lane = n['name'].split('_')[2].strip('L')
    outfile.write('\t'.join([left, right, sample, lane, study]) + '\n')
outfile.close()

In [None]:
bamfiles_quarles

In [None]:
samples_set = set(["_".join(i['name'].split("_")[0:2]) for a,i in bamfiles_quarles.iterrows()])

In [None]:
for i in samples_set:
    print i

In [None]:
for i in samples_set:
    names = bamfiles_quarles[[i == "_".join(a.split("_")[0:2]) for a in bamfiles_quarles['name']]]
    try:
        print "\t".join([list(names[["out" in a for a in names['name']]]['path'])[0],
                         list(names[["merged_mhgrid.bam_rg.bam" in a for a in names['name']]]['path'])[0], 
                        i])
    except:
        sys.stderr.write("error on sample " + i + " \n")

In [None]:
sizeof_fmt(sum(pdarr_pretty['size']))

In [None]:
redcap_ids = pd.read_csv('redcap_ids.txt', header=None)

In [None]:
redcap_ids = [i[0] for a,i in redcap_ids.iterrows()]
redcap_ids

In [None]:
index_list = []
for a,i in pdarr_pretty.iterrows():
    print i['name']
    continue
    inlist = any([True for b in redcap_ids if b in str(i['name'])])
    if inlist:
        index_list.append(a)

In [None]:
len(index_list)

In [None]:
sizeof_fmt(sum(pdarr_pretty['size']))

In [None]:
pdarr_pretty.sort(columns='size', ascending=False, inplace=True)

In [None]:
pdarr_pretty['size'] = [sizeof_fmt(i) for i in pdarr_pretty['size']]

In [None]:
pdarr_pretty[(pdarr_pretty['extension']=='fastq.gz')].to_csv('totalRNA.txt', sep='\t', index=None)

In [None]:
pdarr_pretty[(pdarr_pretty['extension']=='bam')].to_csv('/data/projects_gibbons/home/neffra/mhgrid_tot_rna_bams', sep='\t')

In [None]:
sorted(filedict.keys())

In [None]:
res = pdarr_pretty[pdarr_pretty.duplicated(cols='name') | pdarr_pretty.duplicated(cols='name', take_last=True)].sort(columns='name')
res2 = res[res.duplicated(cols='size') | res.duplicated(cols='size', take_last=True)].sort(columns='name')
res2['path'] = ['/'.join(i.split('/')[6:]) for i in list(res2['path'])]
res3 = res2[[('QC' not in i) & ('.txt' not in i) for i in res2['path']]]

duplevel = 0
for i in res3.iterrows():
    duplevel += int(i[1][3])

print "Size of duplicated files: " + sizeof_fmt(duplevel/2)
    

In [None]:
#use this to look for files within the directory

ext = "vcf"
totsize = 0
for i in list(pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]]['size']):
    totsize += int(i)

print "\nTotal size: " + sizeof_fmt(totsize)
                                    
display.display_html(pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]])

a = pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]]
a.to_excel('vcffiles.xlsx', 'VCFFiles')

In [None]:
#use this to look for files within the directory

ext = "fastq.gz"
totsize = 0
for i in list(pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]]['size']):
    totsize += int(i)

print "\nTotal uncompressed size: " + sizeof_fmt(totsize)

g = pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]]
g['name'] = ['_'.join(x.split("_")[0:4]) for x in g['name']]
count = 0

totsize = 0
#merge_file = open("/data/test/home/neffra/mhgrid_analysis/scripts/merge_all_mhgrid.sh", "wb")
alt_file = open("/data/test/home/neffra/mhgrid_analysis/align_files2", "wb")
for name, group in pdarr_pretty[["MHM" in i for i in pdarr_pretty['name']]].groupby(g['name']):
    #names = sorted(list(set('_'.join(x.split("_")[0:4]) for x in group['name'])))
    #names = [i + "_bwamem.realigned.bam" for i in names]
    #merge_file.write('qsub -b y -N merge -o /dev/null -e mergeout.err "samtools merge ' + name + "_merged_mhgrid.bam " + " ".join(names) + '"\n')
    alt_file.write("_".join(str(list(group['path'])[0]).split("_")[0:-2]) + " " + name + "\n")
    #l = str(list(group['path'])[0]).split("/")[0:-1]
    #l.append(name)
    #print '/'.join(l)
    #totsize += np.sum(group['size'])
#    print len(group)
    #count += 1
    #if count > 50: 
     #   break
alt_file.close()
#merge_file.close()
print sizeof_fmt(totsize/count)
print count
                                    
print list(g[["MHM" in i for i in g['name']]].head(1)['path'])

#for i in list(pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]]['path'])[0:10]:
#    html = '<a href="sftp://neffra@walnut.nhgri.nih.gov'
#    html += i
#    html += '">' + i.split('/')[-1] + "</a>"
#    h = HTML(html)
#    display.display_html(h)

In [None]:
ext = "fastq.gz"
study = "S1304120MHM"
totsize = 0
for i in list(pdarr_pretty[[ext in i for i in pdarr_pretty['extension']]]['size']):
    totsize += int(i)

print "\nTotal uncompressed size: " + sizeof_fmt(totsize)

f = pdarr_pretty[[study in i for i in pdarr_pretty['name']]]
g = f[[ext in i for i in f['extension']]]
g['name'] = ['_'.join(x.split("_")[0:2]) for x in g['name']]

print len(set(g['name']))
g

In [None]:
masterdict = dict()
for name, group in grouped:
    print name
    filedict = dict()
    
    for idx, item in group.iterrows():
        ext = item[2]
        if ext in filedict:
            filedict[ext][0] += item[3]
            filedict[ext][1] += 1
        else:
            filedict[ext] = [item[3], 1]
    tbl = ListTable()
    tbl.append(['extension', '# files', 'size'])
    for key in sorted(filedict.keys()):
        tbl.append([key, str(filedict[key][1]), sizeof_fmt(filedict[key][0])])
    display.display_html(tbl)
    for key in filedict:
        if key in masterdict:
            masterdict[key][0] += filedict[key][0]
            masterdict[key][1] += filedict[key][1]
        else:
            masterdict[key] = filedict[key]

    

In [None]:
print "---------------\n"
print "Master stats:\n"
totsize = 0
tbl = ListTable()
tbl.append(['extension', '# files', 'size'])
for key in sorted(masterdict.keys()):
    if masterdict[key][0] >= 1*1024*1024*512:
        tbl.append([key, str(masterdict[key][1]), sizeof_fmt(masterdict[key][0])])
    totsize += masterdict[key][0]
print sizeof_fmt(totsize)
display.display_html(tbl)