##scandirectory

Finds all of the files in a certain directory and makes nice looking tables based on what the most frequent file type is, what size that file type takes up, etc.

In [1]:
from math import log
unit_list = zip(['bytes', 'kB', 'MB', 'GB', 'TB', 'PB'], [0, 0, 1, 2, 2, 2])
def sizeof_fmt(num):
    """Human friendly file size"""
    if num > 1:
        exponent = min(int(log(num, 1024)), len(unit_list) - 1)
        quotient = float(num) / 1024**exponent
        unit, num_decimals = unit_list[exponent]
        format_string = '{:.%sf} {}' % (num_decimals)
        return format_string.format(quotient, unit)
    if num == 0:
        return '0 bytes'
    if num == 1:
        return '1 byte'

In [2]:
class ListTable(list):
    """ Overridden list class which takes a 2-dimensional list of 
        the form [[1,2,3],[4,5,6]], and renders an HTML Table in 
        IPython Notebook. """
    
    def _repr_html_(self):
        html = ["<table>"]
        count = 0
        for row in self:
            html.append("<tr>")
            
            for col in row:
                if count == 0:
                    html.append("<td><strong>{0}</strong></td>".format(col))
                else:
                    html.append("<td>{0}</td>".format(col))
            
            html.append("</tr>")
            count += 1
        html.append("</table>")
        return ''.join(html)

In [3]:
# Imports / style (run this first always)

%matplotlib inline
from IPython.display import FileLink, FileLinks
from IPython.core import display
from collections import defaultdict
import json
import sys
import time

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

class AwesomeError(Exception):
     def __init__(self, value):
         self.value = value
         pass
     def __str__(self):
         return repr(self.value)
         pass

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

import Bio as bp
from Bio.Sequencing.Applications import BwaAlignCommandline as bwa_aln
from Bio.Sequencing.Applications import BwaSamseCommandline as bwa_samse
from Bio.Sequencing.Applications import BwaSampeCommandline as bwa_sampe
from Bio.Sequencing.Applications import BwaIndexCommandline as bwa_index
from Bio.Sequencing.Applications import BwaBwaswCommandline as bwa_bwasw
import HTSeq as ht
import subprocess

In [4]:
!pip install subprocess

[33mYou are using pip version 6.0.8, however version 7.1.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting subprocess
[31m  Could not find any downloads that satisfy the requirement subprocess[0m
[31m  No distributions at all found for subprocess[0m


In [12]:
import os
import struct

def getuncompressedsize(filename):
    with open(filename) as f:
        f.seek(-4, 2)
        return struct.unpack('I', f.read(4))[0]
    
print "root prints out directories only from what you specified"
print "dirs prints out sub-directories from root"
print "files prints out all files from root and directories"
print "*" * 20
master_file_array = []
prefix = "/hpc/users/neffr01/jason_new/short_reads/hg004/"
for root, dirs, files in os.walk(prefix):
    #print root
    #print dirs
    holdit = []
    for file_ in files:
        tmp = os.path.join(root, file_)
        try:
            size = os.path.getsize(tmp)
        except:
            continue
        ext = tmp.split('/')[-1].split('.')[-1]
        if ext == "gz":
            ext = '.'.join([tmp.split('/')[-1].split('.')[-2], 'gz'])
            #size = getuncompressedsize(tmp)
        holdit.append((tmp.split('/')[5], file_, ext, size, tmp))
    master_file_array.append(holdit)
    #print files
len(master_file_array)
master_file_array2 = [item for sublist in master_file_array for item in sublist] # only do this once!
master_file_array2.sort()
pdarr = pd.DataFrame(master_file_array2)
pdarr_pretty = pd.DataFrame(master_file_array2, columns=["folder", "name", "extension", "size", "path"])[['.' not in i[0] for i in pdarr[1]]]
pdarr = pdarr[['.' not in i[0] for i in pdarr[1]]] # remove hidden files
grouped = pdarr.groupby(pdarr[0])

root prints out directories only from what you specified
dirs prints out sub-directories from root
files prints out all files from root and directories
********************


In [None]:
import os
import struct
def getlastchars(filename):
    with open(filename) as f:
        f.seek(-16, 2)
        return f.read(16)

In [13]:
fastq_files = pdarr_pretty[(pdarr_pretty['extension'] == 'fastq.gz')]

In [14]:
fastq_files['size'] = fastq_files['size'].map(lambda x: sizeof_fmt(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
fastq_files['parent'] = fastq_files['path'].map(lambda x: '/'.join(x.split('/')[-4:-3]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [8]:
len(fastq_files)

1084

In [16]:
study="HG004-GIAB"
left=""
right=""
sample="HG004"
lane=""
outfile = open('/hpc/users/neffr01/jason_new/short_reads/hg004.fastq.index', 'wb')
count = [0,0]
for i,n in fastq_files.iterrows():
    basepath = '/hpc/users/neffr01/jason_new/'
    if "R1" not in n['name']:
        if "R2" in n['name']:
            count[1] += 1
            continue
        else:
            print n['name']
            continue
    sample = n['parent'] + "-" + "_".join(n['name'].split("_")[0:3])
    left = n['path']
    right = n['path'].replace('_R1_','_R2_')
    lane = n['name'].split('_')[2].strip('L')
    count[0] += 1
    outfile.write('\t'.join([left, right, sample, lane, study]) + '\n')
outfile.close()
print count

[493, 492]


In [10]:
%debug

> [1;32m<ipython-input-9-a5831ba6cd2e>[0m(22)[0;36m<module>[1;34m()[0m
[1;32m     21 [1;33m    [0mcount[0m[1;33m[[0m[1;36m0[0m[1;33m][0m [1;33m+=[0m [1;36m1[0m[1;33m[0m[0m
[0m[1;32m---> 22 [1;33m    [0moutfile[0m[1;33m.[0m[0mwrite[0m[1;33m([0m[1;34m'\t'[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[1;33m[[0m[0mleft[0m[1;33m,[0m [0mright[0m[1;33m,[0m [0msample[0m[1;33m,[0m [0mlane[0m[1;33m,[0m [0mstudy[0m[1;33m][0m[1;33m)[0m [1;33m+[0m [1;34m'\n'[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m     23 [1;33m[0moutfile[0m[1;33m.[0m[0mclose[0m[1;33m([0m[1;33m)[0m[1;33m[0m[0m
[0m
ipdb> print left
['', 'hpc', 'users', 'neffr01', 'jason_new', 'short_reads', 'hg003', 'giab', 'ftp', 'technical', 'NISTAshkenazimTrio', 'HG-003_Homogeneity-12389378', 'HG003_HiSeq300x_fastq', '140627_D00360_0030_AHA0L6ADXX', 'Project_RM8392', 'Sample_3A1', '3A1_CGATGT_L001_R1_001.fastq.gz']
ipdb> exit


In [57]:
masterdict = dict()
for name, group in grouped:
    print name
    filedict = dict()
    
    for idx, item in group.iterrows():
        ext = item[2]
        if ext in filedict:
            filedict[ext][0] += item[3]
            filedict[ext][1] += 1
        else:
            filedict[ext] = [item[3], 1]
    tbl = ListTable()
    tbl.append(['extension', '# files', 'size'])
    for key in sorted(filedict.keys()):
        tbl.append([key, str(filedict[key][1]), sizeof_fmt(filedict[key][0])])
    display.display_html(tbl)
    for key in filedict:
        if key in masterdict:
            masterdict[key][0] += filedict[key][0]
            masterdict[key][1] += filedict[key][1]
        else:
            masterdict[key] = filedict[key]

    

hg002


0,1,2
extension,# files,size
csv,118,20 kB
fastq.gz,1504,615.46 GB


hg002_bams


0,1,2
extension,# files,size
10018,1,9.4 MB
10180,1,9.4 MB
10357,1,9.4 MB
10395,1,9.4 MB
1049,1,9.4 MB
10515,1,9.4 MB
10849,1,9.4 MB
11275,1,9.4 MB
11327,1,9.4 MB


In [None]:
print "---------------\n"
print "Master stats:\n"
totsize = 0
tbl = ListTable()
tbl.append(['extension', '# files', 'size'])
for key in sorted(masterdict.keys()):
    if masterdict[key][0] >= 1*1024*1024*512:
        tbl.append([key, str(masterdict[key][1]), sizeof_fmt(masterdict[key][0])])
    totsize += masterdict[key][0]
print sizeof_fmt(totsize)
display.display_html(tbl)