forked from fanhuan/script
-
Notifications
You must be signed in to change notification settings - Fork 0
/
count2hist.py
executable file
·42 lines (34 loc) · 1.32 KB
/
count2hist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/python
from os.path import join, isfile, splitext
from optparse import OptionParser
import os
import re
import math
import subprocess
import gzip
import collections
import operator
import sys
def smartopen(filename,*args,**kwargs):
    '''Open *filename*, choosing the opener from the file extension.

    Names ending in '.gz' are handed to gzip.open; every other name is
    handed to the builtin open. All extra positional and keyword arguments
    are forwarded unchanged to whichever opener is selected, and the file
    object it returns is returned to the caller.

    NOTE: when no mode is passed, each opener applies its own default; on
    Python 3 gzip.open defaults to binary while open defaults to text, so
    pass an explicit mode if the caller needs a consistent type.
    '''
    opener = gzip.open if filename.endswith('.gz') else open
    return opener(filename, *args, **kwargs)
Usage = "zcat x.pkdat.gz | cut -f 2 | python count2hist.py > x.hist"
parser = OptionParser(Usage)


def count2hist(lines):
    '''Build a frequency histogram of the integers found in *lines*.

    lines: any iterable of text lines (e.g. sys.stdin). Every run of
    decimal digits on a line is treated as one value; values are parsed as
    integers before counting, so "07" and "7" fall into the same bin.

    Returns a list of (value, frequency) tuples ordered by frequency,
    highest first. An empty input yields an empty list.
    '''
    abundance = collections.Counter()
    # Consume the input one line at a time so a large piped stream is never
    # held in memory as a single string; only the distinct values are kept.
    for line in lines:
        abundance.update(int(token) for token in re.findall(r'\d+', line))
    # most_common() with no argument already returns every entry sorted by
    # frequency in descending order, so no second sort is required.
    return abundance.most_common()


def main(argv=None, stdin=None, stdout=None):
    '''Read counts from stdin and write "value<TAB>frequency" lines to stdout.

    argv: argument list handed to the option parser; None means sys.argv[1:].
    stdin / stdout: text streams to use instead of sys.stdin / sys.stdout.
    '''
    # No options are defined; parsing still provides --help and rejects
    # unknown flags.
    parser.parse_args(argv)
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for value, frequency in count2hist(source):
        sink.write("%d\t%d\n" % (value, frequency))


if __name__ == "__main__":
    main()