Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 159 lines (132 sloc) 5.19 KB
#!/usr/bin/env python
import argparse
import sys
import os
import shutil
import glob
import re
import logging
# Parsed command-line options. Starts as None and is assigned the result of
# parse_args() in the ``__main__`` section of this script.
args = None
def parse_args(argv=None):
    """Parse the command line and return the normalized options.

    Logging is configured here as a side effect (INFO level,
    "[LEVEL] message" format).

    Args:
        argv: optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, which is what the script does.

    Returns:
        argparse.Namespace with the attributes ``indir``, ``outdir``,
        ``pattern``, ``keep``, ``mv`` and ``force``. ``indir`` and ``outdir``
        are passed through ``os.path.normpath``; ``outdir`` defaults to
        ``<indir>.cluster`` when it is not given.
    """
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(
        description='clustering files by regular expression [V3.0]',
        epilog="https://github.com/shenwei356/easy_qsub")

    parser.add_argument('indir', type=str, help='source directory')
    parser.add_argument('-o',
                        '--outdir',
                        type=str,
                        help='out directory [<indir>.cluster]')
    parser.add_argument(
        '-p',
        '--pattern',
        type=str,
        # The example regex is a raw string so that "\d" and "\." are not
        # treated as (invalid) string escape sequences.
        help='pattern (regular expression) of files in indir. ' +
        'if not given, it will be the longest common substring of the files. ' +
        'GROUP (parentheses) should be in the regular expression. ' +
        r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
    parser.add_argument('-k',
                        '--keep',
                        action='store_true',
                        help='keep original dir structure')
    parser.add_argument('-m',
                        '--mv',
                        action='store_true',
                        help='moving files instead of creating symbolic links')
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help='force file overwriting, i.e. deleting existed out directory')

    args = parser.parse_args(argv)

    # normpath strips trailing separators, so "<indir>.cluster" is a sibling
    # of indir rather than a ".cluster" entry inside it.
    args.indir = os.path.normpath(args.indir)
    if not args.outdir:
        args.outdir = args.indir + '.cluster'
    args.outdir = os.path.normpath(args.outdir)
    return args
def longest_common_substring(s1, s2):
    """Return the longest contiguous run of items shared by s1 and s2.

    Dynamic programming over common-suffix lengths, keeping only the
    previous row of the table. When several common substrings have the
    maximal length, the one that ends earliest in s1 is returned. The
    result is a slice of s1, and is empty when nothing is shared.
    """
    width = len(s2) + 1
    best_len = 0
    best_end = 0  # exclusive end index of the best run inside s1
    # previous[j]: length of the common suffix of s1[:i - 1] and s2[:j]
    previous = [0] * width
    for i, left in enumerate(s1, 1):
        current = [0] * width
        for j, right in enumerate(s2, 1):
            if left != right:
                continue
            run = previous[j - 1] + 1
            current[j] = run
            # strict comparison: the first maximal run found wins
            if run > best_len:
                best_len, best_end = run, i
        previous = current
    return s1[best_end - best_len:best_end]
def collect_targets(indir, pattern=None):
    """Walk ``indir`` and group the files of each directory into clusters.

    Hidden files (names starting with ".") are ignored and only regular
    directory entries reported as files by ``os.walk`` are considered.

    Args:
        indir: directory to scan recursively.
        pattern: compiled regular expression, or None. With a pattern, every
            file whose basename matches is assigned to the cluster named by
            the first captured group (the whole match if the pattern has no
            group). Without a pattern, each directory yields one cluster
            named after the longest common substring of its file names.

    Returns:
        A list of ``(directory, clusters)`` tuples, where ``clusters`` maps a
        cluster name to the list of file paths belonging to it. Directories
        without any cluster are omitted.
    """
    targets = []
    for root, dirnames, filenames in os.walk(indir):
        dirnames.sort()  # deterministic traversal order
        basenames = sorted(name for name in filenames
                           if not name.startswith('.'))  # ignore .file
        if not basenames:
            continue

        clusters = {}
        if pattern is None:
            lcs = basenames[0]
            for name in basenames[1:]:
                lcs = longest_common_substring(lcs, name)
            lcs = lcs.lstrip('.')
            if lcs == '':
                continue
            clusters[lcs] = [os.path.join(root, name)
                             for name in basenames if lcs in name]
        else:
            for name in basenames:
                match = pattern.search(name)
                if match is None:
                    continue
                cluster = match.group(1) if pattern.groups else match.group(0)
                if not cluster:
                    # an empty name would make the cluster directory
                    # collapse onto its parent directory
                    continue
                clusters.setdefault(cluster, []).append(
                    os.path.join(root, name))

        if clusters:
            targets.append((root, clusters))
    return targets


def cluster_files(targets, indir, outdir, keep=False, move=False,
                  force=False):
    """Create the cluster directories and link (or move) the files into them.

    Args:
        targets: result of ``collect_targets``.
        indir: the scanned source directory; used to compute the relative
            sub-directory when ``keep`` is set.
        outdir: directory below which the cluster directories are created.
        keep: reproduce the path of each source directory, relative to
            ``indir``, below ``outdir``.
        move: move the files instead of creating relative symbolic links.
        force: remove a cluster directory that existed before this run;
            otherwise such a directory is skipped.
    """
    # Directories created during this run. They are reused when another
    # source directory yields the same cluster, instead of being removed
    # (force) or skipped (no force) as if they had existed beforehand.
    created = set()
    for directory, clusters in targets:
        subdir = ''
        if keep:
            subdir = os.path.relpath(directory, indir)
            if subdir == os.curdir:  # files directly inside indir
                subdir = ''

        for cluster in sorted(clusters):
            cluster_dir = os.path.join(outdir, subdir, cluster)
            if cluster_dir not in created:
                if os.path.exists(cluster_dir):
                    if force:
                        shutil.rmtree(cluster_dir)
                    else:
                        logging.info("ignore existed directory: {}".format(
                            cluster_dir))
                        continue
                logging.info("create new directory: {}".format(cluster_dir))
                os.makedirs(cluster_dir)
                created.add(cluster_dir)

            for path in clusters[cluster]:
                dest = os.path.join(cluster_dir, os.path.basename(path))
                if os.path.lexists(dest):
                    # same basename coming from another source directory
                    logging.warning("skip existed file: {}".format(dest))
                    continue
                if move:  # moving file
                    shutil.move(path, dest)
                else:  # creating symbolic links
                    source = os.path.relpath(os.path.abspath(path),
                                             cluster_dir)
                    os.symlink(source, dest)


def main(args):
    """Cluster the files below ``args.indir`` into ``args.outdir``.

    Args:
        args: namespace returned by ``parse_args``.

    Exits with status 1 when the pattern is not a valid regular expression,
    when it contains no capturing group, or when no file can be clustered.
    """
    pattern = None
    if args.pattern:
        try:
            pattern = re.compile(args.pattern)
        except re.error:
            logging.error("illegal regular expression: {}".format(
                args.pattern))
            sys.exit(1)

        # Ask the compiled pattern for its group count rather than searching
        # the text for parentheses, which also accepts "\(" or "(?:...)".
        if pattern.groups == 0:
            logging.error(
                'GROUP (parenthese) should be in the regular expression. ' +
                r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
            sys.exit(1)

    targets = collect_targets(args.indir, pattern)
    if not targets:
        if args.pattern:
            logging.error('no files match pattern: {}'.format(args.pattern))
        else:
            logging.error('no files found in: {}'.format(args.indir))
        sys.exit(1)

    if os.path.exists(args.outdir):
        if args.force:
            shutil.rmtree(args.outdir)
        else:
            logging.info("update existed directory: {}".format(args.outdir))

    cluster_files(targets, args.indir, args.outdir,
                  keep=args.keep, move=args.mv, force=args.force)


if __name__ == '__main__':
    args = parse_args()
    main(args)