Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 158 lines (132 sloc) 5.19 KB
#!/usr/bin/env python2
import argparse
import sys
import os
import shutil
import glob
import re
import logging
# Module-level holder for the parsed command-line options; it is rebound to
# the result of parse_args() when the file is executed as a script.
args = None
def parse_args():
    """Parse the command line and return the normalized options.

    As a side effect the root logger is configured (INFO level,
    "[LEVEL] message" format).

    Returns:
        argparse.Namespace with the attributes ``indir``, ``outdir``,
        ``pattern``, ``keep``, ``mv`` and ``force``.  ``indir`` and
        ``outdir`` are passed through ``os.path.normpath``; when ``--outdir``
        is not given it defaults to ``<indir>.cluster``.
    """
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(
        description='clustering files by regular expression [V3.0]',
        epilog="https://github.com/shenwei356/easy_qsub")
    parser.add_argument('indir', type=str, help='source directory')
    parser.add_argument('-o',
                        '--outdir',
                        type=str,
                        help='out directory [<indir>.cluster]')
    parser.add_argument(
        '-p',
        '--pattern',
        type=str,
        # The regex example is a raw literal so that "\d" and "\." are not
        # treated as (invalid) string escape sequences.
        help='pattern (regular expression) of files in indir. '
        'if not given, it will be the longest common substring of the files. '
        'GROUP (parenthese) should be in the regular expression. '
        r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
    parser.add_argument('-k',
                        '--keep',
                        action='store_true',
                        help='keep original dir structure')
    parser.add_argument('-m',
                        '--mv',
                        action='store_true',
                        help='moving files instead of creating symbolic links')
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help='force file overwriting, i.e. deleting existed out directory')

    args = parser.parse_args()

    # Normalize both paths so that e.g. a trailing slash on indir does not
    # end up inside the default "<indir>.cluster" name.
    args.indir = os.path.normpath(args.indir)
    if not args.outdir:
        args.outdir = args.indir + '.cluster'
    args.outdir = os.path.normpath(args.outdir)
    return args
def longest_common_substring(s1, s2):
    """Return the longest contiguous substring shared by ``s1`` and ``s2``.

    Dynamic programming over all pairs of positions: for every pair the
    length of the common run ending there is the length of the run ending at
    the previous pair plus one (or zero when the characters differ).  Only
    the previous row of run lengths is kept.

    Args:
        s1: first string; the result is sliced out of this one.
        s2: second string.

    Returns:
        The longest common substring, or ``''`` when the strings share no
        character.  When several substrings have the maximal length, the one
        that ends first in ``s1`` is returned.
    """
    longest, end = 0, 0
    width = len(s2) + 1
    previous = [0] * width
    for x, char1 in enumerate(s1, 1):
        current = [0] * width
        for y, char2 in enumerate(s2, 1):
            if char1 == char2:
                current[y] = previous[y - 1] + 1
                # Strict comparison keeps the earliest maximal run.
                if current[y] > longest:
                    longest, end = current[y], x
        previous = current
    return s1[end - longest:end]
def _cluster_name(pattern, basename):
    """Return the cluster name ``pattern`` captures from ``basename``.

    The name is the first capture group of the first match (the whole match
    if the pattern has no capture group).  Returns None when ``basename``
    does not match.
    """
    match = pattern.search(basename)
    if match is None:
        return None
    if pattern.groups:
        # A group that did not take part in the match yields None; map it
        # to '' like re.findall does.
        return match.group(1) or ''
    return match.group(0)


def _collect_targets(indir, pattern=None):
    """Walk ``indir`` and group the files of every directory into clusters.

    Args:
        indir: directory that is scanned recursively.
        pattern: compiled regular expression used to name the clusters, or
            None to put all files of a directory into one cluster named
            after the longest common substring of their names.

    Returns:
        A list of ``(dirpath, clusters)`` tuples; ``clusters`` maps a cluster
        name to the list of file paths that belong to it.  Directories
        without any clustered file are left out.
    """
    targets = []
    for dirpath, dirnames, filenames in os.walk(indir):
        dirnames.sort()  # deterministic traversal order
        # Files whose name starts with "." are ignored.
        basenames = sorted(name for name in filenames
                           if not name.startswith('.'))
        if not basenames:
            continue

        clusters = {}
        if pattern is None:
            lcs = basenames[0]
            for name in basenames[1:]:
                lcs = longest_common_substring(lcs, name)
            lcs = lcs.lstrip('.')
            if lcs:
                # lcs is a substring of every name, so the whole directory
                # forms a single cluster.
                clusters[lcs] = [os.path.join(dirpath, name)
                                 for name in basenames]
        else:
            for name in basenames:
                cluster = _cluster_name(pattern, name)
                if cluster is not None:
                    clusters.setdefault(cluster, []).append(
                        os.path.join(dirpath, name))

        if clusters:
            targets.append((dirpath, clusters))
    return targets


def _place_clusters(targets, indir, outdir, keep=False, mv=False,
                    force=False):
    """Create one directory per cluster below ``outdir`` and fill it.

    Args:
        targets: result of ``_collect_targets``.
        indir: the scanned root; used to compute sub-paths for ``keep``.
        outdir: root of the cluster directories.
        keep: reproduce the path of the source directory relative to
            ``indir`` between ``outdir`` and the cluster directory.
        mv: move the files instead of creating relative symbolic links.
        force: remove an already existing cluster directory first.
    """
    created = set()  # cluster directories made during this call
    for dirpath, clusters in targets:
        subdir = ''
        if keep:
            subdir = os.path.relpath(dirpath, indir)
            if subdir == os.curdir:  # files directly inside indir
                subdir = ''

        for cluster in sorted(clusters):
            dest = os.path.join(outdir, subdir, cluster)
            if os.path.exists(dest):
                # Never delete a directory filled earlier in this run: with
                # mv it already holds the only copy of the moved files.
                if force and dest not in created:
                    shutil.rmtree(dest)
                else:
                    logging.info(
                        "ignore existed directory: {}".format(dest))
                    continue

            logging.info("create new directory: {}".format(dest))
            os.makedirs(dest)
            created.add(dest)

            for path in clusters[cluster]:
                if mv:  # moving file
                    shutil.move(path, dest)
                else:  # creating symbolic links
                    link_name = os.path.join(dest, os.path.basename(path))
                    source = os.path.relpath(os.path.abspath(path), dest)
                    os.symlink(source, link_name)


def main(args):
    """Cluster the files below ``args.indir`` into ``args.outdir``.

    Args:
        args: namespace as returned by ``parse_args`` (attributes ``indir``,
            ``outdir``, ``pattern``, ``keep``, ``mv``, ``force``).

    Exits with status 1 when the pattern is not a valid regular expression,
    has no capture group, or when nothing could be clustered.
    """
    pattern = None
    if args.pattern:
        try:
            pattern = re.compile(args.pattern)
        except re.error:
            logging.error("illegal regular expression: {}".format(
                args.pattern))
            sys.exit(1)
        if pattern.groups < 1:
            logging.error(
                'GROUP (parenthese) should be in the regular expression. '
                r'Captured group will be the cluster name. '
                r'e.g. "(.+?)_\d\.fq\.gz"')
            sys.exit(1)

    targets = _collect_targets(args.indir, pattern)
    if not targets:
        if pattern is None:
            logging.error('no files with a common substring in: {}'.format(
                args.indir))
        else:
            logging.error('no files match pattern: {}'.format(args.pattern))
        sys.exit(1)

    if os.path.exists(args.outdir):
        if args.force:
            shutil.rmtree(args.outdir)
        else:
            logging.info("update existed directory: {}".format(args.outdir))

    _place_clusters(targets, args.indir, args.outdir,
                    keep=args.keep, mv=args.mv, force=args.force)


if __name__ == '__main__':
    args = parse_args()
    main(args)