
made it possible for BLweight to compute weights for multiple language pairs at once
1 parent 00f8d33 commit a26dcd6e4f29dc6140f02d141eaf90e91dc6fb42 @saffsd committed Mar 7, 2013
Showing with 54 additions and 27 deletions.
  1. +54 −27 langid/train/BLweight.py
langid/train/BLweight.py
@@ -21,17 +21,16 @@
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
-  parser.add_argument("-o","--output", metavar="PATH", help = "write weights to PATH")
+  parser.add_argument("-o","--output", metavar="DIR", help = "write weights to DIR")
   parser.add_argument('-f','--features', metavar="FILE", help = 'only output features from FILE')
   parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets")
   parser.add_argument("-j","--jobs", type=int, metavar='N', help="spawn N processes (set to 1 for no parallelization)")
   parser.add_argument("-m","--model", help="save output to MODEL_DIR", metavar="MODEL_DIR")
   parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS)
   parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE)
   parser.add_argument("--no_norm", default=False, action="store_true", help="do not normalize difference in p(t|C) by sum p(t|C)")
-  parser.add_argument("lang1", metavar='LANG', help="first language")
-  parser.add_argument("lang2", metavar='LANG', help="second language")
   parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR")
+  parser.add_argument("pairs", metavar='LANG_PAIR', nargs="*", help="language pairs to compute BL weights for")
   args = parser.parse_args()

   # Work out where our model directory is
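
The new LANG_PAIR positional replaces the fixed lang1/lang2 arguments. A minimal standalone sketch of how it behaves (the language codes and the argv list here are illustrative, not part of the commit): argparse collects zero or more comma-separated pairs, and each is validated by tuple unpacking, mirroring the loop added in the second hunk below.

  import argparse

  parser = argparse.ArgumentParser()
  parser.add_argument("corpus", metavar="CORPUS_DIR")
  parser.add_argument("pairs", metavar="LANG_PAIR", nargs="*",
                      help="language pairs to compute BL weights for")

  # hypothetical invocation: BLweight.py corpus/ en,de fr,es
  args = parser.parse_args(["corpus/", "en,de", "fr,es"])

  pairs = []
  for p in args.pairs:
    try:
      lang1, lang2 = p.split(',')   # "en,de" -> ("en", "de")
    except ValueError:
      # wrong number of comma-separated fields
      parser.error("{0} is not a lang-pair".format(p))
    pairs.append((lang1, lang2))

  print(pairs)   # [('en', 'de'), ('fr', 'es')]

With this change the tool would presumably be run as e.g. python langid/train/BLweight.py -m MODEL_DIR CORPUS_DIR en,it fr,es (directory names are placeholders), producing one weight file per pair from a single pass over the corpus.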
@@ -54,54 +53,82 @@ def m_path(name):
   else:
     raise ValueError("no suitable feature list")
-  # Where to do output
-  if args.output:
-    weights_path = args.output
-  else:
-    if args.no_norm:
-      weights_path = m_path('BLfeats.no_norm.{0}.{1}'.format(args.lang1, args.lang2))
-    else:
-      weights_path = m_path('BLfeats.{0}.{1}'.format(args.lang1, args.lang2))
-
   # Where temp files go
   if args.temp:
     buckets_dir = args.temp
   else:
     buckets_dir = m_path('buckets')
   makedir(buckets_dir)
+  all_langs = set()
+  pairs = []
+  for p in args.pairs:
+    try:
+      lang1, lang2 = p.split(',')
+    except ValueError:
+      # Did not unpack to two values
+      parser.error("{0} is not a lang-pair".format(p))
+    all_langs.add(lang1)
+    all_langs.add(lang2)
+    pairs.append((lang1, lang2))
+
+  if args.output:
+    makedir(args.output)
+    out_dir = args.output
+  else:
+    out_dir = model_dir
+
+  langs = sorted(all_langs)
+
   # display paths
-  print "languages: {0} {1}".format(args.lang1, args.lang2)
+  print "languages({1}): {0}".format(langs, len(langs))
   print "model path:", model_dir
   print "feature path:", feat_path
-  print "weights path:", weights_path
+  print "output path:", out_dir
   print "temp (buckets) path:", buckets_dir
   feats = read_features(feat_path)
-  indexer = CorpusIndexer(args.corpus, langs = [args.lang1, args.lang2])
+  indexer = CorpusIndexer(args.corpus, langs = langs)
   items = [ (d,l,p) for (d,l,n,p) in indexer.items ]
   if len(items) == 0:
     raise ValueError("found no files!")
   print "will process {0} features across {1} paths".format(len(feats), len(items))
-  langs = [args.lang1, args.lang2]
-  cm = generate_cm([ (l,p) for d,l,p in items], len(langs))
-  paths = zip(*items)[2]
-
+  # produce a scanner over all the features
+  tk_nextmove, tk_output = build_scanner(feats)
-  nb_classes = langs
-  nb_pc = learn_pc(cm)
+  # Generate a class map over all the languages we are dealing with
+  cm = generate_cm([ (l,p) for d,l,p in items], len(langs))
-  tk_nextmove, tk_output = build_scanner(feats)
+  # Compute P(t|C)
+  print "learning P(t|C)"
+  paths = zip(*items)[2]
   nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)
-  nb_ptc = np.array(nb_ptc).reshape(len(feats), len(nb_pc))
+  nb_ptc = np.array(nb_ptc).reshape(len(feats), len(langs))
   # Normalize to 1 on the term axis
+  print "renormalizing P(t|C)"
   for i in range(nb_ptc.shape[1]):
-    nb_ptc[:,i] = (1/np.exp(nb_ptc[:,i][None,:] - nb_ptc[:,i][:,None]).sum(1))
+    # had to de-vectorize this due to memory consumption
+    newval = np.empty_like(nb_ptc[:,i])
+    for j in range(newval.shape[0]):
+      newval[j] = (1/np.exp(nb_ptc[:,i] - nb_ptc[j,i]).sum())
+    nb_ptc[:,i] = newval
+    assert (1.0 - newval.sum()) < 0.0001
+
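
A note on the renormalization loop just above: nb_ptc holds log P(t|C), and each column is mapped to proper probabilities via the identity 1/sum_k exp(x_k - x_j) = exp(x_j)/sum_k exp(x_k), i.e. a softmax computed from pairwise differences. The replaced vectorized form materialized the full N-by-N difference matrix, which is prohibitive when N is the feature count; the element-wise loop needs only O(N) working memory per column. A standalone sketch with toy values (not from the commit):

  import numpy as np

  def renorm_column(logp):
    # logp: one column of nb_ptc, i.e. unnormalized log P(t|C) for one class.
    # out[j] = 1/sum(exp(logp - logp[j])) == exp(logp[j])/sum(exp(logp)),
    # computed without building the N x N pairwise-difference matrix.
    out = np.empty_like(logp)
    for j in range(logp.shape[0]):
      out[j] = 1.0 / np.exp(logp - logp[j]).sum()
    return out

  logp = np.log(np.array([0.2, 0.3, 0.5]))
  p = renorm_column(logp)
  print(p)                          # ~[0.2 0.3 0.5]
  assert abs(1.0 - p.sum()) < 0.0001

The sketch asserts with abs(); the committed assert (1.0 - newval.sum()) < 0.0001 only trips when the column sums to less than 1.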
+ print "doing per-pair output"
+ for lang1, lang2 in pairs:
+ # Where to do output
+ if args.no_norm:
+ weights_path = os.path.join(out_dir, ('BLfeats.no_norm.{0}.{1}'.format(lang1, lang2)))
+ else:
+ weights_path = os.path.join(out_dir, ('BLfeats.{0}.{1}'.format(lang1, lang2)))
+
+ i1 = indexer.lang_index[lang1]
+ i2 = indexer.lang_index[lang2]
- w = dict(zip(feats, np.abs((nb_ptc[:,0] - nb_ptc[:,1]) / (nb_ptc.sum(1) if not args.no_norm else 1))))
- write_weights(w, weights_path)
- print "wrote weights to {0}".format(weights_path)
+ w = dict(zip(feats, np.abs((nb_ptc[:,i1] - nb_ptc[:,i2]) / (nb_ptc.sum(1) if not args.no_norm else 1))))
+ write_weights(w, weights_path)
+ print "wrote weights to {0}".format(weights_path)
