reverted to per-chunk calculation of prod due to memory issues
saffsd committed Mar 5, 2014
1 parent 7644078 commit 13480a8
Showing 1 changed file with 33 additions and 20 deletions.
53 changes: 33 additions & 20 deletions langid/train/NBtrain.py
@@ -121,19 +121,20 @@ def pass_tokenize(arg):
 
   return chunk_id, doc_count, len(term_freq), labels
 
-def setup_pass_fm(num_instances, chunk_offsets):
-  global __num_instances, __chunk_offsets
+def setup_pass_ptc(cm, num_instances, chunk_offsets):
+  global __cm, __num_instances, __chunk_offsets
+  __cm = cm
   __num_instances = num_instances
   __chunk_offsets = chunk_offsets
 
-def pass_fm(b_dir):
+def pass_ptc(b_dir):
   """
   Take a bucket, form a feature map, compute the count of
   each feature in each class.
   @param b_dir path to the bucket directory
   @returns (read_count, f_ids, prod)
   """
-  global __num_instances, __chunk_offsets
+  global __cm, __num_instances, __chunk_offsets
 
   terms = defaultdict(lambda : np.zeros((__num_instances,), dtype='int'))
 
@@ -147,14 +148,20 @@ def pass_fm(b_dir):
 
   f_ids, f_vs = zip(*terms.items())
   fm = np.vstack(f_vs)
-  return read_count, f_ids, fm
+  # The calculation of the term-class distribution is done per-chunk rather
+  # than globally for memory efficiency reasons.
+  prod = np.dot(fm, __cm)
+
+  return read_count, f_ids, prod
 
 def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
   """
   @param items label, path pairs
   """
   global outdir
 
+  print "learning NB parameters on {} items".format(len(items))
+
   # Generate the feature map
   nm_arr = mp.Array('i', tk_nextmove, lock=False)
 
@@ -181,14 +188,17 @@ def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
   pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs, args.line)
   with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
     pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)
 
   write_count = 0
   chunk_sizes = {}
   labels = []
-  for chunk_id, doc_count, writes, _labels in pass_tokenize_out:
+  num_chunks = len(item_chunks)
+  print "about to tokenize {} chunks".format(num_chunks)
+  for i, (chunk_id, doc_count, writes, _labels) in enumerate(pass_tokenize_out):
     write_count += writes
     chunk_sizes[chunk_id] = doc_count
     labels.extend(_labels)
+    print "processed chunk (%d/%d) [%d keys]" % (i+1, num_chunks, writes)
 
   print "wrote a total of %d keys" % write_count
 
@@ -198,27 +208,30 @@ def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
   chunk_offsets = {}
   for i in range(len(chunk_sizes)):
     chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))
+    print " offset for chunk {0} is {1}".format(i, chunk_offsets[i])
 
-  pass_fm_params = (num_instances, chunk_offsets)
-  with MapPool(args.jobs, setup_pass_fm, pass_fm_params) as f:
-    pass_fm_out = f(pass_fm, b_dirs)
+  print "have {} labels".format(len(labels))
+  cm = np.zeros((num_instances, num_langs), dtype='bool')
+  for doc_id, lang_id in enumerate(labels):
+    cm[doc_id, lang_id] = True
 
-  reads, ids, fms = zip(*pass_fm_out)
+  pass_ptc_params = (cm, num_instances, chunk_offsets)
+  with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
+    pass_ptc_out = f(pass_ptc, b_dirs)
+
+  def pass_ptc_progress():
+    for i,v in enumerate(pass_ptc_out):
+      yield v
+      print "processed chunk ({0}/{1})".format(i+1, len(b_dirs))
+
+  reads, ids, prods = zip(*pass_ptc_progress())
   read_count = sum(reads)
   print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)
 
   num_features = max( i for v in tk_output.values() for i in v) + 1
-  fm = np.zeros((num_features, num_instances), dtype=int)
-  fm[np.concatenate(ids)] = np.vstack(fms)
-
-  print "have {} labels".format(len(labels))
-  cm = np.zeros((num_instances, num_langs), dtype='bool')
-  for doc_id, lang_id in enumerate(labels):
-    cm[doc_id, lang_id] = True
+  prod = np.zeros((num_features, cm.shape[1]), dtype=int)
+  prod[np.concatenate(ids)] = np.vstack(prods)
 
   # This is where the smoothing occurs
-  prod = np.dot(fm, cm)
   ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))
 
   nb_ptc = array.array('d')
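The core of the revert: instead of assembling the full feature map fm (num_features x num_instances) in the parent process and taking a single global dot product with the class map cm, each worker now multiplies its bucket's rows of the feature map by cm right away, so only the much smaller num_features x num_langs accumulator ever exists in full. A minimal sketch of the two approaches, with made-up sizes and data (the names mirror the diff, but this is not the repository's code):

import numpy as np

num_features, num_instances, num_langs = 6, 4, 2   # toy sizes

# cm: one-hot class map, one row per document, one column per language,
# built the same way as in the diff.
labels = [0, 1, 1, 0]
cm = np.zeros((num_instances, num_langs), dtype='bool')
for doc_id, lang_id in enumerate(labels):
  cm[doc_id, lang_id] = True

# Global approach (the one being reverted): hold the entire feature map,
# O(num_features * num_instances) ints, then one dot product.
fm = np.random.randint(0, 5, size=(num_features, num_instances))
prod_global = np.dot(fm, cm)

# Per-chunk approach (this commit): each bucket holds a disjoint subset of
# feature ids, so a worker multiplies just its rows by cm and the parent
# scatter-assigns the partial products into the accumulator. (Here we slice
# the same fm for comparison; in NBtrain.py each worker builds only its own
# rows, so the full fm is never materialised in one place.)
prod = np.zeros((num_features, num_langs), dtype=int)
buckets = [np.array([0, 2, 4]), np.array([1, 3, 5])]  # hypothetical feature split
for f_ids in buckets:
  prod[f_ids] = np.dot(fm[f_ids], cm)

assert (prod == prod_global).all()

Because the buckets partition the feature ids, plain assignment into prod is enough; no accumulation across workers is needed, which is why the diff can use prod[np.concatenate(ids)] = np.vstack(prods).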
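The setup_pass_ptc / pass_ptc pair follows the standard multiprocessing initializer pattern: cm is shipped to each worker once when the pool is created and parked in module globals, rather than being pickled into every task. MapPool is langid's own wrapper around this; a stand-alone sketch of the pattern with plain multiprocessing (names below are illustrative, not the repo's):

import multiprocessing as mp

def setup_worker(shared):
  # Runs once per worker process; stash the payload in a module global.
  global __shared
  __shared = shared

def work(x):
  return x * __shared

if __name__ == '__main__':
  pool = mp.Pool(2, initializer=setup_worker, initargs=(10,))
  print(pool.map(work, [1, 2, 3]))   # [10, 20, 30]
  pool.close()
  pool.join()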

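The smoothing line left untouched at the bottom of the diff is add-one (Laplace) smoothing of the term-class distribution, computed directly in log space: ptc[t, c] = log((prod[t, c] + 1) / (prod[:, c].sum() + num_features)). A toy check with made-up counts:

import numpy as np

prod = np.array([[3, 0],
                 [1, 2]])      # hypothetical term-class counts
num_features = prod.shape[0]

ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

# Equivalent form as an explicit smoothed probability, then logged:
p = (prod + 1.0) / (prod.sum(0) + num_features)
assert np.allclose(ptc, np.log(p))
assert np.allclose(np.exp(ptc).sum(0), 1.0)   # each column is a proper distribution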
