train.py uses excessive memory (patch enclosed) #12

Closed

ralfbrown opened this issue Aug 2, 2013 · 1 comment

ralfbrown commented Aug 2, 2013
The following patch adds a bunch of {var}=None statements to let Python reuse memory that is no longer needed. This reduces memory use by more than a factor of two.
The patch also bypasses generation of domain_dist_vec when --no_domain_ig is specified, since it is never used in that case.
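The pattern throughout is the same: as soon as a large intermediate has been consumed, drop the last reference to it so CPython's reference-counting collector can free it immediately, rather than keeping it alive until the function returns. A minimal sketch of the idea (the function and names below are illustrative, not taken from train.py):

import numpy

def summarise(n):
    # Large intermediate: stays allocated until the function returns
    # unless we explicitly drop our reference to it.
    counts = numpy.arange(n, dtype=numpy.int64)

    total = int(counts.sum())   # the small result we actually need

    # Drop the only reference; the array is reclaimed right away.
    # 'del counts' would have the same effect.
    counts = None

    # ... later stages run without the array still resident ...
    return total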

*** ./train.py  2013-06-25 19:12:19.000000000 -0400
--- ../new/train.py 2013-08-01 20:45:35.867486680 -0400
***************
*** 123,126 ****
--- 123,127 ----

    items = [ (d,l,p) for (d,l,n,p) in indexer.items ]
+   indexer = None
    if args.debug:
      # output the language index
***************
*** 191,198 ****
          write_weights(doc_count, doc_count_path)
          print "wrote DF counts for all features to:", doc_count_path
- 
      if DFfeats is None:
        # Choose the first-stage features
        DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)

      if args.debug:
--- 192,199 ----
          write_weights(doc_count, doc_count_path)
          print "wrote DF counts for all features to:", doc_count_path
        if DFfeats is None:
          # Choose the first-stage features
          DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)
+       doc_count = None

      if args.debug:
***************
*** 213,222 ****
--- 214,227 ----
      DF_scanner = Scanner(DFfeats)
      b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize)
+     DF_scanner = None

      # Build vectors of domain and language distributions for use in IG calculation
+     if not args.no_domain_ig:
        domain_dist_vec = numpy.array([ domain_dist[domain_index[d]]
                 for d in sorted(domain_index, key=domain_index.get)], dtype=int)
+     domain_dist = None
      lang_dist_vec = numpy.array([ lang_dist[lang_index[l]]
              for l in sorted(lang_index.keys(), key=lang_index.get)], dtype=int)
+     lang_dist = None

      # Compute IG
***************
*** 235,241 ****
--- 240,249 ----
          write_weights(ig, weights_path)
        ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)
+       ig = None
+     DFfeats = None

      # Select features according to the LD criteria
      features_per_lang = select_LD_features(ig_vals['lang'], ig_vals.get('domain'), args.feats_per_lang, ignore_domain = args.no_domain_ig)
+     ig_vals = None
      LDfeats = reduce(set.union, map(set, features_per_lang.values()))
      print 'selected %d features' % len(LDfeats)
***************
*** 251,254 ****
--- 259,263 ----
            writer.writerow(map(repr,features_per_lang[i]))
        print 'wrote LD.perlang features to "%s"' % feature_path + '.perlang'
+     features_per_lang = None

    # Compile a scanner for the LDfeats
***************
*** 259,277 ****
        cPickle.dump((tk_nextmove, tk_output, LDfeats), f)
      print "wrote scanner to {0}".format(scanner_path)

    # Assemble the NB model
    langs = sorted(lang_index, key=lang_index.get)

    cm = generate_cm([ (l,p) for d,l,p in items], len(langs))
    paths = zip(*items)[2]

    nb_classes = langs
    nb_pc = learn_pc(cm)
    nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)

    # output the model
    output_path = os.path.join(model_dir, 'model')
    model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
!   string = base64.b64encode(bz2.compress(cPickle.dumps(model)))
    with open(output_path, 'w') as f:
      f.write(string)
--- 268,298 ----
        cPickle.dump((tk_nextmove, tk_output, LDfeats), f)
      print "wrote scanner to {0}".format(scanner_path)
+   LDfeats = None

    # Assemble the NB model
    langs = sorted(lang_index, key=lang_index.get)
+   lang_index = None

    cm = generate_cm([ (l,p) for d,l,p in items], len(langs))
    paths = zip(*items)[2]
+   items = None

    nb_classes = langs
+   langs = None
    nb_pc = learn_pc(cm)
    nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)
+   paths = None
+   cm = None

    # output the model
    output_path = os.path.join(model_dir, 'model')
    model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
!   dump = cPickle.dumps(model)
!   tk_nextmove = None
!   tk_output = None
!   nb_pc = None
!   nb_classes = None
!   model = None
!   string = base64.b64encode(bz2.compress(dump))
    with open(output_path, 'w') as f:
      f.write(string)
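Note that var = None and del var are interchangeable for this purpose: both just drop the local reference. The memory is only actually returned if nothing else still holds a reference to the object, which is why each intermediate above is cleared only after its last use.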
saffsd commented Aug 3, 2013

Thanks for that! I've integrated it into the new branch I opened for issue #11.

saffsd closed this as completed Aug 3, 2013