train.py uses excessive memory (patch enclosed) #12

Closed

ralfbrown opened this issue Aug 2, 2013 · 1 comment

ralfbrown commented Aug 2, 2013
The following patch adds a bunch of {var}=None statements to let Python reuse memory that is no longer needed. This reduces memory use by more than a factor of two.
The patch also bypasses generation of domain_dist_vec when --no_domain_ig is specified, since it is never used in that case.
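The pattern throughout is the same: as soon as a large intermediate has been consumed, drop the last reference to it so CPython's reference-counting collector can free it immediately, rather than keeping it alive until the function returns. A minimal sketch of the idea (the function and names below are illustrative, not taken from train.py):

import numpy

def summarise(n):
    # Large intermediate: stays allocated until the function returns
    # unless we explicitly drop our reference to it.
    counts = numpy.arange(n, dtype=numpy.int64)

    total = int(counts.sum())   # the small result we actually need

    # Drop the only reference; the array is reclaimed right away.
    # 'del counts' would have the same effect.
    counts = None

    # ... later stages run without the array still resident ...
    return total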

*** ./train.py  2013-06-25 19:12:19.000000000 -0400
--- ../new/train.py 2013-08-01 20:45:35.867486680 -0400
***************
*** 123,126 ****
--- 123,127 ----

    items = [ (d,l,p) for (d,l,n,p) in indexer.items ]
+   indexer = None
    if args.debug:
      # output the language index
***************
*** 191,198 ****
          write_weights(doc_count, doc_count_path)
          print "wrote DF counts for all features to:", doc_count_path
- 
      if DFfeats is None:
        # Choose the first-stage features
        DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)

      if args.debug:
--- 192,199 ----
          write_weights(doc_count, doc_count_path)
          print "wrote DF counts for all features to:", doc_count_path
        if DFfeats is None:
          # Choose the first-stage features
          DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)
+       doc_count = None

      if args.debug:
***************
*** 213,222 ****
--- 214,227 ----
      DF_scanner = Scanner(DFfeats)
      b_dirs = build_index(items, DF_scanner, buckets_dir, args.buckets, args.jobs, args.chunksize)
+     DF_scanner = None

      # Build vectors of domain and language distributions for use in IG calculation
+     if not args.no_domain_ig:
        domain_dist_vec = numpy.array([ domain_dist[domain_index[d]]
                 for d in sorted(domain_index, key=domain_index.get)], dtype=int)
+     domain_dist = None
      lang_dist_vec = numpy.array([ lang_dist[lang_index[l]]
              for l in sorted(lang_index.keys(), key=lang_index.get)], dtype=int)
+     lang_dist = None

      # Compute IG
***************
*** 235,241 ****
--- 240,249 ----
          write_weights(ig, weights_path)
        ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)
+       ig = None
+     DFfeats = None

      # Select features according to the LD criteria
      features_per_lang = select_LD_features(ig_vals['lang'], ig_vals.get('domain'), args.feats_per_lang, ignore_domain = args.no_domain_ig)
+     ig_vals = None
      LDfeats = reduce(set.union, map(set, features_per_lang.values()))
      print 'selected %d features' % len(LDfeats)
***************
*** 251,254 ****
--- 259,263 ----
            writer.writerow(map(repr,features_per_lang[i]))
        print 'wrote LD.perlang features to "%s"' % feature_path + '.perlang'
+     features_per_lang = None

    # Compile a scanner for the LDfeats
***************
*** 259,277 ****
        cPickle.dump((tk_nextmove, tk_output, LDfeats), f)
      print "wrote scanner to {0}".format(scanner_path)

    # Assemble the NB model
    langs = sorted(lang_index, key=lang_index.get)

    cm = generate_cm([ (l,p) for d,l,p in items], len(langs))
    paths = zip(*items)[2]

    nb_classes = langs
    nb_pc = learn_pc(cm)
    nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)

    # output the model
    output_path = os.path.join(model_dir, 'model')
    model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
!   string = base64.b64encode(bz2.compress(cPickle.dumps(model)))
    with open(output_path, 'w') as f:
      f.write(string)
--- 268,298 ----
        cPickle.dump((tk_nextmove, tk_output, LDfeats), f)
      print "wrote scanner to {0}".format(scanner_path)
+   LDfeats = None

    # Assemble the NB model
    langs = sorted(lang_index, key=lang_index.get)
+   lang_index = None

    cm = generate_cm([ (l,p) for d,l,p in items], len(langs))
    paths = zip(*items)[2]
+   items = None

    nb_classes = langs
+   langs = None
    nb_pc = learn_pc(cm)
    nb_ptc = learn_ptc(paths, tk_nextmove, tk_output, cm, buckets_dir, args)
+   paths = None
+   cm = None

    # output the model
    output_path = os.path.join(model_dir, 'model')
    model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
!   dump = cPickle.dumps(model)
!   tk_nextmove = None
!   tk_output = None
!   nb_pc = None
!   nb_classes = None
!   model = None
!   string = base64.b64encode(bz2.compress(dump))
    with open(output_path, 'w') as f:
      f.write(string)
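Note that var = None and del var are interchangeable for this purpose: both just drop the local reference. The memory is only actually returned if nothing else still holds a reference to the object, which is why each intermediate above is cleared only after its last use.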
saffsd commented Aug 3, 2013

Thanks for that! I've integrated it into the new branch I opened for issue #11.

saffsd closed this as completed Aug 3, 2013