Skip to content

Commit

Permalink
Add method to save/load DeepcutTokenizer, close #45
Browse files Browse the repository at this point in the history
  • Loading branch information
titipata committed Nov 6, 2019
1 parent b3cceab commit 40adc26
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 1 deletion.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ print(tokenizer.vocabulary_) # {'บิน': 0, 'ได้': 1, 'ฉัน': 2,

X_test = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน']) # use built tokenizer vocabulary to transform new text
print(X_test.shape) # 2 x 6 CSR sparse matrix

tokenizer.save_model('tokenizer.pickle') # save the tokenizer to use later
```

You can load the saved tokenizer to use later

``` python
tokenizer = deepcut.load_model('tokenizer.pickle')
X_sample = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน'])
print(X_sample.shape) # getting the same 2 x 6 CSR sparse matrix as X_test
```

### Custom Dictionary
Expand Down
2 changes: 1 addition & 1 deletion deepcut/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
from .deepcut import tokenize, DeepcutTokenizer
from .deepcut import tokenize, load_model, DeepcutTokenizer
from .train import generate_best_dataset, prepare_feature, train_model, evaluate
23 changes: 23 additions & 0 deletions deepcut/deepcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import scipy.sparse as sp
import six
import pickle

from .model import get_convo_nn2
from .stop_words import THAI_STOP_WORDS
Expand Down Expand Up @@ -90,6 +91,20 @@ def _check_stop_list(stop):
return frozenset(stop)


def load_model(file_path):
    """
    Load a saved DeepcutTokenizer from a pickle file and rebuild its model.

    Parameters
    ==========
    file_path: str, path to a file produced by the ``save_model`` method
        of DeepcutTokenizer (the pickle was saved with ``model`` stripped,
        so the Keras model is reconstructed here and its weights reloaded)

    Returns
    =======
    tokenizer: DeepcutTokenizer, ready-to-use tokenizer instance

    Notes
    =====
    ``pickle.load`` executes arbitrary code during deserialization —
    only load files from trusted sources.
    """
    # use a context manager so the file handle is closed deterministically
    with open(file_path, 'rb') as f:
        tokenizer = pickle.load(f)
    # rebuild the network that save_model stripped out
    tokenizer.model = get_convo_nn2()
    # load_weights mutates the model in place and does NOT return the model,
    # so its return value must not be assigned back to ``tokenizer.model``
    tokenizer.model.load_weights(WEIGHT_PATH)
    return tokenizer


class DeepcutTokenizer(object):
"""
Class for tokenizing given Thai text documents using deepcut library
Expand Down Expand Up @@ -325,3 +340,11 @@ def tokenize(self, text, custom_dict=None):
tokens.append(word)
word = ''
return tokens

def save_model(self, file_path):
    """
    Save this tokenizer to ``file_path`` in pickle format.

    The Keras model attached to the tokenizer is not picklable, so it is
    temporarily detached while pickling and restored afterwards — unlike
    the previous behavior, the tokenizer remains usable after saving.
    Reload with the module-level ``load_model`` function, which rebuilds
    the model and its weights.

    Parameters
    ==========
    file_path: str, destination path for the pickle file
    """
    model = self.model
    self.model = None  # detach unpicklable Keras model before dumping
    try:
        with open(file_path, 'wb') as f:
            pickle.dump(self, f)
    finally:
        # restore the model even if pickling fails, so the instance
        # is left in the same state it was in before the call
        self.model = model

0 comments on commit 40adc26

Please sign in to comment.