Skip to content

Commit

Permalink
Add method to save/load DeepcutTokenizer, close #45
Browse files Browse the repository at this point in the history
  • Loading branch information
titipata committed Nov 6, 2019
1 parent b3cceab commit 40adc26
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 1 deletion.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ print(tokenizer.vocabulary_) # {'บิน': 0, 'ได้': 1, 'ฉัน': 2,

X_test = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน']) # use built tokenizer vocabulary to transform new text
print(X_test.shape) # 2 x 6 CSR sparse matrix

tokenizer.save_model('tokenizer.pickle') # save the tokenizer to use later
```

You can load the saved tokenizer to use later

``` python
tokenizer = deepcut.load_model('tokenizer.pickle')
X_sample = tokenizer.transform(['ฉันกิน', 'ฉันไม่อยากบิน'])
print(X_sample.shape) # getting the same 2 x 6 CSR sparse matrix as X_test
```

### Custom Dictionary
Expand Down
2 changes: 1 addition & 1 deletion deepcut/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
# encoding: utf-8
from .deepcut import tokenize, DeepcutTokenizer
from .deepcut import tokenize, load_model, DeepcutTokenizer
from .train import generate_best_dataset, prepare_feature, train_model, evaluate
23 changes: 23 additions & 0 deletions deepcut/deepcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import scipy.sparse as sp
import six
import pickle

from .model import get_convo_nn2
from .stop_words import THAI_STOP_WORDS
Expand Down Expand Up @@ -90,6 +91,20 @@ def _check_stop_list(stop):
return frozenset(stop)


def load_model(file_path):
    """
    Load a saved DeepcutTokenizer from a pickle file and rebuild its model.

    Parameters
    ==========
    file_path: str, path to a file produced by the ``save_model`` method
        of DeepcutTokenizer (the pickle was saved with ``model`` stripped,
        so the Keras model is reconstructed here and its weights reloaded)

    Returns
    =======
    tokenizer: DeepcutTokenizer, ready-to-use tokenizer instance

    Notes
    =====
    ``pickle.load`` executes arbitrary code during deserialization —
    only load files from trusted sources.
    """
    # use a context manager so the file handle is closed deterministically
    with open(file_path, 'rb') as f:
        tokenizer = pickle.load(f)
    # rebuild the network that save_model stripped out
    tokenizer.model = get_convo_nn2()
    # load_weights mutates the model in place and does NOT return the model,
    # so its return value must not be assigned back to ``tokenizer.model``
    tokenizer.model.load_weights(WEIGHT_PATH)
    return tokenizer


class DeepcutTokenizer(object):
"""
Class for tokenizing given Thai text documents using deepcut library
Expand Down Expand Up @@ -325,3 +340,11 @@ def tokenize(self, text, custom_dict=None):
tokens.append(word)
word = ''
return tokens

def save_model(self, file_path):
    """
    Save this tokenizer to ``file_path`` in pickle format.

    The Keras model attached to the tokenizer is not picklable, so it is
    temporarily detached while pickling and restored afterwards — unlike
    the previous behavior, the tokenizer remains usable after saving.
    Reload with the module-level ``load_model`` function, which rebuilds
    the model and its weights.

    Parameters
    ==========
    file_path: str, destination path for the pickle file
    """
    model = self.model
    self.model = None  # detach unpicklable Keras model before dumping
    try:
        with open(file_path, 'wb') as f:
            pickle.dump(self, f)
    finally:
        # restore the model even if pickling fails, so the instance
        # is left in the same state it was in before the call
        self.model = model

0 comments on commit 40adc26

Please sign in to comment.