In [1]:
import numpy
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
from skl2onnx import convert_sklearn
import skl2onnx
from skl2onnx.common.data_types import StringTensorType
import onnx

In [2]:
# Toy corpus: four short sentences laid out as a (4, 1) column of strings,
# one document per row.
corpus = numpy.array(
    [
        ["This is the first document."],
        ["This document is the second document."],
        ["And this is the third one."],
        ["Is this the first document?"],
    ]
)

In [3]:
# Three unigram-only text vectorizers that are compared in the cells below.

# Hashing variant: row normalisation is turned off (norm=None) and
# alternate_sign=False is requested, so the matrix holds plain hashed counts.
vect = HashingVectorizer(
    ngram_range=(1, 1),
    norm=None,
    alternate_sign=False,
)

# Vocabulary-based term counts.
cvect = CountVectorizer(
    ngram_range=(1, 1),
)

# TF-IDF variant with the IDF re-weighting disabled (use_idf=False).
tvect = TfidfVectorizer(
    ngram_range=(1, 1),
    use_idf=False,
)

In [4]:
# Learn the vocabulary from the flattened corpus and build the sparse
# document-term count matrix.
Y = cvect.fit_transform(corpus.ravel())

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back to the old one
# so the cell runs on both old and current releases.
if hasattr(cvect, "get_feature_names_out"):
    # .tolist() turns the returned ndarray into a plain list of str so the
    # printed form matches what the legacy accessor produced.
    feature_names = cvect.get_feature_names_out().tolist()
else:
    feature_names = cvect.get_feature_names()

print(feature_names)
print(Y)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


In [5]:
# Hash the documents (viewed as a flat 1-D array) into a sparse matrix.
# transform() is called directly here; no fit step precedes it for `vect`.
X = vect.transform(corpus.reshape(-1))
print(X)

  (0, 144749)	1.0
  (0, 170062)	1.0
  (0, 286878)	1.0
  (0, 351664)	1.0
  (0, 989160)	1.0
  (1, 144749)	1.0
  (1, 170062)	1.0
  (1, 286878)	1.0
  (1, 351664)	2.0
  (1, 544379)	1.0
  (2, 144749)	1.0
  (2, 170062)	1.0
  (2, 178949)	1.0
  (2, 180525)	1.0
  (2, 286878)	1.0
  (2, 948532)	1.0
  (3, 144749)	1.0
  (3, 170062)	1.0
  (3, 286878)	1.0
  (3, 351664)	1.0
  (3, 989160)	1.0


In [6]:
# Fit the TF-IDF vectorizer on the flat 1-D view of the corpus and print the
# resulting sparse matrix.
Z = tvect.fit_transform(corpus.reshape(-1))
print(Z)

  (0, 8)	0.4472135954999579
  (0, 3)	0.4472135954999579
  (0, 6)	0.4472135954999579
  (0, 2)	0.4472135954999579
  (0, 1)	0.4472135954999579
  (1, 8)	0.35355339059327373
  (1, 3)	0.35355339059327373
  (1, 6)	0.35355339059327373
  (1, 1)	0.7071067811865475
  (1, 5)	0.35355339059327373
  (2, 8)	0.4082482904638631
  (2, 3)	0.4082482904638631
  (2, 6)	0.4082482904638631
  (2, 0)	0.4082482904638631
  (2, 7)	0.4082482904638631
  (2, 4)	0.4082482904638631
  (3, 8)	0.4472135954999579
  (3, 3)	0.4472135954999579
  (3, 6)	0.4472135954999579
  (3, 2)	0.4472135954999579
  (3, 1)	0.4472135954999579


In [7]:
# Convert the fitted CountVectorizer into an ONNX model whose single input,
# named "input", is declared as a string tensor of shape [1].
model_onnx_c = convert_sklearn(
    cvect,
    name="CountVectorizer",
    initial_types=[("input", StringTensorType([1]))],
)
# Save the model to count_vec.onnx. This call is the cell's last expression,
# so whatever it returns is echoed as the cell output.
skl2onnx.helpers.onnx_helper.save_onnx_model(model_onnx_c, filename="count_vec.onnx")

  op_type, domain))


b'\x08\x07\x12\x08skl2onnx\x1a\x051.6.0"\x07ai.onnx(\x002\x00:\xa1\x06\n4\n\x05input\n\x0cshape_tensor\x12\tflattened\x1a\x07Reshape"\x07Reshape:\x00\nw\n\tflattened\x12\nnormalized\x1a\x10StringNormalizer"\x10StringNormalizer*\x1e\n\x12case_change_action"\x05LOWER\xa0\x01\x03*\x18\n\x11is_case_sensitive\x18\x00\xa0\x01\x02:\x00\n\x8d\x01\n\nnormalized\x12\ttokenized\x1a\tTokenizer"\tTokenizer*\x0b\n\x04mark\x18\x00\xa0\x01\x02*\x11\n\nmincharnum\x18\x01\xa0\x01\x02*\x11\n\tpad_value"\x01#\xa0\x01\x03*\x1c\n\x08tokenexp"\r[a-zA-Z0-9_]+\xa0\x01\x03:\rcom.microsoft\n+\n\ttokenized\x12\nflattened1\x1a\x07Flatten"\x07Flatten:\x00\n\xd2\x02\n\nflattened1\x12\x08variable\x1a\x0fTfIdfVectorizer"\x0fTfIdfVectorizer*\x16\n\x0fmax_gram_length\x18\x01\xa0\x01\x02*\x15\n\x0emax_skip_count\x18\x00\xa0\x01\x02*\x16\n\x0fmin_gram_length\x18\x01\xa0\x01\x02*\r\n\x04mode"\x02TF\xa0\x01\x03*\x13\n\x0cngram_counts@\x00\xa0\x01\x07*$\n\rngram_indexes@\x00@\x01@\x02@\x03@\x04@\x05@\x06@\x07@\x08\xa0\x01\x0

In [8]:
# Convert the HashingVectorizer into an ONNX model with a single string
# tensor input named "input", declared with shape [1].
model_onnx = convert_sklearn(
    vect,
    name="HashingVectorizer",
    initial_types=[("input", StringTensorType([1]))],
)
# NOTE(review): only this export overrides the IR version (forced to 6);
# presumably for compatibility with an older runtime -- confirm.
model_onnx.ir_version = 6
# Save the model to hashing_vec.onnx. This call is the cell's last
# expression, so whatever it returns is echoed as the cell output.
skl2onnx.helpers.onnx_helper.save_onnx_model(model_onnx, filename="hashing_vec.onnx")

  op_type, domain))


b'\x08\x06\x12\x08skl2onnx\x1a\x051.6.0"\x07ai.onnx(\x002\x00:\xb1\x04\n4\n\x05input\n\x0cshape_tensor\x12\tflattened\x1a\x07Reshape"\x07Reshape:\x00\nw\n\tflattened\x12\nnormalized\x1a\x10StringNormalizer"\x10StringNormalizer*\x1e\n\x12case_change_action"\x05LOWER\xa0\x01\x03*\x18\n\x11is_case_sensitive\x18\x00\xa0\x01\x02:\x00\n\x8d\x01\n\nnormalized\x12\ttokenized\x1a\tTokenizer"\tTokenizer*\x0b\n\x04mark\x18\x00\xa0\x01\x02*\x11\n\nmincharnum\x18\x01\xa0\x01\x02*\x11\n\tpad_value"\x01#\xa0\x01\x03*\x1c\n\x08tokenexp"\r[a-zA-Z0-9_]+\xa0\x01\x03:\rcom.microsoft\n+\n\ttokenized\x12\nflattened1\x1a\x07Flatten"\x07Flatten:\x00\n_\n\nflattened1\x12\x08variable\x1a\x11HashingVectorizer"\x11HashingVectorizer*\x12\n\nn_features\x18\x88\'\xa0\x01\x02:\rcom.microsoft\x12\x11HashingVectorizer*\x1e\x08\x01\x10\x07:\n\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01B\x0cshape_tensorZ\x13\n\x05input\x12\n\n\x08\x08\x08\x12\x04\n\x02\x08\x01b\x1a\n\x08variable\x12\x0e\n\x0c\x08\x01\x12\x08\n\x00\n\x04\x08\

In [9]:
# Convert the fitted TfidfVectorizer into an ONNX model whose single input,
# named "input", is declared as a string tensor of shape [1].
model_onnx_t = convert_sklearn(
    tvect,
    name="TfidfVectorizer",
    initial_types=[("input", StringTensorType([1]))],
)
# Save the model to tfidf_vec.onnx. This call is the cell's last expression,
# so whatever it returns is echoed as the cell output.
skl2onnx.helpers.onnx_helper.save_onnx_model(model_onnx_t, filename="tfidf_vec.onnx")

b'\x08\x07\x12\x08skl2onnx\x1a\x051.6.0"\x07ai.onnx(\x002\x00:\xae\x07\n.\n\x0ctfidf_output\x12\x08variable\x1a\x08Identity"\x08Identity:\x00\n4\n\x05input\n\x0cshape_tensor\x12\tflattened\x1a\x07Reshape"\x07Reshape:\x00\nw\n\tflattened\x12\nnormalized\x1a\x10StringNormalizer"\x10StringNormalizer*\x1e\n\x12case_change_action"\x05LOWER\xa0\x01\x03*\x18\n\x11is_case_sensitive\x18\x00\xa0\x01\x02:\x00\n\x8d\x01\n\nnormalized\x12\ttokenized\x1a\tTokenizer"\tTokenizer*\x0b\n\x04mark\x18\x00\xa0\x01\x02*\x11\n\nmincharnum\x18\x01\xa0\x01\x02*\x11\n\tpad_value"\x01#\xa0\x01\x03*\x1c\n\x08tokenexp"\r[a-zA-Z0-9_]+\xa0\x01\x03:\rcom.microsoft\n+\n\ttokenized\x12\nflattened1\x1a\x07Flatten"\x07Flatten:\x00\n\xda\x02\n\nflattened1\x12\x10count_vec_output\x1a\x0fTfIdfVectorizer"\x0fTfIdfVectorizer*\x16\n\x0fmax_gram_length\x18\x01\xa0\x01\x02*\x15\n\x0emax_skip_count\x18\x00\xa0\x01\x02*\x16\n\x0fmin_gram_length\x18\x01\xa0\x01\x02*\r\n\x04mode"\x02TF\xa0\x01\x03*\x13\n\x0cngram_counts@\x00\xa0\x01

In [10]:
# Print the text form of the three converted models, one after another:
# count vectorizer, TF-IDF vectorizer, then hashing vectorizer.
print(model_onnx_c, model_onnx_t, model_onnx, sep="\n")

ir_version: 7
producer_name: "skl2onnx"
producer_version: "1.6.0"
domain: "ai.onnx"
model_version: 0
doc_string: ""
graph {
  node {
    input: "input"
    input: "shape_tensor"
    output: "flattened"
    name: "Reshape"
    op_type: "Reshape"
    domain: ""
  }
  node {
    input: "flattened"
    output: "normalized"
    name: "StringNormalizer"
    op_type: "StringNormalizer"
    attribute {
      name: "case_change_action"
      s: "LOWER"
      type: STRING
    }
    attribute {
      name: "is_case_sensitive"
      i: 0
      type: INT
    }
    domain: ""
  }
  node {
    input: "normalized"
    output: "tokenized"
    name: "Tokenizer"
    op_type: "Tokenizer"
    attribute {
      name: "mark"
      i: 0
      type: INT
    }
    attribute {
      name: "mincharnum"
      i: 1
      type: INT
    }
    attribute {
      name: "pad_value"
      s: "#"
      type: STRING
    }
    attribute {
      name: "tokenexp"
      s: "[a-zA-Z0-9_]+"
      type: STRING
    }
    domain: "c

In [11]:
# Load an ONNX model from a file in the current working directory.
# NOTE(review): no cell visible here creates squeezenet.onnx -- confirm the
# file is supplied alongside the notebook, otherwise this cell fails on a
# fresh run.
onnx_model = onnx.load('squeezenet.onnx')

In [12]:
# Show the IR version stored in the loaded model.
print(onnx_model.ir_version)

3
