# Converting a scikit-learn model into ONNX format

### Import the necessary packages.

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from skl2onnx.convert import convert_sklearn
from skl2onnx.common.data_types import StringTensorType
from onnxruntime import InferenceSession
from onnxmltools.utils import save_model

### 20 newsgroup dataset
We use the 20 newsgroups dataset in this experiment. It comprises around 18,000 newsgroup posts on 20 topics, split into two subsets: one for training and one for testing. We pick 2 of the 20 categories for our experiment.

In [2]:
# Restrict the corpus to two newsgroups, then load the train and test
# subsets for those categories (train first, test second).
cats = ['rec.motorcycles', 'sci.electronics']
training_data, test_data = (
    fetch_20newsgroups(subset=split, categories=cats)
    for split in ('train', 'test')
)

In [3]:
# Wrap the raw documents in NumPy arrays; the targets are used exactly as
# the dataset loader returned them.
X_train = np.array(training_data.data)
y_train = training_data.target

X_test = np.array(test_data.data)
y_test = test_data.target

### Scikit Pipeline
We create a scikit pipeline, which featurises the text using CountVectorizer() and then uses an MLPClassifier() to train the model.

In [7]:
# Two-stage pipeline: CountVectorizer turns each document into token counts,
# which are then fed to an MLPClassifier created with a fixed random_state.
pipeline_steps = [
    ('countvec', CountVectorizer()),
    ('predictor', MLPClassifier(random_state=42)),
]
model = Pipeline(pipeline_steps)

# Keep fit() as the last expression so its return value is shown as the
# cell output.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(100,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_iter=200,
      

### Model accuracy
Calculate the accuracy of our model on the test set.

In [8]:
# Accuracy = fraction of test documents whose predicted label equals the
# true label.
test_predictions = model.predict(X_test)
np.mean(test_predictions == y_test)

0.9835651074589128

### Conversion to ONNX
Convert the scikit model into ONNX format using convert_sklearn(), then save the ONNX model.

In [9]:
# Describe the model's single input: a string tensor named 'input' whose
# one dimension is declared as None.
initial_types = [('input', StringTensorType([None]))]
model_onnx = convert_sklearn(model, 'newsgroup', initial_types)

# Persist the converted model so it can be loaded by onnxruntime.
save_model(model_onnx, 'news.onnx')

### Load the onnx model
For inference, we first load the model as shown below.

In [10]:
# Open an onnxruntime session on the model file written during conversion.
onnx_path = 'news.onnx'
sess = InferenceSession(onnx_path)

### Prediction using onnxruntime
In order to run prediction on a test set, we call run() passing the test set like this:

In [11]:
# The feed maps the input name chosen at conversion time ('input') to the
# test documents. No output names are selected (None), so run() returns
# all of the model's outputs.
feed = {'input': X_test}
res = sess.run(None, input_feed=feed)

The above function call returns two outputs: the predicted label (output 0) and the class probability scores (output 1).

## Comparing results of onnx and Scikit models
Here, we compare the labels returned by onnxruntime with the labels predicted by scikit.

In [12]:
# Fraction of test documents on which the onnxruntime labels (output 0)
# agree with the labels predicted by the scikit-learn pipeline.
onnx_labels = res[0]
sklearn_labels = model.predict(X_test)
np.mean(onnx_labels == sklearn_labels)

1.0

We can also match the predicted probability scores of the two models.

In [13]:
# Read items 0 and 1 from every entry of the second onnxruntime output
# (presumably one score per class label -- confirm against the converter's
# output format) so the scores line up with predict_proba's two columns.
onnx_probs = [[scores[0], scores[1]] for scores in res[1]]
sklearn_probs = model.predict_proba(X_test)

# Fraction of probability values that match within the absolute tolerance.
np.mean(np.isclose(onnx_probs, sklearn_probs, atol=1e-5))

1.0