In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
from lightgbm import LGBMClassifier

In [4]:
# Load the preprocessed corpus. Later cells read two columns from it:
# 'code' (used as the model input) and 'language' (used as the target).
dataset = pd.read_csv('xl-preprocessed.csv')

In [6]:
# Text-classification pipeline: TF-IDF features feeding a LightGBM classifier.
pipe = Pipeline(
    [
        # The pattern is a raw string so that `\S` reaches the regex engine
        # untouched; in a plain string literal `\S` is an invalid escape
        # sequence and triggers a Deprecation/SyntaxWarning on modern Python.
        # `\S+` makes every run of non-whitespace characters a single token.
        # The vocabulary is capped at 60000 terms.
        ('vectorizer', TfidfVectorizer(token_pattern=r'\S+', max_features=60000)),
        # Fixed seed so repeated fits are reproducible.
        ('clf', LGBMClassifier(random_state=42)),
    ]
)

In [7]:
# Fit the whole pipeline on the full dataset: the 'code' column is the input
# text and the 'language' column is the label.
# NOTE(review): no train/test split is made in the visible cells even though
# train_test_split is imported, so there is no held-out evaluation here.
pipe.fit(dataset['code'], dataset['language'])

In [8]:
# Smoke test: classify a single one-line snippet with the fitted pipeline.
pipe.predict(['#include <iostream>'])

array(['C++'], dtype=object)

In [9]:
# Size of the learned TF-IDF vocabulary (looked up by step name rather than position).
len(pipe.named_steps['vectorizer'].get_feature_names_out())

60000

In [10]:
import joblib

# Persist the fitted pipeline so it can be reloaded from Python later.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
with open('xl-dump.joblib', mode='wb') as dump_file:
    joblib.dump(pipe, dump_file)

In [11]:
import lightgbm
import onnxmltools
import skl2onnx
import onnx
import sklearn
import matplotlib.pyplot as plt
import os
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
from onnxruntime.capi.onnxruntime_pybind11_state import Fail as OrtFail
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
import onnxmltools.convert.common.data_types
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

In [12]:
# Register the onnxmltools LightGBM converter with skl2onnx under the alias
# 'LightGbmLGBMClassifier', so that convert_sklearn can translate the
# LGBMClassifier step of the pipeline.
# The options dict lists the values accepted for each converter option.
# NOTE(review): presumably 'zipmap' controls whether probabilities come back as
# per-sample dicts and 'nocl' whether class labels are embedded — confirm
# against the skl2onnx documentation.
update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [13]:
# Convert the fitted pipeline into an ONNX graph named 'pipeline_lightgbm'.
# The graph has a single string input called 'text' with shape [None, 1]
# (any number of rows, one snippet per row). Opsets are pinned for both the
# default ONNX domain and the ai.onnx.ml domain.
model_onnx = convert_sklearn(
    pipe,
    name='pipeline_lightgbm',
    initial_types=[('text', StringTensorType([None, 1]))],
    target_opset={'': 12, 'ai.onnx.ml': 2},
)

In [None]:
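# Alternative (not executed): the same conversion through skl2onnx.to_onnx.
# Needs `from skl2onnx import to_onnx`; the fitted pipeline is named `pipe`.
# model_onnx = to_onnx(
#     pipe,
#     initial_types=[('text', StringTensorType([None, 1]))],
#     target_opset={'': 12, 'ai.onnx.ml': 2},
# )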

In [14]:
# Write the converted model to disk in its serialized (protobuf bytes) form.
with open("base-xl.onnx", mode="wb") as onnx_file:
    onnx_file.write(model_onnx.SerializeToString())

In [15]:
# Three short probe snippets used to sanity-check the exported model.
python_snippet = 'import pathlib def foo(bar): return bar'
cpp_snippet = '#include <iostream>'
c_snippet = 'int void * foo(int a)'

corpus = [python_snippet, cpp_snippet, c_snippet]

In [16]:
# Load the converted model into ONNX Runtime directly from its serialized bytes.
# The execution provider is named explicitly: onnxruntime builds that ship
# more than the CPU provider require it, and the CPU provider is always present.
sess = rt.InferenceSession(
    model_onnx.SerializeToString(),
    providers=['CPUExecutionProvider'],
)
# The graph input 'text' was declared with shape [None, 1], so the snippets are
# laid out as a single column with one snippet per row.
inputs = {"text": np.array(corpus).reshape(-1, 1)}
# Passing None as the output-name list requests every model output.
pred_onx = sess.run(None, inputs)

In [17]:
# Show the raw ONNX Runtime outputs: first an array of predicted labels, then
# a list with one {class name: probability} dict per input snippet.
pred_onx

[array(['Python', 'C++', 'C'], dtype=object),
 [{'Assembly': 0.002173589775338769,
   'Batchfile': 0.004575828090310097,
   'C': 0.0008158839191310108,
   'C#': 5.953173058514949e-06,
   'C++': 0.00041115088970400393,
   'CMake': 5.148449417902157e-05,
   'CSS': 9.055629561771639e-06,
   'Dockerfile': 1.5214744735203567e-06,
   'FORTRAN': 0.005310575012117624,
   'GO': 0.00027702577062882483,
   'HTML': 0.001340474933385849,
   'Haskell': 0.0012890968937426805,
   'Java': 5.963276635156944e-06,
   'JavaScript': 0.003371991915628314,
   'Julia': 0.0018368088640272617,
   'Lua': 0.020403845235705376,
   'Makefile': 5.743997826357372e-05,
   'PHP': 0.0002973030786961317,
   'Perl': 0.00010332070087315515,
   'PowerShell': 0.0006017646519467235,
   'Python': 0.9466577768325806,
   'Ruby': 0.00033437329693697393,
   'Rust': 7.320353324757889e-05,
   'SQL': 0.00038299363222904503,
   'Scala': 0.003946422133594751,
   'TeX': 0.0017047355649992824,
   'TypeScript': 0.003960451111197472},
  {'A

In [33]:
# Record the installed onnxmltools version for reproducibility.
!pip list | grep 'onnxmltools'

onnxmltools                   1.11.2
