In [None]:
import json

# Location of the raw dataset; the file is decoded as UTF-8 and parsed as JSON.
file_path = 'naseza.json'
with open(file_path, encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

In [None]:
from sklearn.model_selection import train_test_split
from shekar.preprocessing import StopWordRemover

# Stop-word filter applied to every text before it reaches the vectoriser.
stop_word_remover = StopWordRemover()

# Binary target: 1 when 'label' is non-empty and its first item is 'Offensive', else 0.
labels = []
for entry in data:
    entry_label = entry['label']
    is_offensive = bool(entry_label) and entry_label[0] == 'Offensive'
    labels.append(1 if is_offensive else 0)

# Model inputs: each entry's text with stop words removed.
texts = []
for entry in data:
    texts.append(stop_word_remover(entry['text']))

# Hold out 15% of the samples for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    texts,
    labels,
    test_size=0.15,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report


# Character-level TF-IDF (2- to 6-character n-grams) feeding a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(2,6))),
    ('clf', LogisticRegression())
])

# Fit on the training split, then print the held-out accuracy explicitly:
# a bare expression that is not the last line of a cell is never displayed.
pipeline.fit(train_x, train_y)
test_accuracy = pipeline.score(test_x, test_y)
print(f"Test accuracy: {test_accuracy:.4f}")

# Predictions on the test split drive both the confusion matrix and the per-class report.
y_pred = pipeline.predict(test_x)
ConfusionMatrixDisplay.from_predictions(test_y, y_pred)

print(classification_report(test_y, y_pred))

# Smoke test on one hand-written example, preprocessed the same way as the training texts.
sample_text = "خیلی بی‌شرفی"
sample_text = stop_word_remover(sample_text)
print(pipeline.predict([sample_text]))
print(pipeline.predict_proba([sample_text]))

#### Export to ONNX 

In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

# Declared graph input: a string tensor of shape [None, 1] named "input_text".
initial_types = [("input_text", StringTensorType([None, 1]))]
model_name = "tfidf_logistic_offensive"

# zipmap disabled so that probabilities come back as an ndarray.
conversion_options = {id(pipeline): {"zipmap": False}}
onx = convert_sklearn(
    pipeline,
    initial_types=initial_types,
    options=conversion_options,
    target_opset=17,
)

# Persist the serialized model next to the notebook as "<model_name>.onnx".
with open(f"{model_name}.onnx", "wb") as onnx_file:
    onnx_file.write(onx.SerializeToString())

In [None]:
import numpy as np, onnxruntime as ort

# Human-readable names for the two class ids.
class2label = {0: "Neutral", 1: "Offensive"}

# Reference output from the in-memory scikit-learn pipeline.
sample = stop_word_remover("خیلی بی‌شرفی")
print("sklearn:", pipeline.predict([sample]), pipeline.predict_proba([sample]))

# Load the exported model on CPU and look up its input/output tensor names.
sess = ort.InferenceSession(f"{model_name}.onnx", providers=["CPUExecutionProvider"])
in_name = sess.get_inputs()[0].name
out_names = [output.name for output in sess.get_outputs()]

# Single sample wrapped as a 1x1 object array.
arr = np.array([[sample]], dtype=object)
feed = {in_name: arr}
onnx_label, onnx_proba = sess.run(out_names, feed)

# If the second output is not 2-D, treat the outputs as swapped and exchange them.
if not onnx_proba.ndim == 2:
    onnx_label, onnx_proba = onnx_proba, onnx_label

# Report the predicted class name and the probability assigned to that class.
predicted = onnx_label[0]
print("onnx:", class2label[predicted], onnx_proba[0][predicted])
