In [1]:
!pip install skl2onnx
!pip install onnxruntime



In [9]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import numpy as np

In [3]:
df = pd.read_csv(r"/content/phishing_email.csv")

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text_combined"])
y = df["label"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=314)
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)
print("Accuracy_score" ,accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"[[{cm[0,0]} {cm[0,1]}]")
print(f" [{cm[1,0]} {cm[1,1]}]]")

Accuracy_score 0.9857433808553971
Confusion Matrix:
[[9576 202]
 [92 10752]]


In [6]:
pipeline = Pipeline([
    ("tfidf", vectorizer),
    ("clf", model)
])

joblib.dump(pipeline, "email_classifier_pipeline.pkl")

['email_classifier_pipeline.pkl']

In [7]:
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

joblib.dump(pipeline, "text_pipeline.pkl")

# Define the input type for the ONNX model
# Here: a single string input
initial_type = [("input", StringTensorType([None, 1]))]

# Convert the pipeline to ONNX
onnx_model = convert_sklearn(pipeline, initial_types=initial_type)

# Save the ONNX model to a file
with open("text_pipeline.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())