In [28]:
import onnxruntime as ort
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType


In [29]:
# Read the parquet file and discard the columns that are dropped before modelling
df = pd.read_parquet(
    "Data/Parquet/orion-pipeline-2024-08-11.00.parquet"
).drop(columns=['SourceIP', 'TCP', 'ICMP', 'Country'])

# Separate the target label from the predictor columns
y = df['EventType']
X = df.drop(columns='EventType')

# Partition the predictors by dtype: numeric columns vs. everything else
numeric_features = list(X.select_dtypes(include='number').columns)
categorical_features = list(X.select_dtypes(exclude='number').columns)

In [30]:
# Preprocessing pipelines
# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode into a dense matrix; categories unseen during fit encode as all zeros.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric columns: fill gaps with the column median, then standardize.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own sub-pipeline (numeric output first).
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features),
])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate classifiers, keyed by the name used for each exported .onnx file
models = dict(
    GaussianNB=GaussianNB(),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
)

# Create ONNX initial types
# Declare one [batch, 1] ONNX input per feature column. The tensor type follows
# the same numeric / non-numeric partition that routes columns through the
# ColumnTransformer, rather than testing for dtype == 'object'. A non-numeric
# column stored with another dtype (e.g. pandas string or category) is sent to
# the categorical branch and must not be declared as a float tensor.
# NOTE(review): bool / datetime columns would also become string inputs here;
# confirm the converter handles them if such columns exist in the data.
numeric_columns = set(numeric_features)
initial_types = [
    (
        col,
        FloatTensorType([None, 1]) if col in numeric_columns
        else StringTensorType([None, 1]),
    )
    for col in X.columns
]

In [None]:
# Train models and export to ONNX
for name, model in models.items():
    # Pipeline does not clone its steps, so give every model its own copy of
    # the preprocessor instead of refitting one shared instance each iteration.
    pipeline = Pipeline([
        ('preprocessing', clone(preprocessor)),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split so the predictions are actually used.
    preds = pipeline.predict(X_test)
    accuracy = (preds == y_test.to_numpy()).mean()
    print(f"{name} test accuracy: {accuracy:.4f}")

    # Convert the fitted pipeline (preprocessing + classifier) and write it
    # to "<name>.onnx" in the current working directory.
    onnx_model = convert_sklearn(pipeline, initial_types=initial_types)
    with open(f"{name}.onnx", "wb") as f:
        f.write(onnx_model.SerializeToString())
    print(f"{name} model saved as {name}.onnx")

In [35]:
# Score one hand-written record with the exported DecisionTree ONNX model
sess = ort.InferenceSession("DecisionTree.onnx")

X_new = pd.DataFrame([
    {
        "Port": 80,
        "Traffic": 10,
        "Packets": 5,
        "Bytes": 1024,
        "UniqueDests": 3,
        "UniqueDest24s": 2,
        "Lat": 37.77,
        "Long": -122.41,
        "ASN": 12345,
    }
])

# The model's numeric inputs are declared as float tensors (float32), so cast
# every numeric column, not only the int64 / int32 / float64 ones.
numeric_cols = X_new.select_dtypes(include='number').columns
X_new = X_new.astype({col: 'float32' for col in numeric_cols})

# Build the feed from the inputs the model itself declares, so a missing
# column fails with a readable message and extra columns are not sent.
input_names = [model_input.name for model_input in sess.get_inputs()]
missing = [input_name for input_name in input_names if input_name not in X_new.columns]
if missing:
    raise ValueError(f"X_new is missing model inputs: {missing}")

# Each input is declared with shape [None, 1], hence the column-vector reshape.
onnx_inputs = {
    input_name: X_new[input_name].to_numpy().reshape(-1, 1)
    for input_name in input_names
}

# Run inference; passing None as the output list requests every model output.
# NOTE(review): for a converted classifier this is presumably
# [labels, probabilities] - confirm against sess.get_outputs().
preds = sess.run(None, onnx_inputs)
