In [4]:
import os                   # standard-library interface to the operating system
working_dir = os.getcwd()   # directory the kernel is currently running in
print(working_dir)
# A path ending in .../Project1/notebooks means Jupyter was started from the notebooks folder, which breaks the import path.

c:\Users\P3856387\Desktop\Programming\GPT_ML_Projects\Project1\notebooks


In [6]:
# Step 1: Create a blank __init__.py file in src/ to mark it as a Python package.
# Step 2: Create setup.py in the project root.
# Step 3: From the project root, run `pip install -e .` so that src/data.py is importable as the `data` module regardless of where the code is run from.
from data import load_raw, clean_data, split_data
from feature import build_preprocessor

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import joblib



def train_and_evaluate(raw_csv, target='', model_path='model.pkl'):
    """Train a random-forest pipeline on a CSV dataset, print validation metrics, and save it.

    Parameters
    ----------
    raw_csv : str
        Path handed to ``load_raw``.
    target : str, optional
        Name of the label column, forwarded to ``split_data``. The default ``''``
        is kept only for backward compatibility.
        NOTE(review): an empty target looks like an unfinished placeholder —
        confirm the real label column and pass it explicitly.
    model_path : str, optional
        File the fitted pipeline is written to with ``joblib.dump``.

    Returns
    -------
    tuple
        ``(pipe, numeric, categorical)``: the fitted pipeline and the lists of
        numeric and categorical column names selected from the training frame.
    """
    df = load_raw(raw_csv)
    df = clean_data(df)

    X_train, X_valid, y_train, y_valid = split_data(df, target=target)

    # Column lists are derived from dtypes of the training features.
    numeric = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    pre = build_preprocessor(numeric, categorical)                          # Preprocessing

    # Bundle preprocessing and the classifier so they are fitted and saved together.
    pipe = Pipeline([
        ('pre', pre),
        ('clf', RandomForestClassifier(random_state=42)),
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_valid)

    # predict_proba returns one column per class. For a binary target,
    # roc_auc_score needs the 1-D scores of the positive class (column 1);
    # for more than two classes the full matrix is scored one-vs-rest.
    y_proba = pipe.predict_proba(X_valid)
    if y_proba.shape[1] == 2:
        auc = roc_auc_score(y_valid, y_proba[:, 1])
    else:
        auc = roc_auc_score(y_valid, y_proba, multi_class='ovr')

    # The report is a multi-line table, so start it on its own line.
    print(f"Classification Report:\n{classification_report(y_valid, y_pred)}")
    print(f"ROC AUC Score: {auc}\n")

    joblib.dump(pipe, model_path)            # persists preprocessing + trained model together
    return pipe, numeric, categorical        # returns a tuple of three elements

#### **Note: joblib** 
* joblib is a Python library mainly used for efficiently saving, loading, and caching Python objects, especially large NumPy arrays or ML models.
* **Used for**
    * Model saving/loading
        - Save trained ML models (joblib.dump) and load later (joblib.load) efficiently
    * Caching results
        - Cache results of expensive computations to avoid recomputation
    * Fast disk I/O
        - Handles large data efficiently with compression
    * Parallel computing
        - Supports simple parallel loops (less common)


* import joblib

**Save model**
* joblib.dump(model, 'model.pkl')

**Load model**
* model = joblib.load('model.pkl')

In [None]:
from onnxmltools import convert_sklearn
import onnx
from skl2onnx.common.data_types import FloatTensorType

# Defines a function to convert a sklearn pipeline (pipe) to ONNX format.
# Numeric and categorical are lists of feature names.
# output_path is the file path to save the ONNX model (default: '../models/model.onnx')
def convert_to_onnx(pipe, numeric, categorical, output_path='../models/model.onnx'):
    """Convert a fitted sklearn pipeline to ONNX and write it to ``output_path``.

    Parameters
    ----------
    pipe : fitted sklearn pipeline to convert.
    numeric : list of numeric feature names; its length sets the input width.
    categorical : list of categorical feature names (currently unused, see note below).
    output_path : str, optional
        Destination file for the serialized model. Missing parent directories
        are created.

    NOTE(review): only a single float input sized by ``numeric`` is declared and
    ``categorical`` is ignored. If the pipeline's preprocessor selects columns
    by name or consumes categorical columns, the conversion likely needs one
    typed input per column — confirm against the skl2onnx documentation.
    """
    import os   # local import so this cell does not depend on an earlier cell having run

    # Defines input type info for ONNX conversion.
    initial_types = [('num', FloatTensorType([None, len(numeric)]))]    # 'num' - the input tensor name
                                                                        # None = dynamic batch size (any number of samples).
                                                                        # Number of numeric features as input dimension.
    onnx_model = convert_sklearn(pipe, initial_types=initial_types)     # convert the sklearn pipeline to ONNX format using the input type info.

    # Make sure the target directory exists; a bare filename has no directory part.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'wb') as f:                    # Opens the output file in write binary mode.
        f.write(onnx_model.SerializeToString())           # Serializes the ONNX model to a binary string and writes it to disk.

In [8]:
if __name__ == '__main__':
    pipe, num, cat = train_and_evaluate('../data/raw/dataset.csv')
    # Pass the ONNX output path explicitly so the call does not depend on a default.
    convert_to_onnx(pipe, num, cat, '../models/model.onnx')

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/dataset.csv'

In [None]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Parameters of a pipeline step are addressed as '<step>__<param>' (double underscore);
# 'clf' is the name given to the classifier step when the pipeline was built.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20]
}
# NOTE(review): in the visible cells X_train / y_train exist only inside
# train_and_evaluate — make sure they are defined at notebook scope before running this cell.
cv = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
cv.fit(X_train, y_train)
print(cv.best_params_, cv.best_score_)
pipe = cv.best_estimator_    # keep the refitted best pipeline for later use