In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib # For saving and loading scikit-learn models/pipelines
import os

# --- 0. Setup: Ensure directory for models exists ---
MODEL_DIR = "saved_models"
# exist_ok=True makes the creation idempotent (safe on every re-run of the cell)
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_DIR, exist_ok=True)
# Location of the serialized pipeline file inside MODEL_DIR.
MODEL_PATH = os.path.join(MODEL_DIR, "sklearn_pipeline_model.joblib")


# --- 1. Train a Sample Model (similar to Scikit-learn Basics module) ---
# Toy dataset of twelve rows. One age and one salary are deliberately missing (NaN).
data = dict(
    age=[25, 30, 35, 40, 45, 22, 50, 28, 33, 60, np.nan, 38],
    salary=[
        50000, 60000, 80000, 75000, 90000, 45000,
        120000, 55000, 65000, 150000, 70000, np.nan,
    ],
    department=[
        'HR', 'IT', 'Sales', 'IT', 'HR', 'Sales',
        'IT', 'Sales', 'HR', 'IT', 'IT', 'Sales',
    ],
    purchased_premium=[0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0],  # Target variable
)
df = pd.DataFrame(data=data)

# Bare expression so the notebook renders the frame as the cell output.
df


Unnamed: 0,age,salary,department,purchased_premium
0,25.0,50000.0,HR,0
1,30.0,60000.0,IT,1
2,35.0,80000.0,Sales,1
3,40.0,75000.0,IT,0
4,45.0,90000.0,HR,1
5,22.0,45000.0,Sales,0
6,50.0,120000.0,IT,1
7,28.0,55000.0,Sales,0
8,33.0,65000.0,HR,0
9,60.0,150000.0,IT,1


In [11]:

# Separate the feature columns from the target column.
X = df.drop(columns='purchased_premium')
y = df['purchased_premium']

# Columns grouped by the kind of preprocessing they receive
numerical_features = ['age', 'salary']
categorical_features = ['department']

# Numeric branch: fill missing values with the median, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch; any other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

# Preprocessing and the classifier chained into one estimator, so the exact
# same transformations are applied at fit time and at predict time.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42)),
])

# Hold out a quarter of the rows; stratify keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print("--- Training the model pipeline ---")
full_pipeline.fit(X_train, y_train)
print("Model training complete.")


--- Training the model pipeline ---
Model training complete.


In [12]:
# Bare expression as the cell's last line: shows the fitted pipeline as the cell output.
full_pipeline

In [13]:

# --- 2. Model Serialization (Saving the trained pipeline) ---
print(f"\n--- Serializing (saving) the trained pipeline to {MODEL_PATH} ---")
try:
    # Persist the whole fitted pipeline (preprocessing + classifier) as one artifact.
    joblib.dump(full_pipeline, MODEL_PATH)
except Exception as save_error:
    # Report the failure instead of aborting the notebook run.
    print(f"Error saving pipeline: {save_error}")
else:
    print(f"Pipeline successfully saved to {MODEL_PATH}")




--- Serializing (saving) the trained pipeline to saved_models/sklearn_pipeline_model.joblib ---
Pipeline successfully saved to saved_models/sklearn_pipeline_model.joblib


In [14]:

# --- 3. Model Deserialization (Loading the pipeline) ---
print(f"\n--- Deserializing (loading) the pipeline from {MODEL_PATH} ---")
# Remains None unless the load below succeeds.
loaded_pipeline = None
try:
    if not os.path.exists(MODEL_PATH):
        print(f"Error: Model file not found at {MODEL_PATH}. Please run the saving step first.")
    else:
        loaded_pipeline = joblib.load(MODEL_PATH)
        print("Pipeline successfully loaded.")
        # Sanity check of what was restored: the object's type and its named steps.
        print(type(loaded_pipeline))
        print(loaded_pipeline.named_steps)
except Exception as load_error:
    # Report the failure instead of aborting the notebook run.
    print(f"Error loading pipeline: {load_error}")



--- Deserializing (loading) the pipeline from saved_models/sklearn_pipeline_model.joblib ---
Pipeline successfully loaded.
<class 'sklearn.pipeline.Pipeline'>
{'preprocessor': ColumnTransformer(remainder='passthrough',
                  transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'salary']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['department'])]), 'classifier': LogisticRegression(random_state=42, solver='libl

In [15]:


# --- 4. Building a Prediction Function (Inference) ---
def make_prediction(input_data_df: pd.DataFrame, model_pipeline) -> "tuple[np.ndarray, np.ndarray]":
    """
    Run inference with a fitted pipeline and return labels and class probabilities.

    Args:
        input_data_df (pd.DataFrame): DataFrame with features for prediction.
                                      Must have the same columns as used for training
                                      (e.g., 'age', 'salary', 'department').
        model_pipeline: The loaded scikit-learn pipeline (any object exposing
                        ``predict`` and ``predict_proba`` works).
    Returns:
        tuple[np.ndarray, np.ndarray]: ``(predictions, probabilities)`` where
        ``predictions`` is the result of ``model_pipeline.predict`` and
        ``probabilities`` is the result of ``model_pipeline.predict_proba``.
    Raises:
        ValueError: If ``model_pipeline`` is None, or if the pipeline exposes
                    ``feature_names_in_`` and the input lacks one of those columns.
        TypeError: If ``input_data_df`` is not a pandas DataFrame.
        Exception: Any error raised by the model is printed and re-raised unchanged.
    """
    if model_pipeline is None:
        raise ValueError("Model pipeline is not loaded.")
    if not isinstance(input_data_df, pd.DataFrame):
        raise TypeError("Input data must be a Pandas DataFrame.")

    print(f"\nInput data for prediction:\n{input_data_df}")
    try:
        # When the model recorded its training column names, fail early with a
        # readable message instead of an error from deep inside the transformers.
        # getattr's default also covers models that do not expose the attribute.
        expected_columns = getattr(model_pipeline, "feature_names_in_", None)
        if expected_columns is not None:
            missing_columns = [
                column for column in expected_columns
                if column not in input_data_df.columns
            ]
            if missing_columns:
                raise ValueError(
                    f"Input data is missing required columns: {missing_columns}"
                )

        predictions = model_pipeline.predict(input_data_df)
        probabilities = model_pipeline.predict_proba(input_data_df) # Get probabilities as well
        return predictions, probabilities
    except Exception as e:
        print(f"Error during prediction: {e}")
        # In a real app, log this error properly
        raise

In [16]:
# Unseen records used to exercise the inference path.
new_data_list = [
    dict(age=32, salary=72000, department='IT'),
    dict(age=48, salary=95000, department='Sales'),
    # Missing salary: this row has NaN in a numeric feature.
    dict(age=26, salary=np.nan, department='HR'),
    # 'Marketing' does not appear among the departments in the sample training data.
    dict(age=55, salary=110000, department='Marketing'),
]
new_data_df = pd.DataFrame(data=new_data_list)
new_data_df

Unnamed: 0,age,salary,department
0,32,72000.0,IT
1,48,95000.0,Sales
2,26,,HR
3,55,110000.0,Marketing


In [17]:

# --- 5. Using the Prediction Function with New Data ---
# Test the None sentinel explicitly. Plain truthiness on a Pipeline goes through
# its __len__ (the number of steps), which is not the question being asked here.
if loaded_pipeline is not None:
    print("\n--- Making predictions with the loaded pipeline ---")

    try:
        predictions, probabilities = make_prediction(new_data_df, loaded_pipeline)
        print("\nPredictions (0 or 1 for 'purchased_premium'):")
        # start=1 gives human-friendly numbering without manual index arithmetic.
        for point_number, (pred, prob) in enumerate(zip(predictions, probabilities), start=1):
            print(f"Data point {point_number}: Predicted class={pred}, Probabilities (0,1)={np.round(prob,3)}")

        # Example: Get probabilities for the positive class (class 1),
        # i.e. the second column of the probability matrix.
        positive_class_probabilities = probabilities[:, 1]
        print(f"\nProbabilities of purchasing premium: {np.round(positive_class_probabilities,3)}")

    except Exception as e:
        # Report the failure instead of aborting the notebook run.
        print(f"An error occurred while making predictions: {e}")
else:
    print("\nSkipping prediction step as the model was not loaded successfully.")

# Key considerations for model serialization:
# - **joblib vs. pickle**: joblib is generally preferred for scikit-learn objects as it's more efficient with NumPy arrays.
# - **Python versions & library versions**: Compatibility can be an issue. It's best to load a model with the same (or very similar) versions of Python and libraries (especially scikit-learn, pandas, numpy) that were used to save it. Containerization (e.g., Docker) helps manage this.
# - **Security**: Be cautious when loading models from untrusted sources, as pickle (and by extension joblib) can execute arbitrary code.
# - **Model versioning**: In MLOps, you'd version your models (e.g., using MLflow, DVC, or a simple naming convention) along with metadata about how they were trained.


--- Making predictions with the loaded pipeline ---

Input data for prediction:
   age    salary department
0   32   72000.0         IT
1   48   95000.0      Sales
2   26       NaN         HR
3   55  110000.0  Marketing

Predictions (0 or 1 for 'purchased_premium'):
Data point 1: Predicted class=1, Probabilities (0,1)=[0.421 0.579]
Data point 2: Predicted class=1, Probabilities (0,1)=[0.268 0.732]
Data point 3: Predicted class=0, Probabilities (0,1)=[0.552 0.448]
Data point 4: Predicted class=1, Probabilities (0,1)=[0.128 0.872]

Probabilities of purchasing premium: [0.579 0.732 0.448 0.872]
