In [1]:
# Iris Classification with Random Forest
# This notebook demonstrates local ML development before moving to Kubeflow

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import mlflow
import mlflow.sklearn
import os

# Select the MLflow experiment that the run started further down is filed
# under. MLflow creates the experiment on first use if it does not exist yet.
EXPERIMENT_NAME = "iris-classification"
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# Load the Iris dataset: `iris.data` is the feature matrix and `iris.target`
# holds the integer class label of every sample.
print("Loading Iris dataset...")
iris = load_iris()
X = iris.data
y = iris.target

# Build a DataFrame purely for inspection; the model below is trained on the
# raw X / y arrays, not on this frame.
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y

# Derive the label -> species lookup from the dataset's own `target_names`
# (the same source the classification report and the final prediction use)
# instead of repeating the names by hand, so this column cannot drift out of
# sync with the labels. `str(...)` keeps the values plain Python strings.
species_by_label = {label: str(name) for label, name in enumerate(iris.target_names)}
df['species'] = df['target'].map(species_by_label)

print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Hold out 20% of the samples as a test set. `stratify=y` asks for a split
# that preserves the class proportions, and the fixed `random_state` makes the
# partition repeatable between runs.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report how many samples ended up on each side of the split.
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Random Forest hyperparameters. The same dict is unpacked into the classifier
# constructor and logged to MLflow, so both always see identical values.
params = dict(n_estimators=100, max_depth=3, random_state=42)

# Train, evaluate and persist the model inside a single MLflow run so that the
# parameters, metrics and artifacts logged below are all attached to it.
with mlflow.start_run():
    # `fit` returns the estimator itself, so construction and training chain.
    print("\nTraining Random Forest model...")
    rf_model = RandomForestClassifier(**params).fit(X_train, y_train)

    # Score the held-out split.
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nModel Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    report = classification_report(y_test, y_pred, target_names=iris.target_names)
    print(report)

    # Record the hyperparameters, then each evaluation metric in turn.
    mlflow.log_params(params)
    run_metrics = {"accuracy": accuracy, "test_samples": len(y_test)}
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Persist the fitted model with joblib; the path is relative, so the file
    # is written to the current working directory. `model_path` is read again
    # after this block to reload the model.
    model_path = "model.pkl"
    joblib.dump(rf_model, model_path)
    print(f"\nModel saved to: {model_path}")

    # Attach the model to the run twice: once through MLflow's sklearn
    # integration and once as the raw joblib file written above.
    mlflow.sklearn.log_model(rf_model, "random_forest_model")
    mlflow.log_artifact(model_path)

# Closing status messages for the training step, printed once the MLflow run
# above has finished.
print("\n✅ Training completed successfully!")
print("Next: Move this code to a Python script for Kubeflow pipeline")

# Smoke-test the persisted model: reload it from disk and make sure it behaves
# like the in-memory estimator it was saved from.
print("\n🧪 Testing saved model...")

# joblib files are pickles, and unpickling can execute arbitrary code, so only
# load files this notebook wrote itself (as here) or that are otherwise trusted.
loaded_model = joblib.load(model_path)

# The reloaded model must reproduce the original predictions on the test
# split; a mismatch means the save/load round trip is broken.
assert np.array_equal(loaded_model.predict(X_test), y_pred), \
    "Reloaded model disagrees with the in-memory model"

# Sample input: one flower's four measurements in the dataset's feature order
# (the same values as the first row of the DataFrame preview, a setosa).
test_prediction = loaded_model.predict([[5.1, 3.5, 1.4, 0.2]])
print(f"Test prediction: {iris.target_names[test_prediction[0]]}")

2025/06/06 16:26:18 INFO mlflow.tracking.fluent: Experiment with name 'iris-classification' does not exist. Creating a new experiment.


Loading Iris dataset...
Dataset shape: (150, 6)

First 5 rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target species  
0       0  setosa  
1       0  setosa  
2       0  setosa  
3       0  setosa  
4       0  setosa  

Training set size: 120
Test set size: 30

Training Random Forest model...

Model Accuracy: 0.9667

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

 




✅ Training completed successfully!
Next: Move this code to a Python script for Kubeflow pipeline

🧪 Testing saved model...
Test prediction: setosa
