# Implement multiclass classification on the Iris dataset

## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import preprocessing and model modules
from data_preprocessing import DataLoader, DataPreprocessor, create_pipeline
from models import MulticlassClassifier, ModelTrainer, HyperparameterTuner
from evaluation import ClassificationEvaluator, print_evaluation_summary
from feature_engineering import FeatureSelector, DimensionalityReducer

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## 2. Load and Explore Data

In [None]:
# Pull the Iris data in through the project's DataLoader. save_raw=True is
# forwarded as-is (presumably persists a raw copy — confirm in DataLoader).
loader = DataLoader()
df = loader.load_sklearn_dataset('iris', save_raw=True)

# One-shot overview: frame size, sample rows, column dtypes and the
# per-class row counts of the 'target' column.
print(
    f"Dataset shape: {df.shape}",
    f"\nFirst few rows:\n{df.head()}",
    f"\nData types:\n{df.dtypes}",
    f"\nClass distribution:\n{df['target'].value_counts()}",
    sep="\n",
)

## 3. Data Preprocessing

In [None]:
# Run the project's preprocessing pipeline on the raw frame. The arguments
# request missing-value handling with the 'mean' strategy, categorical
# encoding and 'standard' scaling, leave outlier removal off, and hold out
# 20% of the rows with a fixed seed.
pipeline_output = create_pipeline(
    df,
    target_column='target',
    handle_missing=True,
    missing_strategy='mean',
    encode_categorical=True,
    scale_features=True,
    scaling_method='standard',
    remove_outliers=False,
    test_size=0.2,
    random_state=42
)
# The pipeline hands back the four split arrays followed by the preprocessor.
X_train, X_test, y_train, y_test, preprocessor = pipeline_output

# Report split sizes and the label balance of the training portion.
print(
    f"Training set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    f"\nClass distribution in training set:\n{pd.Series(y_train).value_counts().sort_index()}",
    sep="\n",
)

## 4. Feature Engineering

In [None]:
# Ask the project's FeatureSelector for k=3 features, using the training
# split only.
selector = FeatureSelector()
X_train_selected, selected_features = selector.select_k_best(X_train, y_train, k=3)

# Reuse the selector's support mask so the test split keeps the same columns.
support_mask = selector.selector.get_support()
X_test_selected = X_test[:, support_mask]

# Summarise how many features were kept and which ones.
print(
    f"Original features: {X_train.shape[1]}",
    f"Selected features: {len(selected_features)}",
    f"Feature names: {selected_features}",
    sep="\n",
)

## 5. Train Multiclass Models

In [None]:
# Train several multiclass models on the same training split so they can be
# compared later on equal footing.
trainer = ModelTrainer(random_state=42)

# Display name -> keyword configuration handed to the trainer. 'model_type'
# selects the estimator; the remaining keys are its hyperparameters.
model_configs = {
    'Random Forest': {
        'model_type': 'random_forest',
        'n_estimators': 100,
        'max_depth': 10
    },
    'Gradient Boosting': {
        'model_type': 'gradient_boosting',
        'n_estimators': 100,
        'learning_rate': 0.1
    },
    'Logistic Regression': {
        'model_type': 'logistic_regression',
        'max_iter': 1000
    },
    'KNN': {
        'model_type': 'knn',
        'n_neighbors': 5
    }
}

trained_models = trainer.train_multiple_models(X_train, y_train, model_configs)

print(f"Successfully trained {len(trained_models)} models")
for model_name in trained_models:
    # U+2713 CHECK MARK, written as an escape so a wrong file encoding cannot
    # turn it into mojibake.
    print(f"  \u2713 {model_name}")

## 6. Hyperparameter Tuning

In [None]:
# Search space for the Random Forest: 3 * 3 * 2 * 2 = 36 candidate settings.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
# Base estimator to tune; seeded so repeated runs are comparable.
rf_model = RandomForestClassifier(random_state=42)

# Wrap estimator and grid in the project's tuner, configured for 5-fold CV
# scored by accuracy.
tuner = HyperparameterTuner(rf_model, param_grid, cv=5, scoring='accuracy', random_state=42)

print("Performing grid search...")
# n_jobs=-1 is forwarded to the tuner (conventionally "use all cores").
best_rf_model = tuner.grid_search(X_train, y_train, n_jobs=-1)

# Report the winning configuration and its cross-validated score.
print(
    f"\nBest parameters: {tuner.get_best_params()}",
    f"Best CV score: {tuner.get_best_score():.4f}",
    sep="\n",
)

## 7. Model Evaluation

In [None]:
# Predict on the held-out split with the tuned forest: hard labels first,
# then per-class probabilities.
y_pred = best_rf_model.predict(X_test)
y_pred_proba = best_rf_model.predict_proba(X_test)

# Compute weighted-average metrics through the project's evaluator.
evaluator = ClassificationEvaluator(model_name='Tuned Random Forest')
metrics = evaluator.evaluate(y_test, y_pred, y_pred_proba, average='weighted')

# Header, rule, then one aligned "name: value" row per metric.
print("Test Set Metrics:", "-" * 40, sep="\n")
for metric_name, value in metrics.items():
    print(f"  {metric_name:20s}: {value:.4f}")

## 8. Visualization - Confusion Matrix

In [None]:
# Draw the confusion matrix for the test-set predictions with raw counts
# (normalize=False) and generic class labels.
cm_labels = ['Class 0', 'Class 1', 'Class 2']
fig = evaluator.plot_confusion_matrix(
    y_test, y_pred, class_names=cm_labels, normalize=False, figsize=(8, 6)
)
plt.show()

## 9. Visualization - Classification Report

In [None]:
# Emit the project's full evaluation summary for the tuned forest, using the
# same generic class labels as the confusion matrix.
report_labels = ['Class 0', 'Class 1', 'Class 2']
print_evaluation_summary(
    y_test, y_pred, y_pred_proba,
    model_name='Tuned Random Forest',
    class_names=report_labels,
)

## 10. Compare All Models

In [None]:
# Compare all trained models on the held-out test split.
# Imported from `evaluation`, the same module path the notebook's first cell
# uses; mixing it with `src.evaluation` either fails to resolve or loads the
# module a second time under a different name.
from evaluation import compare_models

# Weighted averaging matches the metrics computed for the tuned model.
comparison_results = compare_models(
    trained_models,
    X_test,
    y_test,
    average='weighted'
)

print("\nModel Comparison Results:")
print("=" * 60)
print(comparison_results)

## 11. Feature Importance

In [None]:
# Get feature importance from the tuned Random Forest.
# Imported from `models`, the same module path the notebook's first cell
# uses, rather than `src.models`, so only one copy of the module is loaded.
from models import get_feature_importance

# The tuned model was fitted on the full X_train, so four placeholder names
# are supplied (Iris has four input features).
importance_df = get_feature_importance(
    best_rf_model,
    feature_names=['Feature 0', 'Feature 1', 'Feature 2', 'Feature 3']
)

print("\nFeature Importance:")
print(importance_df)

# Horizontal bar chart: one bar per feature, length = importance.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance - Random Forest')
plt.tight_layout()
plt.show()

## 12. Save Best Model

In [None]:
# Persist the tuned model to disk.
# Imported from `models`, the same module path the notebook's first cell
# uses, rather than `src.models`, so only one copy of the module is loaded.
from models import save_model

model_path = 'models/best_multiclass_model.pkl'
# Make sure the output directory exists so the save cannot fail on a fresh
# checkout; exist_ok=True makes this a no-op when it is already there.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

save_model(best_rf_model, model_path)
print("Model saved successfully!")