# XGBoost Classifier for IRIS Dataset\n\nThis notebook implements comprehensive training and evaluation.

In [9]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')
from xgboost import XGBClassifier

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

## 1. Load Dataset

In [10]:
df = pd.read_csv('../models/cleaned_iris_dataset.csv')
X = df.drop('species', axis=1)
y = df['species']
print(f'Dataset Shape: {df.shape}')
print(f'Features: {list(X.columns)}')
print(f'Classes: {sorted(y.unique())}')

Dataset Shape: (149, 5)
Features: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Classes: [0, 1, 2]


## 2. Train-Test Split & Scaling

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f'Training set: {X_train.shape[0]} samples')
print(f'Testing set: {X_test.shape[0]} samples')

Training set: 119 samples
Testing set: 30 samples


## 3. Model Training with Hyperparameter Tuning

In [12]:
from xgboost import XGBClassifier

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# GridSearchCV
grid_search = GridSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

print('Starting hyperparameter tuning for XGBoost...')
grid_search.fit(X_train_scaled, y_train)

xgb_best = grid_search.best_estimator_
print(f'\nBest parameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_:.4f}')
print(f'Test accuracy: {xgb_best.score(X_test_scaled, y_test):.4f}')

Starting hyperparameter tuning for XGBoost...


AttributeError: 'super' object has no attribute '__sklearn_tags__'

## 4. Model Evaluation

In [None]:
species_names = ['Setosa', 'Versicolor', 'Virginica']\ny_test_pred = xgb_best.predict(X_test_scaled)\ntest_accuracy = accuracy_score(y_test, y_test_pred)\nprint(f'Test Accuracy: {test_accuracy:.4f}')\nprint('\\nClassification Report:')\nprint(classification_report(y_test, y_test_pred, target_names=species_names))\n\ncm = confusion_matrix(y_test, y_test_pred)\nfig, ax = plt.subplots(figsize=(8, 6))\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=species_names)\ndisp.plot(cmap='Blues', ax=ax)\nplt.title('Confusion Matrix - XGBoost Classifier for IRIS Dataset')\nplt.show()

## Feature Importance

In [None]:
import matplotlib.pyplot as plt\nfrom xgboost import plot_importance\nfig, ax = plt.subplots(figsize=(10, 6))\nplot_importance(xgb_best, ax=ax, max_num_features=4)\nplt.title('XGBoost Feature Importance')\nplt.tight_layout()\nplt.show()

## 5. Model Inference

In [None]:
new_samples = np.array([[5.1, 3.5, 1.4, 0.2], [6.2, 2.9, 4.3, 1.3], [7.3, 2.9, 6.3, 1.8]])\nnew_samples_scaled = scaler.transform(new_samples)\npredictions = xgb_best.predict(new_samples_scaled)\nprobabilities = xgb_best.predict_proba(new_samples_scaled)\nfor i, (sample, pred, proba) in enumerate(zip(new_samples, predictions, probabilities)):\n    print(f'Sample {i+1}: {sample} -> {species_names[pred]} (confidence: {proba[pred]:.4f})')

## 6. Save Model

In [None]:
model_data = {'model': xgb_best, 'scaler': scaler, 'feature_names': list(X.columns), 'class_names': species_names, 'test_accuracy': test_accuracy}}\nwith open('../models/xgboost_model.pkl', 'wb') as f:\n    pickle.dump(model_data, f)\nprint('âœ“ Model saved to ../models/xgboost_model.pkl')