In [9]:
!pip -q install scikit-learn matplotlib pandas numpy
# --- Imports ---
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
# --- 1) Load the Iris dataset ---
# X holds the feature matrix, y the class labels, and target_names the
# human-readable label names used in the reports and plots further down.
iris = datasets.load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names

In [13]:
# --- 2) Train/test split ---
# Hold out 25% of the rows as a test set. stratify=y keeps the class
# proportions the same in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [14]:
# --- 3) Quick baseline (200 trees, every other hyperparameter at its default) ---
# fit() returns the estimator itself, so construction and training can be chained.
rf_base = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)
y_pred_base = rf_base.predict(X_test)

# Score the untuned forest on the held-out test set as a reference point.
baseline_report = classification_report(y_test, y_pred_base, target_names=target_names)
print("Baseline Accuracy:", accuracy_score(y_test, y_pred_base))
print("\nBaseline Classification Report:")
print(baseline_report)

Baseline Accuracy: 0.8947368421052632

Baseline Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.80      0.92      0.86        13
   virginica       0.91      0.77      0.83        13

    accuracy                           0.89        38
   macro avg       0.90      0.90      0.90        38
weighted avg       0.90      0.89      0.89        38



In [None]:
# --- 4) Hyperparameter search ---
# Exhaustive grid: 3 * 4 * 3 * 3 * 3 * 2 = 648 candidates, each evaluated with
# 5-fold cross-validation (3,240 fits plus the final refit), so this is the
# slowest cell in the notebook.
param_grid = {
    "n_estimators": [100, 200, 400],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

# Parallelise at one level only. GridSearchCV already spreads the candidate/fold
# fits over all cores (n_jobs=-1); a forest that also asks for n_jobs=-1 would
# start a full set of worker threads inside every search worker and
# oversubscribe the CPU. The fitted trees are the same either way because
# random_state fixes the randomness.
rf = RandomForestClassifier(random_state=42, n_jobs=1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
# Only the training split is used here; the test set stays untouched until the
# final evaluation.
grid.fit(X_train, y_train)

print("\nBest Params:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)

In [None]:
# --- 5) Evaluate the tuned model on the held-out test set ---
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, target_names=target_names)
print("\nTest Accuracy (Best Model):", test_accuracy)
print("\nClassification Report (Best Model):")
print(test_report)

# Confusion matrix
# from_predictions creates its own figure; title and layout are applied through
# the returned display object rather than pyplot's implicit "current" figure.
disp = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=target_names,
    xticks_rotation=45,
)
disp.ax_.set_title("Confusion Matrix - Random Forest (Best Model)")
disp.figure_.tight_layout()
plt.show()

In [None]:
# --- 6) Feature importance plot ---
# One importance value per input column, taken from the tuned forest.
importances = best_model.feature_importances_
# Ascending order: barh draws index 0 at the bottom, so the most important
# feature ends up as the top bar.
indices = np.argsort(importances)

# Column labels come from the dataset object loaded at the start of the
# notebook; defining them here keeps this cell runnable on a fresh kernel.
feature_names = iris.feature_names

fig, ax = plt.subplots(figsize=(6, 4.5))
bar_positions = np.arange(len(indices))
ax.barh(bar_positions, importances[indices])
ax.set_yticks(bar_positions)
ax.set_yticklabels([feature_names[i] for i in indices])
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()

# --- 7) Save the trained model ---
# The path is defined once so the confirmation message cannot drift from the
# file that is actually written. It is relative to the kernel's working directory.
# NOTE: joblib files are pickle-based; only joblib.load() files you trust.
model_path = "random_forest_best_model.joblib"
joblib.dump(best_model, model_path)
print(f"\nSaved best model to {model_path}")

In [None]:
# --- 8) Predict on a new sample (match feature order!) ---
# For Iris: [sepal length, sepal width, petal length, petal width]
# The estimator expects a 2-D array, hence the nested list (one row, four columns).
sample = np.array([[5.4, 3.9, 1.7, 0.4]])
pred = best_model.predict(sample)[0]
proba = best_model.predict_proba(sample)[0]
print(f"\nNew sample predicted class: {target_names[pred]}")

# Convert NumPy scalars to built-in str/float before printing. Under NumPy >= 2
# the dict would otherwise be shown as {np.str_('setosa'): np.float64(...), ...}.
# Rounding is still done by np.round, so the displayed values are unchanged.
class_probabilities = dict(
    zip((str(name) for name in target_names), np.round(proba, 3).tolist())
)
print("Class probabilities:", class_probabilities)