## 1. Import dependencies
Import standard data science and ML libraries: NumPy, Pandas, Matplotlib, Seaborn, Scikit-Learn and joblib to save the best model.

In [None]:
# Data science imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn imports
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# utility
import joblib

# visualization settings
# `%matplotlib inline` is an IPython magic (not plain Python): it makes
# matplotlib figures render directly in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots created below.
sns.set(style="whitegrid")

## 2. Load dataset and show first few rows
Load the Iris dataset from scikit-learn and convert it to a pandas DataFrame for easier analysis.
We will also add a `species` column using the target names for readability.

In [None]:
# Fetch the Iris bunch once and unpack the pieces reused by later cells
iris = datasets.load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Tabular view: one column per measurement plus a readable `species` label
# built by mapping the integer target codes onto the class names
df = pd.DataFrame(data=X, columns=feature_names).assign(
    species=pd.Categorical.from_codes(y, categories=target_names)
)

# Preview the first 5 rows
df.head()

## 3. Dataset info, shape, null-check, basic statistics
Quick checks to ensure dataset health and basic properties.

In [None]:
# General info
print('Shape:', df.shape)
print('\nColumn info:')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Check nulls and basic stats
print('\nNull values:')
print(df.isnull().sum())

# Summary statistics (transposed so each feature is a row)
df.describe().T

## 4. Exploratory Data Analysis (EDA)
Let's visualize the dataset to discover patterns, relationships and class separation.
We'll create: pairplot, correlation heatmap, and distribution plots.

In [None]:
# Pairplot shows relationships between features colored by species
# corner=True keeps only the lower triangle of the grid; the trailing
# semicolon stops the notebook from echoing the returned grid object.
sns.pairplot(df, hue='species', corner=True, height=1.6);

In [None]:
# Pairwise correlations between the measurement columns (label column excluded)
corr = df.drop(columns=['species']).corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Per-feature histograms (with a KDE overlay), split by species, on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
for ax, feature in zip(axes.flat, feature_names):
    sns.histplot(data=df, x=feature, hue='species', bins=15, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
plt.show()

## 5. Data preprocessing
- Labels: `iris.target` is already numeric, but we use the readable `species` names as labels (scikit-learn classifiers accept string labels directly).
- No missing values in Iris dataset; scale features if desired.
We will proceed by splitting the dataset into train and test sets.

In [None]:
# Preprocessing: features as a DataFrame, labels as the categorical species names
X = df[feature_names]
y = df['species']  # categorical names

# Optional: check distribution of classes
print('Class counts:')
print(y.value_counts())

# Train-test split: 20% held out, stratified on the label, fixed seed for
# reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('\nTrain size:', X_train.shape)
print('Test size:', X_test.shape)

## 6. Train multiple classification models
We'll train four simple, well-known models and compare their performance: Logistic Regression, SVM, Decision Tree, and Random Forest.
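We use mostly default hyperparameters as a starting point (good for a small dataset); the only explicit choices are the `liblinear` solver for Logistic Regression, `probability=True` for the SVM, and fixed random seeds.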

In [None]:
# Create and train models
# NOTE(review): some scikit-learn releases warn about using the 'liblinear'
# solver on multiclass targets — confirm against the installed version.
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Fit every model on the training split and record its test-set metrics.
trained_models = {}
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)
    trained_models[name] = model
    results.append({
        'Model': name,
        'Accuracy': round(acc, 4),
        'Confusion Matrix': cm,
        'Classification Report': cr
    })

# Create results DataFrame (model name and accuracy only)
results_df = pd.DataFrame([{'Model': r['Model'], 'Accuracy': r['Accuracy']} for r in results])
# Rank by accuracy with a stable sort so that models with equal accuracy keep
# the order in which they were defined above; the first row is later taken as
# the "best" model, so tie-breaking must not depend on the sort algorithm.
results_df = results_df.sort_values(
    by='Accuracy', ascending=False, kind='mergesort'
).reset_index(drop=True)
results_df

## 7. Evaluate models: accuracy, confusion matrix & classification report
Let's print detailed evaluation metrics for each classifier and visualize the best one using a confusion matrix heatmap.

In [None]:
# Print evaluation details for every trained model, separated by a rule
for r in results:
    print('Model:', r['Model'])
    print('Accuracy:', r['Accuracy'])
    print('Confusion Matrix:')
    print(r['Confusion Matrix'])
    print('\nClassification Report:')
    # The report was stored as a dict; transpose so each class/average is a row
    print(pd.DataFrame(r['Classification Report']).T)
    print('\n' + '-'*60 + '\n')

In [None]:
# Visualize confusion matrix for the best model (first row of the ranked table)
best_model_name = results_df.iloc[0]['Model']
best_model = trained_models[best_model_name]
y_pred_best = best_model.predict(X_test)
# Pass the label order explicitly so the matrix rows/columns are guaranteed to
# line up with the tick labels used in the heatmap below.
cm_best = confusion_matrix(y_test, y_pred_best, labels=target_names)

plt.figure(figsize=(6,5))
sns.heatmap(cm_best, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix: {best_model_name}')
plt.show()

## 8. Choose the best model and explain why
We choose the model with the highest test accuracy and balanced performance across classes. If multiple models tie, break the tie using additional criteria such as F1-score and model interpretability.

## 9. Save the best model using joblib
We'll save the best model to `iris_best_model.joblib` so it can be loaded later without retraining.

In [None]:
# Save best model
# `save_path` is also used as the default `model_path` of predict_sample() below.
save_path = 'iris_best_model.joblib'
# Serialize the fitted estimator to disk so it can be reloaded without retraining.
joblib.dump(best_model, save_path)
print(f'Best model ({best_model_name}) saved to {save_path}')

## 10. Create a predict() helper function for sample input
We'll write a simple function to load the saved model and predict the species for a new sample (single observation). This is useful for integrating the model into apps or for sharing with others.

In [None]:
def predict_sample(features, model_path=save_path):
    """
    Predict species for a single input sample.

    Args:
        features: list, tuple or numpy array of 4 feature values [sepal length, sepal width, petal length, petal width]
        model_path: path to the saved model file

    Returns:
        predicted species name

    Raises:
        ValueError: if the number of values in `features` does not match the
            number of features the loaded model reports it was fitted on.
    """
    # Load model
    # SECURITY: joblib files are pickle-based, so loading one can execute
    # arbitrary code; only pass paths to model files you trust.
    model = joblib.load(model_path)

    # Flatten whatever was passed into one row of numeric values
    values = np.asarray(features, dtype=float).ravel()

    # Fail early with a clear message when the model exposes its expected width
    expected = getattr(model, 'n_features_in_', None)
    if expected is not None and values.size != expected:
        raise ValueError(
            f'Expected {expected} feature values, got {values.size}'
        )

    # Ensure it is a 2D array for sklearn
    row = values.reshape(1, -1)

    # If the model recorded column names at fit time, pass a DataFrame with
    # those names so predict() sees input consistent with what it was fitted on.
    names = getattr(model, 'feature_names_in_', None)
    if names is not None:
        row = pd.DataFrame(row, columns=names)

    pred = model.predict(row)
    return pred[0]

# Example usage
# Values follow the order documented in predict_sample():
# [sepal length, sepal width, petal length, petal width]
sample = [5.1, 3.5, 1.4, 0.2]  # small iris sample
# Uses the default model_path, i.e. the file written by the save cell above.
predicted = predict_sample(sample)
print('Sample:', sample, '-> Predicted species:', predicted)

## 11. Final summary and observations
- We successfully trained and compared 4 models on the Iris dataset.
- We selected the best-performing model by test accuracy and reviewed its class-wise metrics.
- The best model was saved to disk and a small `predict_sample()` function was provided for predictions.

**Key observations:**
- Iris is a small and well-balanced dataset; a simple ensemble like Random Forest or even Logistic Regression achieves high accuracy.
- Visual EDA (pairplots and heatmaps) revealed feature separability and correlation patterns.

Thanks for viewing this notebook — suitable for a data science portfolio/assignment with clean code, professional markdown, and reproducible steps.