# 05_retrain_wine_multiclass.ipynb

Retrain a multiclass classifier on the Wine dataset, compare two simple models, and save the best-performing model to `models/best_wine_model.pkl` together with its fitted scaler (`models/best_wine_scaler.pkl`).

This notebook uses scikit-learn's built-in Wine dataset and demonstrates a minimal reproducible workflow: load the data, preprocess it (stratified train/test split, standard scaling), train Logistic Regression and Random Forest classifiers, evaluate them on the held-out split, plot a confusion matrix for the better one, and persist that model and the scaler with joblib.


In [None]:
# Imports
import os
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Resolve the project root so that `models/` is located the same way in both launch modes.
# - `__file__` defined (code executed as a Python file): the file's directory is treated as
#   the notebook directory and its parent as the project root.
# - `__file__` undefined (NameError, e.g. an interactive kernel): fall back to the current
#   working directory, stepping up one level when that directory is named `notebooks`.
try:
    notebook_dir = Path(__file__).parent.absolute()
except NameError:
    cwd = Path.cwd()
    project_root = cwd.parent if cwd.name == 'notebooks' else cwd
else:
    project_root = notebook_dir.parent

# Directory that receives the persisted artifacts; created if missing, reused otherwise.
models_dir = project_root / 'models'
models_dir.mkdir(exist_ok=True)

# Experiment settings shared by the cells below.
RANDOM_STATE = 42   # seed handed to the split and to both estimators
TEST_SIZE = 0.2     # fraction of rows held out for evaluation


In [None]:
# Load dataset and quick EDA
# as_frame=True is requested so that X is a DataFrame (column names available) and y is a
# Series (value_counts available).
data = load_wine(as_frame=True)
X = data.data
y = data.target
feature_names = list(X.columns)
# Coerce every class name to a built-in str: if target_names holds NumPy string scalars,
# printing the list would show each element's NumPy repr rather than the bare name.
class_names = [str(name) for name in data.target_names]

print('Features shape:', X.shape)
print('Number of classes:', len(np.unique(y)))
print('Class names:', class_names)
# value_counts() orders by frequency; sort by class label instead and print as a dict so the
# distribution stays on the same line as its caption.
print('Class distribution:', y.value_counts().sort_index().to_dict())


In [None]:
# Train/test split and scaling
# The split is stratified on y and seeded with RANDOM_STATE so re-runs give the same partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Fit the scaler on the training rows only, then apply the same transform to both partitions,
# so the held-out rows never influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


In [None]:
# Define models to compare
# Candidates are keyed by display name; insertion order decides ties in the selection below.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
)

# For each candidate: fit on the scaled training data, predict the scaled test data, report,
# and keep the fitted estimator together with its accuracy and predictions.
results = {}
for label, estimator in models.items():
    print(f'--- Training {label} ---')
    y_pred = estimator.fit(X_train_scaled, y_train).predict(X_test_scaled)
    score = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {score:.4f}')
    print(classification_report(y_test, y_pred, target_names=class_names))
    results[label] = {'model': estimator, 'accuracy': score, 'preds': y_pred}

# Select best model by accuracy (max() keeps the first entry when accuracies are equal).
# NOTE(review): the winner is picked on the same test split that is reported, so the printed
# accuracy is an optimistic estimate; consider selecting with cross-validation on the train set.
best_name = max(results, key=lambda label: results[label]['accuracy'])
best = results[best_name]
print(f'Best model: {best_name} with accuracy {best["accuracy"]:.4f}')


In [None]:
# Confusion matrix for best model
# Pin the label set to every class in the dataset. Without `labels`, confusion_matrix only
# covers labels that occur in y_test or the predictions, so the matrix could be smaller than
# class_names (and the tick labels misaligned) if a class were missing from the test split.
# Assumes class_names is ordered by ascending label value — TODO confirm for other datasets.
cm = confusion_matrix(y_test, best['preds'], labels=np.unique(y))

# Explicit figure/axes objects instead of the pyplot state machine, so the heatmap and its
# labels are guaranteed to target the same axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title(f'Confusion Matrix - {best_name}')
plt.show()


In [None]:
# Save the best model and the scaler
# Both artifacts are needed at inference time: the model was fitted on scaled features, so the
# matching scaler has to be applied to new rows before calling predict.
model_path = models_dir / 'best_wine_model.pkl'
scaler_path = models_dir / 'best_wine_scaler.pkl'

for artifact, destination in ((best['model'], model_path), (scaler, scaler_path)):
    joblib.dump(artifact, str(destination))

print(f'Saved best model to: {model_path}')
print(f'Saved scaler to: {scaler_path}')


In [None]:
# Quick load and predict demo
# Round trip: reload both artifacts from disk and run them on the first five held-out rows.
# joblib.load unpickles its input, so only point it at files you trust.
loaded_scaler = joblib.load(str(scaler_path))
loaded = joblib.load(str(model_path))

sample = X_test.head(5)
sample_scaled = loaded_scaler.transform(sample)
print('Sample predictions:', loaded.predict(sample_scaled))


## Conclusion

The notebook trained two simple classifiers, selected the one with the higher accuracy on the held-out test split, and saved that model together with its fitted scaler. Next steps: hyperparameter tuning with GridSearchCV or RandomizedSearchCV, cross-validation, feature selection, and model interpretability analysis.
