# Model Training — Improved

This notebook demonstrates a robust training workflow:
- Load preprocessed dataset
- Train/test split with stratification
- Use a pipeline combining preprocessing + model
- Cross-validation with `cross_val_score` and `cross_validate`
- Use `GridSearchCV` for hyperparameter tuning
- Evaluate on hold-out test set and save the model


In [16]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

print('📖 Libraries imported successfully!')
print('🚀 Starting model training pipeline...')

# Load preprocessed data
DATA_PATH = '../data/processed/wine_processed.csv'

try:
    df = pd.read_csv(DATA_PATH)
    print('🍷 Loaded preprocessed wine dataset!')
    print(f'📊 Dataset shape: {df.shape}')
    print(f'📋 Columns: {list(df.columns)}')
except Exception as e:
    print('Could not load dataset:', e)
    df = None

📖 Libraries imported successfully!
🚀 Starting model training pipeline...
🍷 Loaded preprocessed wine dataset!
📊 Dataset shape: (1359, 12)
📋 Columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality_binary']


## 1) Prepare X, y and train/test split

In [17]:
if df is not None:
    # Prepare features and target
    features = [col for col in df.columns if col != 'quality_binary']
    X = df[features]
    y = df['quality_binary']

    # Initialize label encoder
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    print(f'🔧 Data preparation:')
    print(f'• Features: {len(features)}')
    print(f'• Feature names: {features}')
    print(f'• Target variable: quality_binary')
    print(f'• Target classes: {y.unique()}')
    print(f'• Encoded target classes: {le.classes_}')
    print(f'• Encoded values: {np.unique(y_encoded)}')

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
    print(f'📚 Train shape: {X_train.shape}, Test shape: {X_test.shape}')

🔧 Data preparation:
• Features: 11
• Feature names: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
• Target variable: quality_binary
• Target classes: ['Good' 'Bad']
• Encoded target classes: ['Bad' 'Good']
• Encoded values: [0 1]
📚 Train shape: (1087, 11), Test shape: (272, 11)


## 2) Build pipeline (preprocessing + model)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# load preprocessing pipeline saved earlier if available
try:
    preproc = joblib.load(PIPE_PATH)
    print('Loaded preprocessing pipeline from', PIPE_PATH)
except:
    preproc = None
    print('Preprocessing pipeline not found; it will be built from training data.')

if preproc is None and df is not None:
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
    from sklearn.compose import ColumnTransformer
    # Re-build minimal preproc using training data
    num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
    cat_cols = X_train.select_dtypes(include=['object','category','bool']).columns.tolist()
    num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
    cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='__MISSING__')), ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])
    preproc = ColumnTransformer([('num', num_pipeline, num_cols), ('cat', cat_pipeline, cat_cols)], remainder='drop')

model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
pipe = Pipeline([('preproc', preproc), ('clf', model)])


Preprocessing pipeline not found; it will be built from training data.


## 3) Cross-validation (stratified) and baseline evaluation

In [19]:
if df is not None:
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['accuracy', 'f1_weighted', 'roc_auc'] if len(np.unique(y_train))==2 else ['accuracy','f1_weighted']
    scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False, n_jobs=-1)
    for k,v in scores.items():
        print(k, np.mean(v))

fit_time 0.3645845890045166
score_time 0.06795239448547363
test_accuracy 0.9503234262038642
test_f1_weighted 0.9350862673099749
test_roc_auc 0.8285955499814195


## 4) Hyperparameter tuning (GridSearchCV) — example

In [20]:
if df is not None:
    # Initialize models
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'SVM': SVC(probability=True, random_state=42),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    # Train and evaluate each model
    results = []
    for name, model in models.items():
        print(f'\n🔹 Training: {name}')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append({
            'Model': name,
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1 Score': f1
        })
        print(f'✅ {name} - Accuracy: {acc:.4f}, F1: {f1:.4f}')

    # Create results dataframe
    results_df = pd.DataFrame(results)
    print('\n📊 Model Performance Summary:')
    print(results_df)

    # Select best model based on F1 Score
    best_model_name = results_df.loc[results_df['F1 Score'].idxmax(), 'Model']
    best_model = models[best_model_name]
    print(f'\n🏆 Best Model: {best_model_name}')


🔹 Training: Logistic Regression
✅ Logistic Regression - Accuracy: 0.9485, F1: 0.9736

🔹 Training: Decision Tree
✅ Decision Tree - Accuracy: 0.9228, F1: 0.9597

🔹 Training: Random Forest
✅ Random Forest - Accuracy: 0.9522, F1: 0.9755

🔹 Training: Gradient Boosting
✅ Gradient Boosting - Accuracy: 0.9375, F1: 0.9676

🔹 Training: SVM
✅ Random Forest - Accuracy: 0.9522, F1: 0.9755

🔹 Training: Gradient Boosting
✅ Gradient Boosting - Accuracy: 0.9375, F1: 0.9676

🔹 Training: SVM
✅ SVM - Accuracy: 0.9522, F1: 0.9755

🔹 Training: XGBoost
✅ XGBoost - Accuracy: 0.9632, F1: 0.9811

📊 Model Performance Summary:
                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.948529   0.952030  0.996139  0.973585
1        Decision Tree  0.922794   0.954198  0.965251  0.959693
2        Random Forest  0.952206   0.952206  1.000000  0.975518
3    Gradient Boosting  0.937500   0.954887  0.980695  0.967619
4                  SVM  0.952206   0.952206  1.000000  0.975518
5       

## 5) Final evaluation on test set

In [21]:
if df is not None:
    y_pred = best_model.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Classification report:')
    print(classification_report(y_test, y_pred))
    try:
        if len(np.unique(y_test))==2:
            print('ROC AUC:', roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))
    except Exception as e:
        print('ROC AUC error:', e)
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion matrix:\n', cm)
    # save model to project folder so path exists on Windows/local
    import os
    os.makedirs('../backend/saved_models', exist_ok=True)
    joblib.dump(best_model, '../backend/saved_models/best_model.joblib')
    print('Saved best model to ../backend/saved_models/best_model.joblib')

Accuracy: 0.9632352941176471
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.23      0.38        13
           1       0.96      1.00      0.98       259

    accuracy                           0.96       272
   macro avg       0.98      0.62      0.68       272
weighted avg       0.96      0.96      0.95       272

ROC AUC: 0.7986337986337987
Confusion matrix:
 [[  3  10]
 [  0 259]]
Saved best model to ../backend/saved_models/best_model.joblib


In [22]:
import joblib
import os
import json

# Create directories if they don't exist
os.makedirs('../backend/saved_models', exist_ok=True)

# Save the best model
if 'best_model' in globals():
    # Save model
    joblib.dump(best_model, '../backend/saved_models/best_model.pkl')
    print('💾 Best model saved to ../backend/saved_models/best_model.pkl')

    # Save label encoder
    if 'le' in globals():
        joblib.dump(le.classes_, '../backend/saved_models/label_encoder.pkl')
        print('💾 Label encoder saved to ../backend/saved_models/label_encoder.pkl')

    # Save model metrics
    if 'results_df' in globals():
        metrics_dict = results_df.set_index('Model').to_dict(orient='index')
        with open('../backend/saved_models/model_metrics.json', 'w') as f:
            json.dump(metrics_dict, f, indent=4)
        print('💾 Model metrics saved to ../backend/saved_models/model_metrics.json')

    print('\n🎉 Model training pipeline completed successfully!')
    print(f'Best performing model: {best_model_name}')
else:
    print('ERROR: best_model is not defined. Please run the training cells first.')

💾 Best model saved to ../backend/saved_models/best_model.pkl
💾 Label encoder saved to ../backend/saved_models/label_encoder.pkl
💾 Model metrics saved to ../backend/saved_models/model_metrics.json

🎉 Model training pipeline completed successfully!
Best performing model: XGBoost


## Notes & next steps
- If classes are imbalanced, consider `class_weight='balanced'` in tree models or use oversampling (SMOTE) inside a pipeline.
- For high-cardinality categoricals, consider Target Encoding (use `category_encoders` library) or embedding approaches.
- Try XGBoost/LightGBM if available — they often improve performance.
- Feature selection (SHAP, permutation importance) can further improve results.