In [1]:
# xgboost_wine_quality.py
import os
import urllib.request
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor, plot_importance
import matplotlib.pyplot as plt

# 1. Download datasets (red and white)
BASE_DIR = "./data_wine"
os.makedirs(BASE_DIR, exist_ok=True)
urls = {
    "winequality-red.csv": "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    "winequality-white.csv": "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
}
for fname, url in urls.items():
    outpath = os.path.join(BASE_DIR, fname)
    if not os.path.exists(outpath):
        print(f"Downloading {fname}...")
        urllib.request.urlretrieve(url, outpath)

# 2. Load and combine (semicolon-separated CSVs)
df_red = pd.read_csv(os.path.join(BASE_DIR, "winequality-red.csv"), sep=";")
df_white = pd.read_csv(os.path.join(BASE_DIR, "winequality-white.csv"), sep=";")

df_red["type"] = 1  # red = 1
df_white["type"] = 0  # white = 0

df = pd.concat([df_red, df_white], ignore_index=True)
print("Combined shape:", df.shape)
print(df.head())

# 3. Features and target
target = "quality"
X = df.drop(columns=[target])
y = df[target].astype(float)

# 4. Train / validation / test split (60/20/20)
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.25, random_state=42)
print("Train / Val / Test:", X_train.shape[0], X_val.shape[0], X_test.shape[0])

# 5. Configure and train XGBoost regressor with early stopping
"""
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="rmse",
    early_stopping_rounds=30,
    verbose=50
)
"""

import xgboost as xgb
from xgboost import XGBRegressor

model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric="rmse"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[xgb.callback.EarlyStopping(rounds=30, save_best=True)],
    verbose=50
)

# 6. Predictions and evaluation on test set
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE : {mae:.4f}")
print(f"Test R2  : {r2:.4f}")

# 7. Feature importance (gain)
fig, ax = plt.subplots(figsize=(8,6))
plot_importance(model, ax=ax, importance_type="gain", show_values=True)
ax.set_title("XGBoost feature importance (gain)")
plt.tight_layout()
plt.show()

# 8. Optional: SHAP explanations (install shap to use)
try:
    import shap
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap.summary_plot(shap_values, X_test, plot_type="bar")
except Exception as e:
    print("SHAP not available or failed:", e)

# 9. Example: get raw margins (untransformed predictions)
raw_margin = model.predict(X_test, output_margin=True)
print("Raw margin shape:", raw_margin.shape)


Combined shape: (6497, 13)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  type  
0      9.4        5     1  
1  

TypeError: XGBModel.fit() got an unexpected keyword argument 'callbacks'