In [2]:
# Basic setup
import pandas as pd
import numpy as np

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.8/150.0 MB 12.7 MB/s eta 0:00:12
   - -------------------------------------- 5.2/150.0 MB 14.9 MB/s eta 0:00:10
   -- ------------------------------------- 9.4/150.0 MB 16.4 MB/s eta 0:00:09
   --- ------------------------------------ 13.1/150.0 MB 16.8 MB/s eta 0:00:09
   ---- ----------------------------------- 16.8/150.0 MB 17.5 MB/s eta 0:00:08
   ----- ---------------------------------- 21.0/150.0 MB 17.5 MB/s eta 0:00:08
   ------ --------------------------------- 24.4/150.0 MB 17.2 MB/s eta 0:00:08
   ------- -------------------------------- 28.6/150.0 MB 17.6 MB/s eta 0:00:07
   -------- ------------------------------- 32.5/150.0 MB 17.8 MB/s eta 0:00:07
   --------- ------------------------------ 36.7/150.0 MB 18.1 M


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from xgboost import XGBClassifier


In [6]:
print(df.columns)


Index(['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')


In [7]:
# Create a new column with the name of the top product category
product_cols = ['MntWines', 'MntFruits', 'MntMeatProducts',
                'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']

df["Top_Product"] = df[product_cols].idxmax(axis=1)


In [8]:
y = df["Top_Product"]
X = df.drop(product_cols + ["Top_Product", "Dt_Customer"], axis=1)
X = pd.get_dummies(X, drop_first=True)


In [12]:
# Check how many NaNs exist now
print(df.isna().sum())

# Or just total rows with any NaN
print("Rows with NaN:", df.isna().any(axis=1).sum())


Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
Top_Product             0
dtype: int64
Rows with NaN: 24


In [13]:
df_clean = df.dropna()


In [14]:
# Set target and features
y = df_clean["Top_Product"]
X = df_clean.drop(product_cols + ["Top_Product", "Dt_Customer"], axis=1)

# Encode categoricals
X = pd.get_dummies(X, drop_first=True)

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode y labels numerically for XGBoost (e.g., MntWines → 0, etc.)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


In [17]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train with encoded labels ONLY for XGBoost
    if name == "XGBoost":
        model.fit(X_train, y_train_enc)
        y_pred = model.predict(X_test)
        y_pred_labels = le.inverse_transform(y_pred)  # Convert back to string labels
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_labels = y_pred

    # Evaluation
    acc = accuracy_score(y_test, y_pred_labels)
    prec = precision_score(y_test, y_pred_labels, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred_labels, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred_labels, average='macro', zero_division=0)

    # Probabilities + AUC-ROC
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
        y_test_bin = LabelBinarizer().fit_transform(y_test)
        auc = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovr')
    else:
        auc = "N/A"

    results.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1-Score": round(f1, 4),
        "AUC-ROC (Macro)": round(auc, 4) if auc != "N/A" else "N/A"
    })



Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Training k-NN...

Training Decision Tree...

Training Random Forest...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [19]:
results_df = pd.DataFrame(results).set_index("Model")


In [20]:
print(results_df)


                     Accuracy  Precision  Recall  F1-Score  AUC-ROC (Macro)
Model                                                                      
Logistic Regression    0.7365     0.2010  0.2207    0.2065           0.7828
k-NN                   0.6959     0.4618  0.3824    0.4009           0.7299
Decision Tree          0.7095     0.3747  0.4168    0.3842           0.6586
Random Forest          0.7883     0.5106  0.4132    0.4437           0.8855
Logistic Regression    0.7365     0.2010  0.2207    0.2065           0.7828
k-NN                   0.6959     0.4618  0.3824    0.4009           0.7299
Decision Tree          0.7095     0.3747  0.4168    0.3842           0.6586
Random Forest          0.7883     0.5106  0.4132    0.4437           0.8855
XGBoost                0.7725     0.5331  0.4313    0.4666           0.8896


### HyperParameter Tuning

In [22]:
from sklearn.preprocessing import LabelEncoder

# Encode string labels into integers (needed for XGBoost)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


In [23]:
for name, model in models.items():
    print(f"\n🔍 Tuning {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
    
    if name == "XGBoost":
        grid.fit(X_train, y_train_enc)
        best_models[name] = grid.best_estimator_
    else:
        grid.fit(X_train, y_train)
        best_models[name] = grid.best_estimator_
    
    print("✅ Best Params:", grid.best_params_)



🔍 Tuning Logistic Regression...
Fitting 3 folds for each of 16 candidates, totalling 48 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Best Params: {'C': 0.1, 'max_iter': 300, 'solver': 'lbfgs'}

🔍 Tuning k-NN...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
✅ Best Params: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}

🔍 Tuning Decision Tree...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
✅ Best Params: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}

🔍 Tuning Random Forest...
Fitting 3 folds for each of 48 candidates, totalling 144 fits
✅ Best Params: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

🔍 Tuning XGBoost...
Fitting 3 folds for each of 32 candidates, totalling 96 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}


In [24]:
from sklearn.metrics import classification_report, roc_auc_score

# Store results
final_results = []

print("\n🔍 Final Evaluation of Tuned Models:\n")

for name, model in best_models.items():
    print(f"📌 {name}")
    
    # Use encoded labels only for XGBoost
    if name == "XGBoost":
        y_pred = model.predict(X_test)
        y_true = y_test_enc
        proba = model.predict_proba(X_test)
    else:
        y_pred = model.predict(X_test)
        y_true = y_test
        proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Print classification report
    print(classification_report(y_true, y_pred, target_names=le.classes_))

    # Calculate macro AUC-ROC
    try:
        auc_macro = roc_auc_score(le.transform(y_true), proba, multi_class='ovr', average='macro')
    except:
        auc_macro = "N/A"

    # Append to final results
    final_results.append({
        "Model": name,
        "Accuracy": round(accuracy_score(y_true, y_pred), 4),
        "Precision": round(precision_score(y_true, y_pred, average='macro', zero_division=0), 4),
        "Recall": round(recall_score(y_true, y_pred, average='macro', zero_division=0), 4),
        "F1-Score": round(f1_score(y_true, y_pred, average='macro', zero_division=0), 4),
        "AUC-ROC (Macro)": round(auc_macro, 4) if auc_macro != "N/A" else "N/A"
    })
    
    print(f"🔹 AUC-ROC (Macro): {auc_macro}\n" + "-"*60 + "\n")

# Show final performance table
results_df_tuned = pd.DataFrame(final_results).set_index("Model")
print("✅ Summary Table After Hyperparameter Tuning:")
display(results_df_tuned)



🔍 Final Evaluation of Tuned Models:

📌 Logistic Regression
                  precision    recall  f1-score   support

 MntFishProducts       0.00      0.00      0.00        12
       MntFruits       0.00      0.00      0.00         2
    MntGoldProds       0.41      0.30      0.35        30
 MntMeatProducts       0.00      0.00      0.00        77
MntSweetProducts       0.00      0.00      0.00         3
        MntWines       0.75      0.99      0.85       320

        accuracy                           0.73       444
       macro avg       0.19      0.22      0.20       444
    weighted avg       0.57      0.73      0.64       444

🔹 AUC-ROC (Macro): 0.7929455985598715
------------------------------------------------------------

📌 k-NN
                  precision    recall  f1-score   support

 MntFishProducts       0.20      0.08      0.12        12
       MntFruits       0.50      0.50      0.50         2
    MntGoldProds       0.32      0.40      0.36        30
 MntMeatProducts 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                  precision    recall  f1-score   support

 MntFishProducts       0.17      0.08      0.11        12
       MntFruits       1.00      0.50      0.67         2
    MntGoldProds       0.43      0.50      0.46        30
 MntMeatProducts       0.60      0.47      0.53        77
MntSweetProducts       0.00      0.00      0.00         3
        MntWines       0.87      0.93      0.90       320

        accuracy                           0.79       444
       macro avg       0.51      0.41      0.44       444
    weighted avg       0.77      0.79      0.78       444

🔹 AUC-ROC (Macro): 0.8854542325209951
------------------------------------------------------------

📌 XGBoost
                  precision    recall  f1-score   support

 MntFishProducts       0.29      0.17      0.21        12
       MntFruits       1.00      0.50      0.67         2
    MntGoldProds       0.52      0.57      0.54        30
 MntMeatProducts       0.55      0.48      0.51        77
MntSweetProducts

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1-Score,AUC-ROC (Macro)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.7342,0.1934,0.2151,0.2001,0.7929
k-NN,0.6982,0.3685,0.3591,0.3589,0.7535
Decision Tree,0.7027,0.3693,0.4168,0.3817,0.6578
Random Forest,0.7883,0.5106,0.4132,0.4437,0.8855
XGBoost,0.7883,0.5375,0.4382,0.4707,


In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [26]:
from sklearn.ensemble import VotingClassifier
ensemble = VotingClassifier(estimators=[
    ('rf', best_models['Random Forest']),
    ('xgb', best_models['XGBoost'])
], voting='soft')


In [27]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)

[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tensorflow


In [30]:
pip install tensorflow keras scikit-learn pandas matplotlib


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)

[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tensorflow


In [45]:
print(df.columns.tolist())


['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response', 'Top_Product']


In [47]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Select features and drop rows with missing values
features = ['Income', 'Recency', 'MntWines', 'MntMeatProducts']
df_cluster = df.dropna(subset=features).copy()

# Scale features
scaler = StandardScaler()
scaled = scaler.fit_transform(df_cluster[features])

# Apply KMeans
kmeans = KMeans(n_clusters=4, random_state=42)
df_cluster['Customer Segment'] = kmeans.fit_predict(scaled)

# Sample 10 for dashboard
dashboard = df_cluster[['Top_Product', 'Customer Segment']].sample(10, random_state=42)
dashboard.reset_index(inplace=True)
dashboard.rename(columns={'index': 'Dt_customer', 'Top_Product': 'Predicted Product'}, inplace=True)

# Show dashboard
print("📊 Dashboard Snapshot:")
print(dashboard)


📊 Dashboard Snapshot:
   Dt_customer Predicted Product  Customer Segment
0          961          MntWines                 2
1          229   MntMeatProducts                 2
2         1093          MntWines                 0
3          427          MntWines                 3
4         1650   MntFishProducts                 2
5          543          MntWines                 3
6         1801          MntWines                 1
7         1708          MntWines                 2
8          994          MntWines                 2
9          976          MntWines                 1
