In [16]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Read the raw penguin measurements from the project-relative CSV and preview
# the first rows to confirm that the columns loaded as expected.
csv_path = "data/penguins_size.csv"
penguins = pd.read_csv(csv_path)
penguins.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# Discard every row that contains at least one missing value (row 3 of the
# preview above is an example), then preview the cleaned frame.
penguins = penguins.dropna(axis="index", how="any")
penguins.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


### Gradient Boost

In [13]:
# Features are every column except the target; the target is the species label.
X = penguins.drop(['species'],axis=1)
y = penguins['species']

# One-hot encode the categorical predictors. drop_first=True keeps k-1
# indicator columns for a categorical variable with k levels.
X = pd.get_dummies(X, drop_first=True)

# Hold-out split. X_train / y_train are reused by the next Gradient Boosting
# cells. NOTE(review): X_test / y_test are never scored in the cells shown.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline model: 100 depth-1 trees with a small learning rate.
# random_state pins the estimator's internal randomness so that re-running
# the cell reproduces the same scores.
GradientBoost1 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=42,
)

GradientBoost1.fit(X_train,y_train)

# 10-fold cross-validation repeated 3 times (30 fits in total).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# cross_val_score fits fresh clones of the estimator on each fold of the full
# (X, y), so the fit on X_train above does not influence these scores.
scores = cross_val_score(GradientBoost1, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print("Cross-validation accuracy:", scores.mean())

Cross-validation accuracy: 0.992038027332145
Cross-validation accuracy: 0.992038027332145


In [12]:
# Second configuration: more trees (200) and deeper trees (max_depth=3) at the
# same learning rate. random_state pins the estimator's internal randomness so
# that re-running the cell reproduces the same scores.
GradientBoost2 = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# Relies on X_train / y_train (and X, y below) left in the kernel by an
# earlier cell.
GradientBoost2.fit(X_train,y_train)

# 10-fold cross-validation repeated 3 times (30 fits in total).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# cross_val_score fits fresh clones of the estimator on each fold, so the fit
# above does not influence these scores.
scores = cross_val_score(GradientBoost2, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print("Cross-validation accuracy:", scores.mean())

Cross-validation accuracy: 0.983986928104575
Cross-validation accuracy: 0.983986928104575


In [32]:
# Third configuration: 300 depth-5 trees with learning_rate=1, i.e. each
# tree's contribution is not shrunk. random_state pins the estimator's
# internal randomness so that re-running the cell reproduces the same scores.
GradientBoost3 = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=1,
    max_depth=5,
    random_state=42,
)

# Relies on X_train / y_train (and X, y below) left in the kernel by an
# earlier cell.
GradientBoost3.fit(X_train,y_train)

# 10-fold cross-validation repeated 3 times (30 fits in total).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# cross_val_score fits fresh clones of the estimator on each fold, so the fit
# above does not influence these scores.
scores = cross_val_score(GradientBoost3, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print("Cross-validation accuracy:", scores.mean())

Cross-validation accuracy: 0.9830956625074272
Cross-validation accuracy: 0.9830956625074272


### Dropping variables to make it more difficult

In [49]:
# Harder variant: besides the target, also withhold `island` and
# `body_mass_g` from the predictors.
X = penguins.drop(['species','island','body_mass_g'],axis=1)
y = penguins['species']

# One-hot encode the remaining categorical predictors. drop_first=True keeps
# k-1 indicator columns for a categorical variable with k levels.
X = pd.get_dummies(X, drop_first=True)

# Hold-out split on the reduced feature set (overwrites the earlier split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Same hyperparameters as the baseline Gradient Boosting model, so the only
# change is the feature set. random_state pins the estimator's internal
# randomness so that re-running the cell reproduces the same scores.
GradientBoost1 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=42,
)

GradientBoost1.fit(X_train,y_train)

# 10-fold cross-validation repeated 3 times (30 fits in total).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# cross_val_score fits fresh clones of the estimator on each fold, so the fit
# above does not influence these scores.
scores = cross_val_score(GradientBoost1, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Report
print("Cross-validation accuracy:", scores.mean())


Cross-validation accuracy: 0.9841651812240049
Cross-validation accuracy: 0.9841651812240049


### XGBoost

In [46]:
# Rebuild the full feature matrix (all predictors, one-hot encoded) and the
# species target for the XGBoost experiments.
y = penguins['species']
X = pd.get_dummies(penguins.drop(columns=['species']), drop_first=True)

# Map the species names to integer class labels for XGBoost.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# First XGBoost configuration: 100 depth-3 trees with a small learning rate.
# eval_metric is set explicitly (author's note: avoids a multi-class warning).
xgb1_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'eval_metric': 'mlogloss',
}
xgb_model1 = XGBClassifier(**xgb1_params)

xgb_model1.fit(X_train, y_train)

# 10-fold cross-validation repeated 3 times on the full data set.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    xgb_model1,
    X,
    y_encoded,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

mean_accuracy = scores.mean()
print("Cross-validation accuracy:", mean_accuracy)

Cross-validation accuracy: 0.987997623291741
Cross-validation accuracy: 0.987997623291741


In [37]:
# Second XGBoost configuration: 100 depth-1 trees with learning_rate=1.
# eval_metric is set explicitly (author's note: avoids a multi-class warning).
xgb2_params = {
    'n_estimators': 100,
    'learning_rate': 1,
    'max_depth': 1,
    'eval_metric': 'mlogloss',
}
xgb_model2 = XGBClassifier(**xgb2_params)

# Uses the train split and the encoded target left in the kernel by an
# earlier cell.
xgb_model2.fit(X_train, y_train)

# 10-fold cross-validation repeated 3 times on the full data set.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    xgb_model2,
    X,
    y_encoded,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

mean_accuracy = scores.mean()
print("Cross-validation accuracy:", mean_accuracy)

Cross-validation accuracy: 0.9909387997623291
Cross-validation accuracy: 0.9909387997623291


### Dropping variables to make it more difficult to predict

In [47]:
# Harder variant for XGBoost: withhold `island` and `body_mass_g` as well as
# the target, then one-hot encode what remains.
y = penguins['species']
X = pd.get_dummies(
    penguins.drop(columns=['species', 'island', 'body_mass_g']),
    drop_first=True,
)

# Map the species names to integer class labels for XGBoost.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Same hyperparameters as the earlier xgb_model2, now on the reduced features.
# eval_metric is set explicitly (author's note: avoids a multi-class warning).
xgb2_params = {
    'n_estimators': 100,
    'learning_rate': 1,
    'max_depth': 1,
    'eval_metric': 'mlogloss',
}
xgb_model2 = XGBClassifier(**xgb2_params)

xgb_model2.fit(X_train, y_train)

# 10-fold cross-validation repeated 3 times on the full data set.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(
    xgb_model2,
    X,
    y_encoded,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

mean_accuracy = scores.mean()
print("Cross-validation accuracy:", mean_accuracy)

Cross-validation accuracy: 0.9830065359477124
Cross-validation accuracy: 0.9830065359477124


In [42]:
# Base XGBoost classifier for the grid search. The hyperparameters listed in
# param_grid below are filled in by GridSearchCV; random_state is fixed so the
# row/column subsampling is repeatable.
# (A previous, immediately-overwritten definition of xgb_clf that passed the
# deprecated `use_label_encoder` argument has been removed.)
xgb_clf = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=3,              # number of penguin species
    random_state=1
)

# Grid of hyperparameters: 2 values for each of 5 parameters = 32 candidates.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid search with repeated cross-validation: 10 folds x 3 repeats = 30 fits
# per candidate (960 fits in total).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

grid = GridSearchCV(estimator=xgb_clf, param_grid=param_grid,
                    scoring='accuracy', cv=cv, n_jobs=-1, verbose=1)

# Fit grid search.
# NOTE(review): X and y_encoded come from whichever earlier cell ran last.
# Run top-to-bottom, that is the reduced feature set (no `island` /
# `body_mass_g`); the out-of-order execution counts suggest the recorded
# output may have been produced with a different X. Confirm before relying
# on the reported best parameters.
grid.fit(X, y_encoded)

# Best model and score
print("Best Accuracy Score: {:.3f}".format(grid.best_score_))
print("Best Parameters:", grid.best_params_)

Fitting 30 folds for each of 32 candidates, totalling 960 fits
Fitting 30 folds for each of 32 candidates, totalling 960 fits


Best Accuracy Score: 0.991
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best Accuracy Score: 0.991
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


In [1]:
# Refit XGBoost with the best hyperparameters reported by the grid search.
# random_state=1 matches the estimator used inside the grid search; because
# subsample and colsample_bytree are below 1, the seed affects which rows and
# columns each tree sees, so it is needed to reproduce the tuned model.
#
# NOTE(review): this cell needs the import cell at the top of the notebook to
# have been run first. The recorded NameError appears to come from executing
# it on a fresh kernel (execution count 1).
xgb_model3 = XGBClassifier(
    colsample_bytree = 0.8,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample = 0.8,
    eval_metric='mlogloss',   # avoids warning for multi-class
    random_state=1
)

# Uses the train split and the encoded target left in the kernel by an
# earlier cell.
xgb_model3.fit(X_train,y_train)

# 10-fold cross-validation repeated 3 times, the same scheme as the grid search.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# cross_val_score fits fresh clones of the estimator on each fold, so the fit
# above does not influence these scores.
scores = cross_val_score(xgb_model3, X, y_encoded, scoring='accuracy', cv=cv, n_jobs=-1)

print("Cross-validation accuracy:", scores.mean())

NameError: name 'XGBClassifier' is not defined

When fitting the best AdaBoost, XGBoost, and Gradient Boosting models, I get very close to 100% accuracy on this dataset, so overall all of the boosting models perform well. Even after dropping some predictor variables (`island` and `body_mass_g`), every model still achieved over 98% accuracy. I think this dataset is a poor benchmark for judging the models against each other, because the species are very easy to classify.