In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from utils import load_processed_data 
from plotting_utils import plot_confusion_matrix, plot_roc_curve

## 1. Load and Prepare Data

We'll use particle-level features from the jet data. See the Kaggle competition page for a detailed description of the data.

In [None]:
# Pull the pre-processed splits from the project helper. It returns eight
# objects which, going by their names, are the features, labels and ids for
# the training and validation splits plus the features and ids for the test
# split (no test labels are returned).
(
    X_train, y_train, train_ids,
    X_val, y_val, val_ids,
    X_test, test_ids,
) = load_processed_data()

# Show the dimensions of the training feature matrix as the cell output.
X_train.shape

In [None]:
# Preview the first few rows of the training features as a quick sanity check
# of the column names and value ranges before training.
X_train.head()

## 2. Train XGBoost Model

XGBoost has several important hyperparameters:
- `n_estimators`: How many trees do we use?
- `max_depth`: How deep is each tree, i.e. how many successive decisions can it make? Can you see a reason why we might want to set a maximum here?
- `learning_rate`: How big is our gradient step?
- `objective`: Learning task and objective function - since we can do classification or regression it's important to select the right one here. 

In [None]:
# Hyperparameters for the boosted-tree classifier, gathered in one place so
# they are easy to find and tweak.
xgb_params = {
    "n_estimators": 500,             # number of boosting rounds
    "max_depth": 10,                 # maximum tree depth
    "learning_rate": 0.2,            # step size shrinkage
    "objective": "binary:logistic",  # binary classification
    "random_state": 42,              # fixed seed
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split. The validation split is handed over as an
# evaluation set and verbose=True asks for the evaluation log to be printed
# while training runs.
validation_set = [(X_val, y_val)]
model.fit(X_train, y_train, eval_set=validation_set, verbose=True)

## 3. Evaluate Model

Let's evaluate our model using:
- Accuracy: Overall prediction accuracy
- Confusion Matrix: A standard tool for classification tasks. How often did we correctly classify or misclassify each category? It's common that some categories are harder than others.

In [None]:
# Make predictions on the validation split.

# Probability above which an event is assigned label 1. Defined once so the
# accuracy and the confusion matrix are guaranteed to use the same cut.
DECISION_THRESHOLD = 0.5

# predict_proba gives probabilities for both categories - we only want the one
# for ttbar, so we select column 1. With a binary classification, the
# probability for one category implies the other.
y_pred = model.predict_proba(X_val)[:, 1]

# Accuracy and the confusion matrix need hard labels 0 and 1, so we derive
# them from the probabilities with the threshold above.
discrete_pred = np.where(y_pred > DECISION_THRESHOLD, 1, 0)

# Calculate accuracy, reusing the thresholded labels rather than recomputing
# them inline.
accuracy = accuracy_score(y_val, discrete_pred)
# The score is measured on the validation split, so it is reported as such
# (the test split has no labels in this notebook).
print(f"Validation Accuracy: {accuracy:.4f}")

# Plot confusion matrix
plot_confusion_matrix(y_val, discrete_pred)

In [None]:
# Pass the validation labels together with the continuous predicted
# probabilities (not the thresholded 0/1 labels) to the ROC plotting helper.
# NOTE(review): plot_roc_curve is imported from plotting_utils; its exact
# output and return value are not visible here - confirm against that module.
plot_roc_curve(y_val, y_pred)

## 4. Feature Importance Analysis

One of the advantages of BDTs is that they provide feature importance scores, which help us understand which features are most important for the classification task. Beware that this doesn't translate directly to which are the best features in the data - only to what the model thinks. So if you have a bad model, your feature importance will be equally useless. Another model could also pick up on features that the BDT didn't, so it's only an indicator. 

In [None]:
# Pair every training column with the importance score reported by the fitted
# model and order the table from the highest score to the lowest.
importance = model.feature_importances_
feature_importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importance}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart with one bar per feature, drawn on an explicit
# figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

## 5. Make Kaggle Predictions


In [None]:
# Score the test split and keep only the second probability column (index 1),
# the same column used for the validation predictions.
test_proba = model.predict_proba(X_test)
test_predictions = test_proba[:, 1]

# Build the submission table with an 'id' and a 'label' column and write it
# to disk without the DataFrame index.
submission_columns = {'id': test_ids, 'label': test_predictions}
solution = pd.DataFrame(submission_columns)
solution.to_csv('submission.csv', index=False)