In [19]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Use the "plotly_white" template as the default for every Plotly figure created below.
pio.templates.default = "plotly_white"

## 1. Loading the dataset

In [20]:
# Read the dataset CSV into a DataFrame; the path is relative to the notebook's
# working directory.
dataset_path = '../data/outputs/06_final_dataset.csv'
df = pd.read_csv(dataset_path)

## 2. Set the target value as categorical

In [21]:
# Convert the target column to pandas' categorical dtype so it is handled as a
# set of discrete classes rather than as a numeric quantity.
df['race_rank'] = df['race_rank'].astype('category')

In [22]:
# Sanity check: print the summary of the target column to confirm its dtype
# and non-null count after the conversion.
race_rank_column = df['race_rank']
race_rank_column.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6465 entries, 0 to 6464
Series name: race_rank
Non-Null Count  Dtype   
--------------  -----   
6465 non-null   category
dtypes: category(1)
memory usage: 7.2 KB


## 3. Split the data into training and test sets

In [23]:
# Name of the column the model has to predict.
target_col = 'race_rank'

# Every column other than the target is used as a predictor.
features = df.columns.to_list()
features.remove(target_col)

# Build the feature matrix (X) and the target vector (y).
X = df[features]
y = df[target_col]

# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [24]:
# Create an XGBoost classifier with its default hyper-parameters and train it
# on the training split (fit returns the fitted estimator itself).
model = xgb.XGBClassifier().fit(X_train, y_train)

# Predict the class of every row in the held-out test split.
y_pred = model.predict(X_test)

# Score the predictions: overall accuracy plus per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Accuracy: 0.10979381443298969
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.66      0.56        79
           1       0.19      0.16      0.17       101
           2       0.12      0.15      0.13        82
           3       0.08      0.10      0.09        84
           4       0.16      0.16      0.16       101
           5       0.13      0.10      0.12        97
           6       0.04      0.03      0.04        91
           7       0.10      0.08      0.09        95
           8       0.10      0.11      0.10        82
           9       0.06      0.05      0.05        93
          10       0.03      0.03      0.03        89
          11       0.02      0.02      0.02        94
          12       0.07      0.06      0.07        93
          13       0.05      0.06      0.05        90
          14       0.05      0.05      0.05       103
          15       0.06      0.06      0.06        85
          16       0.08     

In [25]:
# Collect every class that appears in either the true or the predicted labels,
# in sorted order. This is the same ordering confusion_matrix uses by default;
# passing it explicitly lets the axis ticks below be tied to the real classes.
class_labels = sorted(set(y_test) | set(y_pred))

# Calculate the confusion matrix (rows = actual class, columns = predicted class).
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot the confusion matrix as a heatmap. Tick labels are built from the actual
# class values rather than a 1-based counter: the classes start at 0, so a
# range(1, n + 1) labelling would shift every row and column name by one.
fig = px.imshow(
    conf_matrix,
    labels=dict(x="Predicted", y="Actual"),
    x=[f"Predicted {label}" for label in class_labels],
    y=[f"Actual {label}" for label in class_labels],
    title="Confusion Matrix",
    width=1200,  # figure width in pixels
    height=800,  # figure height in pixels
)
fig.show()

## Analysing the feature importance

In [26]:
# Column names of the training matrix and the importance score the fitted
# model assigned to each of them (same order).
feature_names = X_train.columns
feature_importances = model.feature_importances_

# Pair every feature with its score and order the table from the most to the
# least important feature.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

In [27]:
# Bar chart of the importance scores, one bar per feature, in the order of the
# (already sorted) importance table.
axis_labels = {'Importance': 'Importance Score', 'Feature': 'Feature Name'}
fig = px.bar(
    feature_importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importances',
    labels=axis_labels,
    width=1200,
    height=700,
)
fig.show()