In [27]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Make "plotly_white" the default Plotly template for figures created after this point.
pio.templates.default = "plotly_white"

In [28]:
from libs.transformer_utils import encode_labels

## 1. Loading the Data set

In [29]:
# Read the CSV written by an earlier pipeline step into the working DataFrame.
dataset_path = '../data/outputs/06_final_dataset.csv'
df = pd.read_csv(dataset_path)

## 2. Encode the target variable as categorical labels

In [30]:
# encode_labels comes from libs.transformer_utils; presumably it converts the
# listed columns to integer class codes (the info() output further down
# reports int64) — TODO confirm against the helper.
label_columns = ['race_rank']
df = encode_labels(df, label_columns)

In [31]:
# Inspect the dtype and non-null count of the target column.
target_series = df['race_rank']
target_series.info()

<class 'pandas.core.series.Series'>
RangeIndex: 26080 entries, 0 to 26079
Series name: race_rank
Non-Null Count  Dtype
--------------  -----
26080 non-null  int64
dtypes: int64(1)
memory usage: 203.9 KB


## 3. Split the dataset into training and test sets

In [32]:
# Column to predict; every other column of df is used as a feature.
target_col = 'race_rank'
features = [col for col in df.columns.to_list() if col != target_col]

# Build the feature matrix (X) and the target vector (y).
X, y = df[features], df[target_col]

# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [33]:
# Sanity check: print the sorted distinct class labels found in each split,
# so a class missing from either side is easy to spot.
train_labels = sorted(y_train.unique().tolist())
test_labels = sorted(y_test.unique().tolist())
print(train_labels)
print(test_labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]


In [35]:
# Fit an XGBoost classifier, constructed with no explicit hyperparameters,
# on the training split.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Accuracy: 0.14148773006134968
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.59      0.52       315
           1       0.26      0.29      0.27       331
           2       0.24      0.21      0.22       344
           3       0.16      0.17      0.16       323
           4       0.16      0.17      0.17       334
           5       0.15      0.16      0.15       327
           6       0.11      0.10      0.11       342
           7       0.11      0.11      0.11       336
           8       0.07      0.07      0.07       300
           9       0.13      0.10      0.12       346
          10       0.11      0.10      0.10       304
          11       0.10      0.08      0.09       334
          12       0.08      0.07      0.07       335
          13       0.09      0.07      0.08       336
          14       0.07      0.06      0.07       315
          15       0.06      0.05      0.06       322
          16       0.06     

In [36]:
# Build the confusion matrix from the held-out labels and the predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
n_actual, n_predicted = conf_matrix.shape

# Tick labels for the heatmap axes.
# NOTE(review): these labels are 1-based, while the encoded classes printed
# for the train/test splits start at 0 — confirm the offset is intended.
predicted_ticks = [f"Predicted {idx}" for idx in range(1, n_predicted + 1)]
actual_ticks = [f"Actual {idx}" for idx in range(1, n_actual + 1)]

# Render the matrix as a 1200x800 px heatmap.
fig = px.imshow(
    conf_matrix,
    labels={"x": "Predicted", "y": "Actual"},
    x=predicted_ticks,
    y=actual_ticks,
    title="Confusion Matrix",
    width=1200,
    height=800,
)
fig.show()

## Analysing feature importance

In [37]:
# Column names of the training matrix and the matching importance scores
# exposed by the fitted model.
feature_names = X_train.columns
feature_importances = model.feature_importances_

# One row per feature, ordered from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

In [38]:
# Bar chart with one bar per feature, showing its importance score.
axis_titles = {'Importance': 'Importance Score', 'Feature': 'Feature Name'}
fig = px.bar(
    feature_importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importances',
    labels=axis_titles,
    width=1200,
    height=700,
)
fig.show()