In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.preprocessing import  StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
pio.templates.default = "plotly_white"

## 1. Loading the dataset

In [2]:
# Location of the final dataset, relative to the notebook's working directory
dataset_path = '../data/outputs/06_final_dataset.csv'
df = pd.read_csv(dataset_path)

## 2. Set the target variable as categorical

In [3]:
# Store the target column with pandas' category dtype
df['race_rank'] = df['race_rank'].astype('category')

In [4]:
# Print the dtype, non-null count and memory usage of the target column
df['race_rank'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 26080 entries, 0 to 26079
Series name: race_rank
Non-Null Count  Dtype   
--------------  -----   
26080 non-null  category
dtypes: category(1)
memory usage: 26.3 KB


## 3. Split the data into training and test sets

In [5]:
# Name of the target column; every other column is used as a feature
target_col = 'race_rank'
features = [col for col in df.columns if col != target_col]

# Build the feature matrix (X) and the target vector (y)
X = df[features]
y = df[target_col]

# Split the dataset: 30% for testing, 70% for training (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [6]:
# Create an XGBoost classifier with default settings and train it
# (fit returns the fitted estimator itself)
model = xgb.XGBClassifier().fit(X_train, y_train)

# Predict the class of every row in the test set
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:", report, sep="\n")

Accuracy: 0.7299335378323109
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5563
           1       0.13      0.13      0.13       114
           2       0.17      0.18      0.18       103
           3       0.12      0.12      0.12       106
           4       0.09      0.10      0.09       115
           5       0.06      0.05      0.05       112
           6       0.02      0.03      0.03       101
           7       0.07      0.07      0.07        97
           8       0.08      0.06      0.07       116
           9       0.04      0.05      0.05        97
          10       0.06      0.07      0.06       103
          11       0.06      0.06      0.06       122
          12       0.06      0.05      0.05       119
          13       0.06      0.06      0.06       124
          14       0.10      0.08      0.09       119
          15       0.06      0.06      0.06       108
          16       0.02      

In [20]:
# NOTE(review): this cell has execution count 20 but sits before the cell that
# assigns conf_matrix. Unless conf_matrix is defined somewhere not visible here,
# it raises NameError on a fresh kernel (Restart & Run All). Move it below the
# confusion-matrix cell or delete it.
type(conf_matrix)

numpy.ndarray

In [7]:
# Fix the class order explicitly (sorted union of true and predicted labels,
# which is also confusion_matrix's default) so the tick labels built below are
# guaranteed to line up with the rows and columns of the matrix.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

# Calculate confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot confusion matrix. Ticks carry the real class values (which start at 0)
# rather than 1-based positions, which labelled every class one rank too high.
fig = px.imshow(conf_matrix,
    labels=dict(x="Predicted", y="Actual"),
    x=[f"Predicted {label}" for label in class_labels],
    y=[f"Actual {label}" for label in class_labels],
    title="Confusion Matrix",
    width=1200,  # figure width in pixels
    height=800)  # figure height in pixels
fig.show()

## Analysing feature importance

In [8]:
# Feature names come from the training frame; scores come from the fitted model
feature_names = X_train.columns
feature_importances = model.feature_importances_

# One row per feature, ordered from the highest importance score to the lowest
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

In [9]:
# Bar chart of the importance scores, one bar per feature
bar_options = dict(
    x='Feature',
    y='Importance',
    title='Feature Importances',
    labels={'Importance': 'Importance Score', 'Feature': 'Feature Name'},
    width=1200,
    height=700,
)
fig = px.bar(feature_importance_df, **bar_options)
fig.show()

In [10]:
# Create a correlation matrix plot using plotly
def create_corr_matrix(correlation_matrix):
    """Display an annotated heatmap of a correlation matrix with plotly.

    Parameters
    ----------
    correlation_matrix : pandas.DataFrame
        Matrix to plot, e.g. the result of ``DataFrame.corr()``. Column labels
        are placed on the x axis and index labels on the y axis.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``; nothing is returned.
    """
    column_labels = correlation_matrix.columns
    row_labels = correlation_matrix.index

    fig = px.imshow(correlation_matrix,
                    labels=dict(color="Correlation"),
                    x=column_labels,
                    y=row_labels,
                    color_continuous_scale="Viridis",
                    width=1200,
                    height=800)

    # Text annotations with the correlation values. The cell drawn at
    # (x=column, y=row) holds iloc[row, column]; indexing it the other way
    # round only looks right because correlation matrices are symmetric.
    # All annotations are built first and attached in a single layout update
    # instead of one add_annotation call per cell.
    annotations = [
        dict(x=column_labels[col_pos], y=row_labels[row_pos],
             text=f'{correlation_matrix.iloc[row_pos, col_pos]:.2f}',
             showarrow=False, font=dict(color='white'))
        for row_pos in range(len(row_labels))
        for col_pos in range(len(column_labels))
    ]
    fig.update_layout(annotations=annotations)

    # Show the plot
    fig.show()

In [11]:
# Pairwise correlations between the DataFrame's columns, shown as an annotated heatmap
correlation_matrix = df.corr()
create_corr_matrix(correlation_matrix)