In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
df = pd.read_csv('/Users/guenounraphael/Desktop/HEC/Algo_Fairness/dataproject2025.csv')

## Encoding


In [33]:
exclude_columns = [
    'target',
    'Predicted probabilities',
    'Predictions',   
    'Unnamed: 0',  
    'zip_code',
    'Pct_afro_american']

X = df.drop(columns=exclude_columns)
y = df['Predicted probabilities']

In [34]:
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [35]:
emp_length_mapping = {
        '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, 
        '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9, 
        '10+ years': 10
    }

X['emp_length'] = X['emp_length'].map(emp_length_mapping)

In [36]:
emp_title_freq = X['emp_title'].value_counts().to_dict()
X['emp_title'] = X['emp_title'].map(emp_title_freq)

grade_mapping = {'A': 6, 'B': 5, 'C': 4, 'D': 3, 'E': 2, 'F': 1, 'G': 0}
X['grade'] = X['grade'].map(grade_mapping)

home_dummies = pd.get_dummies(X['home_ownership'], prefix='home', drop_first=True)  
X = pd.concat([X, home_dummies], axis=1)
X = X.drop('home_ownership', axis=1)

purpose_dummies = pd.get_dummies(X['purpose'], prefix='purpose', drop_first=True)  # drop_first=True !
X = pd.concat([X, purpose_dummies], axis=1)
X = X.drop('purpose', axis=1)

grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
subgrades = []
for grade in grades:
    for i in range(1, 6):
        subgrades.append(f"{grade}{i}")
subgrade_mapping = {sg: len(subgrades) - i - 1 for i, sg in enumerate(subgrades)}
X['sub_grade'] = X['sub_grade'].map(subgrade_mapping)


In [37]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression

In [38]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

print(f"R²: {r2_lr:.4f}")
print(f"MSE: {mse_lr:.6f}")
print(f"MAE: {mae_lr:.6f}")

R²: 0.9375
MSE: 0.000878
MAE: 0.021028


In [41]:
feature_importance_lr = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr.coef_,
    'Importance': np.abs(lr.coef_)
}).sort_values('Importance', ascending=False)

print("\nTop 10 variables importantes (coefficients):")
for i, row in feature_importance_lr.iterrows():
    print(f"  {row['Feature']}: {row['Coefficient']:.4f}")


Top 10 variables importantes (coefficients):
  sub_grade: -0.0662
  loan duration: 0.0372
  dti: 0.0217
  home_RENT: 0.0162
  fico_range_high: -0.0146
  funded_amnt: 0.0140
  mort_acc: -0.0112
  purpose_debt_consolidation: 0.0096
  mths_since_recent_bc: -0.0087
  revol_bal: -0.0078
  mo_sin_old_rev_tl_op: -0.0078
  num_actv_bc_tl: 0.0077
  mo_sin_rcnt_tl: -0.0075
  grade: -0.0070
  inq_last_6mths: 0.0066
  purpose_small_business: 0.0064
  purpose_credit_card: 0.0064
  revol_util: -0.0061
  purpose_home_improvement: 0.0061
  open_acc: 0.0059
  int_rate: -0.0058
  num_bc_tl: -0.0057
  bc_open_to_buy: -0.0055
  emp_title: -0.0048
  home_OWN: 0.0046
  delinq_2yrs: 0.0045
  num_il_tl: -0.0040
  pub_rec: 0.0038
  purpose_major_purchase: 0.0038
  avg_cur_bal: -0.0036
  bc_util: -0.0036
  annual_inc: -0.0032
  purpose_other: 0.0029
  purpose_medical: 0.0029
  purpose_wedding: -0.0020
  pub_rec_bankruptcies: 0.0019
  purpose_vacation: 0.0019
  purpose_moving: 0.0016
  mo_sin_rcnt_rev_tl_op: 0.

## Decision Tree Regressor

In [21]:
dt_params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [20, 50, 100],
    'min_samples_leaf': [10, 20, 50]
}

dt_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42), 
    dt_params, 
    cv=5, 
    scoring='r2',
    n_jobs=-1
)

In [22]:
dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_
y_pred_dt = best_dt.predict(X_test)

r2_dt = r2_score(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)

print(f"R²: {r2_dt:.4f}")
print(f"MSE: {mse_dt:.6f}")
print(f"MAE: {mae_dt:.6f}")
print(f"Meilleurs paramètres: {dt_grid.best_params_}")
print(f"Profondeur finale: {best_dt.get_depth()}")

python(39147) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39148) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39149) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39150) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39151) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39152) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39153) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39154) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39155) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(39156) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


R²: 0.9020
MSE: 0.001377
MAE: 0.026781
Meilleurs paramètres: {'max_depth': 10, 'min_samples_leaf': 20, 'min_samples_split': 20}
Profondeur finale: 10


In [23]:
feature_importance_dt = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_dt.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 variables importantes (arbre):")
for i, row in feature_importance_dt.head(10).iterrows():
    print(f"  {row['Feature']}: {row['Importance']:.4f}")



Top 10 variables importantes (arbre):
  sub_grade: 0.7558
  loan duration: 0.1184
  avg_cur_bal: 0.0574
  dti: 0.0218
  mths_since_recent_bc: 0.0139
  home_RENT: 0.0111
  fico_range_high: 0.0092
  mort_acc: 0.0046
  mo_sin_rcnt_tl: 0.0027
  grade: 0.0013
