In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
df = pd.read_csv('dataproject2025.csv')

In [3]:
exclude_columns = [
    'target',
    'Predicted probabilities',   
    'Unnamed: 0',  
    'zip_code',
    'Pct_afro_american']

X = df.drop(columns=exclude_columns)
y = df['Predicted probabilities']

In [4]:
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [5]:
emp_length_mapping = {
        '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, 
        '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9, 
        '10+ years': 10
    }

X['emp_length'] = X['emp_length'].map(emp_length_mapping)

In [6]:
emp_title_freq = X['emp_title'].value_counts().to_dict()
X['emp_title'] = X['emp_title'].map(emp_title_freq)

grade_mapping = {'A': 6, 'B': 5, 'C': 4, 'D': 3, 'E': 2, 'F': 1, 'G': 0}
X['grade'] = X['grade'].map(grade_mapping)

home_dummies = pd.get_dummies(X['home_ownership'], prefix='home')
X = pd.concat([X, home_dummies], axis=1)
X = X.drop('home_ownership', axis=1)

purpose_dummies = pd.get_dummies(X['purpose'], prefix='purpose')
X = pd.concat([X, purpose_dummies], axis=1)
X = X.drop('purpose', axis=1)

grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
subgrades = []
for grade in grades:
    for i in range(1, 6):
        subgrades.append(f"{grade}{i}")
subgrade_mapping = {sg: len(subgrades) - i - 1 for i, sg in enumerate(subgrades)}
X['sub_grade'] = X['sub_grade'].map(subgrade_mapping)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

print(f"R²: {r2_lr:.4f}")
print(f"MSE: {mse_lr:.6f}")
print(f"MAE: {mae_lr:.6f}")

R²: 0.9561
MSE: 0.000617
MAE: 0.017883


In [9]:
feature_importance_lr = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr.coef_,
    'Importance': np.abs(lr.coef_)
}).sort_values('Importance', ascending=False)

print("\nTop 10 variables importantes (coefficients):")
for i, row in feature_importance_lr.head(10).iterrows():
    print(f"  {row['Feature']}: {row['Coefficient']:.4f}")


Top 10 variables importantes (coefficients):
  home_MORTGAGE: -531947193.7678
  home_RENT: -523098269.4153
  home_OWN: -327326800.0985
  purpose_debt_consolidation: -92306911.7943
  purpose_credit_card: -76983743.4433
  purpose_other: -44815951.3387
  purpose_home_improvement: -44723117.6904
  purpose_major_purchase: -26180368.0660
  purpose_small_business: -20637390.2637
  purpose_medical: -19544208.2536


In [10]:
dt_params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [20, 50, 100],
    'min_samples_leaf': [10, 20, 50]
}

dt_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42), 
    dt_params, 
    cv=5, 
    scoring='r2',
    n_jobs=-1
)

In [11]:
dt_grid.fit(X_train, y_train)

best_dt = dt_grid.best_estimator_
y_pred_dt = best_dt.predict(X_test)

r2_dt = r2_score(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)

print(f"R²: {r2_dt:.4f}")
print(f"MSE: {mse_dt:.6f}")
print(f"MAE: {mae_dt:.6f}")
print(f"Meilleurs paramètres: {dt_grid.best_params_}")
print(f"Profondeur finale: {best_dt.get_depth()}")

R²: 0.9251
MSE: 0.001053
MAE: 0.024190
Meilleurs paramètres: {'max_depth': 10, 'min_samples_leaf': 20, 'min_samples_split': 20}
Profondeur finale: 10


In [12]:
feature_importance_dt = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_dt.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 variables importantes (arbre):")
for i, row in feature_importance_dt.head(10).iterrows():
    print(f"  {row['Feature']}: {row['Importance']:.4f}")



Top 10 variables importantes (arbre):
  Predictions: 0.6626
  sub_grade: 0.2318
  grade: 0.0226
  avg_cur_bal: 0.0203
  loan duration: 0.0203
  dti: 0.0121
  home_RENT: 0.0074
  mths_since_recent_bc: 0.0074
  fico_range_high: 0.0049
  home_MORTGAGE: 0.0035
