In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt

In [None]:
me_data = pd.read_csv('./dataset/melb_data.csv')
me_data.head()

In [None]:
me_data.describe()

In [3]:
# Imputation des valeurs manquantes
imputer = SimpleImputer(strategy='median')
me_data[me_data.select_dtypes(include=[np.number]).columns] = imputer.fit_transform(me_data.select_dtypes(include=[np.number]))

# Encodage par Target Encoding
if 'Suburb' in me_data.columns:
    suburb_mean_price = me_data.groupby('Suburb')['Price'].mean()
    me_data['Suburb_encoded'] = me_data['Suburb'].map(suburb_mean_price)
    me_data.drop(columns=['Suburb'], inplace=True)

# Création de caractéristiques supplémentaires
me_data['Landsize_squared'] = me_data['Landsize'] ** 2
me_data['BuildingArea_log'] = np.log1p(me_data['BuildingArea'])
me_data['Rooms_Bathroom'] = me_data['Rooms'] * me_data['Bathroom']

# Préparation des données
y = me_data['Price']
fme_features = [
    'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
    'Lattitude', 'Longtitude', 'Suburb_encoded',
    'Landsize_squared', 'BuildingArea_log', 'Rooms_Bathroom'
]
X = me_data[fme_features]

# Discrétisation de la cible
y_binned = pd.qcut(y, q=5, labels=False)

# Normalisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Nettoyage des indices invalides
valid_indices = ~pd.isna(y_binned)
X_scaled = X_scaled[valid_indices]
y = y.reset_index(drop=True)[valid_indices]
y_binned = y_binned.reset_index(drop=True)[valid_indices].astype(int)

# Validation des tailles
print(f"X_scaled: {X_scaled.shape}, y: {y.shape}, y_binned: {y_binned.shape}")

# Validation des catégories
print(f"Distribution dans y_binned: \n{y_binned.value_counts()}")

# Paramètres pour GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Modèle Random Forest
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# GridSearchCV avec 5-Fold CV (non stratifié)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,  # 5-fold CV standard
    verbose=1,
    n_jobs=-1
)

# Séparation des données
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Entraînement du modèle
grid_search.fit(X_train, y_train)

# Résultats
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Performances
print(f"Best MAE (Positive): {-grid_search.best_score_:.2f}")
preds = best_model.predict(X_test)
mae_final = mean_absolute_error(y_test, preds)
print(f"Final Mean Absolute Error on test data: {mae_final:.2f}")

ValueError: Found input variables with inconsistent numbers of samples: [10864, 13580]

In [14]:
# Imputation des valeurs manquantes
imputer = SimpleImputer(strategy='median')
me_data[me_data.select_dtypes(include=[np.number]).columns] = imputer.fit_transform(me_data.select_dtypes(include=[np.number]))

# Encoder les colonnes catégoriques via Target Encoding
if 'Suburb' in me_data.columns:
    suburb_mean_price = me_data.groupby('Suburb')['Price'].mean()
    me_data['Suburb_encoded'] = me_data['Suburb'].map(suburb_mean_price)
    me_data.drop(columns=['Suburb'], inplace=True)

# Créer des caractéristiques supplémentaires
me_data['Landsize_squared'] = me_data['Landsize'] ** 2
me_data['BuildingArea_log'] = np.log1p(me_data['BuildingArea'])
me_data['Rooms_Bathroom'] = me_data['Rooms'] * me_data['Bathroom']

# Sélectionner les caractéristiques importantes et la cible
y = me_data['Price']
fme_features = [
    'Suburb_encoded', 'Rooms', 'Longtitude', 'Lattitude', 
    'Landsize_squared', 'Landsize', 'Rooms_Bathroom',
    'YearBuilt', 'BuildingArea_log', 'BuildingArea', 'Bathroom', 'Postcode', 'Bedroom2', 'Distance'
]

X = me_data[fme_features]

# Normalisation des données
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Entraîner le modèle avec les meilleurs paramètres
best_params = {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
model = RandomForestRegressor(
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators'],
    bootstrap= False,
    random_state=42
)
model.fit(X_train, y_train)

# Prédictions et évaluation
preds = model.predict(X_test)
mae_final = mean_absolute_error(y_test, preds)
print("Final Mean Absolute Error:", mae_final)

Final Mean Absolute Error: 226925.9660191458
