In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('./dataset/melb_data.csv')
df.head()

In [None]:
df.describe()

In [9]:
# Charger les données
me_file_path = './dataset/melb_data.csv'
me_data = pd.read_csv(me_file_path)

# Imputation des valeurs manquantes
imputer = SimpleImputer(strategy='median')
me_data[me_data.select_dtypes(include=[np.number]).columns] = imputer.fit_transform(me_data.select_dtypes(include=[np.number]))

# Encoder les colonnes catégoriques via Target Encoding
if 'Suburb' in me_data.columns:
    suburb_mean_price = me_data.groupby('Suburb')['Price'].mean()
    me_data['Suburb_encoded'] = me_data['Suburb'].map(suburb_mean_price)
    me_data.drop(columns=['Suburb'], inplace=True)

# Créer des caractéristiques supplémentaires
me_data['Landsize_squared'] = me_data['Landsize'] ** 2
me_data['BuildingArea_log'] = np.log1p(me_data['BuildingArea'])
me_data['Rooms_Bathroom'] = me_data['Rooms'] * me_data['Bathroom']

# Sélectionner les caractéristiques et la cible
y = me_data['Price']
fme_features = [
    'Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt',
    'Lattitude', 'Longtitude', 'Suburb_encoded',
    'Landsize_squared', 'BuildingArea_log', 'Rooms_Bathroom'
]
X = me_data[fme_features]

# Discrétiser la cible pour utiliser StratifiedKFold
y_binned = pd.qcut(y, q=5, labels=False)  # Diviser les prix en 5 catégories

# Initialiser StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Normalisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Paramètres pour GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],  # Chaînes valides pour max_features
    'bootstrap': [True, False]
}

# Modèle Random Forest
model = RandomForestRegressor(random_state=42, n_jobs=-1)  # Fixer les paramètres constants ici

# GridSearchCV avec StratifiedKFold
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=skf.split(X_scaled, y_binned),  # Validation croisée stratifiée
    verbose=1,
    n_jobs=-1
)

# Entraîner le modèle avec GridSearchCV
grid_search.fit(X_train, y)

# Meilleurs paramètres
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Meilleure performance (MAE)
print(f"Best MAE (Negative): {grid_search.best_score_:.2f}")
print(f"Best MAE (Positive): {-grid_search.best_score_:.2f}")

# Résultats finaux sur tout l'ensemble de données (optionnel)
preds = best_model.predict(X_test)
mae_final = mean_absolute_error(y, preds)
print(f"Final Mean Absolute Error on all data: {mae_final:.2f}")


Fitting 5 folds for each of 486 candidates, totalling 2430 fits


810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
587 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Skayne\Desktop\cours_tech\Machine learning\env\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Skayne\Desktop\cours_tech\Machine learning\env\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\Skayne\Desktop\cours_tech\Machine learning\env\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Skayne\Desktop\cours_tech\Machine learning\

Best Parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
Best MAE (Negative): -172324.04
Best MAE (Positive): 172324.04
Final Mean Absolute Error on all data: 50556.86


In [14]:
# Imputation des valeurs manquantes
imputer = SimpleImputer(strategy='median')
me_data[me_data.select_dtypes(include=[np.number]).columns] = imputer.fit_transform(me_data.select_dtypes(include=[np.number]))

# Encoder les colonnes catégoriques via Target Encoding
if 'Suburb' in me_data.columns:
    suburb_mean_price = me_data.groupby('Suburb')['Price'].mean()
    me_data['Suburb_encoded'] = me_data['Suburb'].map(suburb_mean_price)
    me_data.drop(columns=['Suburb'], inplace=True)

# Créer des caractéristiques supplémentaires
me_data['Landsize_squared'] = me_data['Landsize'] ** 2
me_data['BuildingArea_log'] = np.log1p(me_data['BuildingArea'])
me_data['Rooms_Bathroom'] = me_data['Rooms'] * me_data['Bathroom']

# Sélectionner les caractéristiques importantes et la cible
y = me_data['Price']
fme_features = [
    'Suburb_encoded', 'Rooms', 'Longtitude', 'Lattitude', 
    'Landsize_squared', 'Landsize', 'Rooms_Bathroom',
    'YearBuilt', 'BuildingArea_log', 'BuildingArea', 'Bathroom', 'Postcode', 'Bedroom2', 'Distance'
]

X = me_data[fme_features]

# Normalisation des données
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Entraîner le modèle avec les meilleurs paramètres
best_params = {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
model = RandomForestRegressor(
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators'],
    bootstrap= False,
    random_state=42
)
model.fit(X_train, y_train)

# Prédictions et évaluation
preds = model.predict(X_test)
mae_final = mean_absolute_error(y_test, preds)
print("Final Mean Absolute Error:", mae_final)

Final Mean Absolute Error: 226925.9660191458
