# Les import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN
import optuna

  from .autonotebook import tqdm as notebook_tqdm


# Fonctions utiles

## dbscan (supprimer les outliner)

In [2]:
from sklearn.discriminant_analysis import StandardScaler

def dbscan_func(X, y):

    # Étape 1 : Standardiser les données
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Étape 2 : Appliquer DBSCAN
    dbscan = DBSCAN(eps=1, min_samples=5)  # Ajustez eps et min_samples si nécessaire
    clusters = dbscan.fit_predict(X_scaled)

    X = X.copy()
    X['Cluster'] = clusters
    y = y.reset_index(drop=True)  # Reset index to align X and y
    X['Target'] = y  # Add target variable to the DataFrame

    # Ajouter les clusters aux données
    X['Cluster'] = clusters
    y = y.reset_index(drop=True)  # Réinitialisation de l'index pour aligner X et y
    X['Target'] = y  # Ajouter y dans le DataFrame pour faciliter le filtrage

    # Étape 3 : Filtrer et supprimer les valeurs extrêmes
    filtered_data = X[X['Cluster'] != -1]  # Conserver uniquement les points qui ne sont pas des outliers
    filtered_X = filtered_data.drop(['Cluster', 'Target'], axis=1)  # Supprimer les colonnes auxiliaires
    filtered_y = filtered_data['Target']  # Cible sans outliers

    # Résultat après suppression
    print(f"Nombre de valeurs extrêmes supprimées : {len(X) - len(filtered_X)}")
    print(f"Données restantes après suppression : {len(filtered_X)}")

    return filtered_X, filtered_y

## Trouver les meilleurs paramètres avec optuna

In [3]:
def get_best_params(model_class, param_distributions, X_train, y_train, n_trials=100, scoring='neg_mean_absolute_error', cv=3):
    """
    Optimise les hyperparamètres d'un modèle en utilisant Optuna.
    
    Args:
        model_class (class): Classe du modèle (par exemple, RandomForestRegressor, XGBRegressor, etc.).
        param_distributions (dict): Dictionnaire des distributions de recherche pour les hyperparamètres.
        X_train (pd.DataFrame or np.ndarray): Données d'entraînement (features).
        y_train (pd.Series or np.ndarray): Données d'entraînement (target).
        n_trials (int): Nombre d'essais pour l'optimisation.
        scoring (str): Métrique d'évaluation (compatible avec scikit-learn).
        cv (int): Nombre de plis pour la validation croisée.
    
    Returns:
        dict: Meilleurs hyperparamètres trouvés par Optuna.
    """
    
    def objective(trial):
        # Générer les hyperparamètres pour le modèle
        params = {
            key: _suggest_from_trial(trial, key, dist)
            for key, dist in param_distributions.items()
        }
        
        # Instancier le modèle avec les hyperparamètres suggérés
        model = model_class(**params)
        
        # Validation croisée
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
        
        # Retourner le score moyen (négatif car on minimise)
        return -scores.mean()
    
    # Lancer l'étude Optuna
    optuna.logging.set_verbosity(optuna.logging.INFO)
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)
    
    # Afficher et retourner les meilleurs hyperparamètres
    print(f"Best trial: {study.best_trial.params}")
    print("Best parameters:", study.best_params)
    print("Best score:", study.best_value)
    return study.best_params


def _suggest_from_trial(trial, name, dist):
    """
    Génère une valeur pour un hyperparamètre à partir de sa distribution spécifiée.
    
    Args:
        trial (optuna.trial.Trial): Instance d'un essai Optuna.
        name (str): Nom de l'hyperparamètre.
        dist (tuple): Distribution de l'hyperparamètre sous la forme (type, params).
    
    Returns:
        float or int or str: Valeur générée pour l'hyperparamètre.
    """
    dist_type, *dist_params = dist
    if dist_type == 'int':
        return trial.suggest_int(name, *dist_params)
    elif dist_type == 'float':
        return trial.suggest_float(name, *dist_params)
    elif dist_type == 'categorical':
        return trial.suggest_categorical(name, dist_params[0])
    else:
        raise ValueError(f"Unsupported distribution type: {dist_type}")


# Importation de la data

In [4]:
# Charger les données
me_file_path = './dataset/melb_data.csv'
me_data = pd.read_csv(me_file_path)

In [5]:
me_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


# Transformation des datas

In [6]:
# Imputation des valeurs manquantes
imputer = SimpleImputer(strategy='median')
me_data[me_data.select_dtypes(include=[np.number]).columns] = imputer.fit_transform(me_data.select_dtypes(include=[np.number]))

In [7]:
# Encoder les colonnes catégoriques via Target Encoding
if 'Suburb' in me_data.columns:
    suburb_mean_price = me_data.groupby('Suburb')['Price'].mean()
    me_data['Suburb_encoded'] = me_data['Suburb'].map(suburb_mean_price)
    me_data.drop(columns=['Suburb'], inplace=True)

In [8]:
# Créer des caractéristiques supplémentaires
me_data['Landsize_squared'] = me_data['Landsize'] ** 2
me_data['BuildingArea_log'] = np.log1p(me_data['BuildingArea'])
me_data['Rooms_Bathroom'] = me_data['Rooms'] * me_data['Bathroom']

In [9]:
# Sélectionner les caractéristiques et la cible
y = me_data['Price']
fme_features = [
    'Suburb_encoded', 'Rooms', 'Longtitude', 'Lattitude', 
    'Landsize_squared', 'Landsize', 'Rooms_Bathroom',
    'YearBuilt', 'BuildingArea_log', 'BuildingArea', 'Bathroom', 'Postcode', 'Bedroom2', 'Distance'
]
X = me_data[fme_features]

# Machine learning

## Split des datas

In [10]:
filtered_X, filtered_y = dbscan_func(X, y)

# Diviser les données
X_train, X_val, y_train, y_val = train_test_split(filtered_X, filtered_y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy="mean")  # ou "median"
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

Nombre de valeurs extrêmes supprimées : 1615
Données restantes après suppression : 11965


# Xtreme Gradiant Boosting (XGBoost)

In [14]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

param_distributions_xgb = {
    'n_estimators': ('int', 1000, 3000),         # Nombre d'arbres
    'max_depth': ('int', 0, 2),               # Profondeur maximale de l'arbre
    'learning_rate': ('float', 0.01, 0.3),     # Taux d'apprentissage
    'subsample': ('float', 0.5, 1.0),          # Fraction d'échantillons pour chaque arbre
    'colsample_bytree': ('float', 0.5, 1.0),   # Fraction de colonnes utilisées par arbre
    'gamma': ('float', 0, 5),                  # Minimum perte de réduction pour diviser un noeud
    'reg_alpha': ('float', 0, 1),              # Régularisation L1
    'reg_lambda': ('float', 0, 1),             # Régularisation L2
}

best_params = get_best_params(XGBRegressor, param_distributions_xgb, X_train, y_train)

# Création du modèle avec les meilleurs hyperparamètres
my_model = XGBRegressor(**best_params)

# Entraînement du modèle
my_model.fit(X_train, y_train)

# Prédictions sur l'ensemble de validation
predictions = my_model.predict(X_val)

# Calcul de l'erreur absolue moyenne
mae = mean_absolute_error(y_val, predictions)
print(f"Mean Absolute Error: {mae}")

[I 2024-12-17 10:32:54,036] A new study created in memory with name: no-name-70e0f559-d811-45ad-ac60-474e4a120087
[I 2024-12-17 10:36:04,290] Trial 0 finished with value: 159614.23713290866 and parameters: {'n_estimators': 1505, 'max_depth': 0, 'learning_rate': 0.011818935285365982, 'subsample': 0.7028575582074797, 'colsample_bytree': 0.6880849756498016, 'gamma': 4.705596683653485, 'reg_alpha': 0.5286906494086222, 'reg_lambda': 0.2634585262091298}. Best is trial 0 with value: 159614.23713290866.
[I 2024-12-17 10:39:23,060] Trial 1 finished with value: 163026.74546728065 and parameters: {'n_estimators': 2049, 'max_depth': 0, 'learning_rate': 0.016549664718852043, 'subsample': 0.9408998862310535, 'colsample_bytree': 0.5219058970460276, 'gamma': 3.5604685120093555, 'reg_alpha': 0.4937272400506435, 'reg_lambda': 0.1403499033364527}. Best is trial 0 with value: 159614.23713290866.
[I 2024-12-17 10:41:01,870] Trial 2 finished with value: 174023.02897529668 and parameters: {'n_estimators': 11

KeyboardInterrupt: 

## Lightgbm

In [32]:
from lightgbm import LGBMRegressor

In [None]:
param_distributions_lgbm = {
    'n_estimators': ('int', 900, 3000),        # Nombre d'arbres
    'max_depth': ('int', 0, 1000000),                         # Profondeur maximale de l'arbre
    'learning_rate': ('float', 0.01, 0.3),     # Taux d'apprentissage
    'subsample': ('float', 0.5, 1.0),          # Fraction d'échantillons pour chaque arbre
    'colsample_bytree': ('float', 0.5, 1.0),   # Fraction de colonnes utilisées par arbre
    'reg_alpha': ('float', 0, 1),              # Régularisation L1
    'reg_lambda': ('float', 0, 1),             # Régularisation L2
    'min_child_samples': ('int', 0, 50),
    'num_leaves': ('int', 100, 1000),
}

best_params = get_best_params(LGBMRegressor, param_distributions_lgbm, X_train, y_train)

mod = LGBMRegressor(**best_params)
mod.fit(X_train, y_train)
pred = mod.predict(X_val)
print("Mean Absolute Error: " + str(mean_absolute_error(pred, y_val)))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2178
[LightGBM] [Info] Number of data points in the train set: 8960, number of used features: 14
[LightGBM] [Info] Start training from score 968512.960268
Mean Absolute Error: 128089.93714711459


## Combinaison de model

In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Modèle Random Forest
best_params_rf = {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
rf = RandomForestRegressor(**best_params_rf)

# Modèle XGBoost
best_param_gb = {
    'n_estimators': 3000, 
    'max_depth': 10,  # None peut poser problème, spécifiez une valeur
    'learning_rate': 0.03, 
    'subsample': 0.7, 
    'colsample_bytree': 0.7, 
    'gamma': 3.5, 
    'reg_alpha': 0.7, 
    'reg_lambda': 0.005
}
gb = XGBRegressor(**best_param_gb)

# Modèle LightGBM
lgbm = LGBMRegressor(
    n_estimators=397, 
    learning_rate=0.024346815011452024, 
    max_depth=14, 
    num_leaves=262, 
    min_child_samples=27, 
    subsample=0.8470333971539084, 
    colsample_bytree=0.6484245995233271, 
    reg_alpha=0.5660804365476069, 
    reg_lambda=4.4571328591015574e-07
)

# Meta-modèle
meta_model = Ridge()

# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[('rf', rf), ('gb', gb), ('lgbm', lgbm)],
    final_estimator=meta_model
)

# Ajustement du modèle
stacking_regressor.fit(X_train, y_train)

# Prédictions
y_pred = stacking_regressor.predict(X_val)

# Évaluation
print("Mean Absolute Error: " + str(mean_absolute_error(y_val, y_pred)))
print("MSE:", mean_squared_error(y_val, y_pred))


KeyboardInterrupt: 