In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from lightgbm import LGBMClassifier
from app.ml_logic.preprocess import preprocess
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the DPE dataset and discard rows without a 'classe_bilan_dpe' label,
# since unlabeled rows cannot be used for supervised training.
df = pd.read_csv('data/dpe.csv').dropna(subset=['classe_bilan_dpe'])

# Keep the label column aside as the prediction target.
target = df['classe_bilan_dpe']

# Remove the label from the feature frame, together with 'batiment_groupe_id'
# (presumably a building identifier with no predictive meaning -- TODO confirm).
df = df.drop(columns=['classe_bilan_dpe', 'batiment_groupe_id'])

## PREP

In [3]:
# Convert every object-typed column of `df` to a numeric representation, in place.
# The fitted encoders are kept per column so the integer codes can be decoded
# (or the same mapping re-applied to new data) later on.
label_encoders = {}

for column in df.columns:
    # Explicit dtype test. The previous `dtype == type(object)` compared against
    # the `type` metaclass and only matched object columns because NumPy coerces
    # arbitrary Python classes to the object dtype.
    if not pd.api.types.is_object_dtype(df[column]):
        continue

    try:
        if df[column].str.isnumeric().all():
            # Digit-only strings. Kept inside the `try` because `str.isnumeric`
            # also accepts characters such as '½' or '²' that `pd.to_numeric`
            # rejects with a ValueError.
            df[column] = pd.to_numeric(df[column])
        else:
            # Other numeric spellings (decimals, signs, exponents, ...).
            df[column] = df[column].astype(float)
    except ValueError:
        # Not parseable as numbers: treat the column as categorical and map
        # each distinct value to an integer code.
        # NOTE(review): astype(str) turns missing values into the literal
        # string 'nan', so they become a regular category and are never seen
        # as missing by the imputation step -- confirm this is intended.
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le

In [4]:
# Standardise every numeric column of `df` with a StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed further down, so test-set statistics leak into
# the features. Any label-encoded categorical codes are numeric at this point
# and get scaled as well.
numeric_columns = list(df.select_dtypes(include=[np.number]).columns)
scaler = StandardScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])

In [5]:
# HANDLING MISSING VALUES
# Replace every remaining NaN with the mean of its column.
# NOTE(review): the imputer is fitted on the whole frame, before the train/test
# split, so the column means include test rows.
imputer = SimpleImputer(strategy='mean')

# fit_transform returns a bare array. Pass the original index back in:
# rebuilding the frame without `index=` renumbers the rows 0..n-1 while `target`
# keeps the original row labels, so any label-aligned operation between the two
# would silently pair the wrong rows.
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

In [6]:
# Drop any row that still contains a NaN, and drop the SAME rows from `target`.
# Filtering only `df` would leave features and labels with different lengths
# and make the train/test split fail. The mask is a plain boolean array, so it
# is applied by position and works whatever index labels the two objects carry.
complete_rows = df.notna().all(axis=1).to_numpy()
df = df[complete_rows]
target = target[complete_rows]

## Splitting and training the data

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
# RandomForestClassifier is already imported in the notebook's import cell.
from sklearn.model_selection import RandomizedSearchCV

## Candidate hyper-parameter values explored by the search
param_grid = {
    'max_depth': [10, 20, 30],
    'n_estimators': [100, 200, 300],
}

## Number of distinct combinations in the grid. Requesting more iterations than
## this makes scikit-learn warn and evaluate each combination once anyway, so
## cap n_iter to state the real amount of work.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

## Seed the forest as well as the search: an unseeded forest gives different
## scores (and possibly different best parameters) on every run.
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=min(100, n_candidates),
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

## Encoding target variable using OrdinalEncoder
## The encoder expects a 2-D input, hence the reshape to a single column; it is
## fitted on the training labels only and then applied to the test labels.
ordinal_encoder = OrdinalEncoder()
y_train = ordinal_encoder.fit_transform(np.array(y_train).reshape(-1, 1))
y_test = ordinal_encoder.transform(np.array(y_test).reshape(-1, 1))

In [9]:
## Run the hyper-parameter search on the training split
## (ravel flattens the single-column label array to 1-D)
random_search.fit(X_train, y_train.ravel())

## Predict the held-out rows with the fitted search object
y_pred = random_search.predict(X_test)

## Report accuracy, then the class-weighted precision / recall / F1
weighted_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
print("Accuracy:", accuracy_score(y_test, y_pred))
for label, metric in weighted_metrics:
    print(label, metric(y_test, y_pred, average='weighted'))

## Show which hyper-parameter combination the search selected
print("Best Parameters:", random_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




[CV] END .....................max_depth=10, n_estimators=100; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.3min
[CV] END .....................max_depth=10, n_estimators=200; total time= 2.6min
[CV] END .....................max_depth=10, n_estimators=200; total time= 2.6min
[CV] END .....................max_depth=10, n_estimators=200; total time= 2.6min
[CV] END .....................max_depth=10, n_estimators=200; total time= 2.6min
[CV] END .....................max_depth=10, n_estimators=200; total time= 2.6min
[CV] END .....................max_depth=20, n_estimators=100; total time= 2.2min
[CV] END .....................max_depth=20, n_estimators=100; total time= 2.2min
[CV] END ...................