**Import des donn√©es / Lib**

In [57]:
# üì¶ Imports
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import shap

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_parquet('../data/prod/df_model_appart_2020.parquet.gz', engine='pyarrow')
# Calcul de la fr√©quence des communes
df

Unnamed: 0,surface_reelle_bati,nombre_pieces_principales,latitude,longitude,has_dependance,nom_commune,prix_m2
0,62.0,3.0,46.198795,5.219443,True,Bourg-en-Bresse,2193.548340
1,47.0,2.0,46.307400,4.842984,True,Saint-Laurent-sur-Sa√¥ne,1531.914917
2,46.0,2.0,46.205639,5.222975,False,Bourg-en-Bresse,1521.739136
3,60.0,2.0,46.208492,5.220961,True,Bourg-en-Bresse,583.333313
4,55.0,2.0,46.248257,5.130623,True,Polliat,2215.109619
...,...,...,...,...,...,...,...
190517,15.0,1.0,48.868458,2.345359,False,Paris 2e Arrondissement,9419.533203
190518,57.0,3.0,48.865822,2.342885,False,Paris 2e Arrondissement,12894.737305
190519,74.0,3.0,48.867874,2.352858,False,Paris 2e Arrondissement,9459.458984
190520,32.0,2.0,48.855644,2.367553,False,Paris 4e Arrondissement,11093.750000


**Random Forest**

In [54]:
# --- Encodage boolean ---
df["has_dependance"] = df["has_dependance"].astype(int)

# --- D√©finir features de base et target ---
FEATURES_BASE = [
    "surface_reelle_bati",
    "nombre_pieces_principales",
    "latitude",
    "longitude",
    "has_dependance",
]

TARGET = "prix_m2"

X = df[FEATURES_BASE + ["nom_commune"]].copy()
y = df[TARGET]

# --- Split train/test AVANT feature engineering ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# --- Cr√©er feature nb_ventes_commune sur TRAIN uniquement ---
commune_sales = (
    X_train.groupby("nom_commune")
           .size()
           .rename("nb_ventes_commune")
)

# --- Appliquer au train et au test ---
X_train = X_train.merge(
    commune_sales,
    on="nom_commune",
    how="left"
)

X_test = X_test.merge(
    commune_sales,
    on="nom_commune",
    how="left"
)

# --- G√©rer communes jamais vues dans le test ---
median_sales = commune_sales.median()

X_train["nb_ventes_commune"] = X_train["nb_ventes_commune"].fillna(median_sales)
X_test["nb_ventes_commune"] = X_test["nb_ventes_commune"].fillna(median_sales)

# --- S√©lection finale des features ---
FEATURES_FINAL = FEATURES_BASE + ["nb_ventes_commune"]
X_train_final = X_train[FEATURES_FINAL]
X_test_final = X_test[FEATURES_FINAL]

# --- Entra√Ænement Random Forest ---
rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=22,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_final, y_train)

# --- Pr√©dictions et √©valuation ---
y_pred = rf_model.predict(X_test_final)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Random Forest")
print("RMSE :", rmse)
print("R2   :", r2)

# --- Importance des features ---
importances = pd.Series(
    rf_model.feature_importances_,
    index=FEATURES_FINAL
).sort_values(ascending=False)

print("\nImportance des features :")
print(importances)

Random Forest
RMSE : 1055.2417058588
R2   : 0.8334907754404861

Importance des features :
latitude                     0.654240
longitude                    0.163473
nb_ventes_commune            0.147776
surface_reelle_bati          0.028748
has_dependance               0.003677
nombre_pieces_principales    0.002086
dtype: float64


**GradientBoostingRegressor**

In [38]:
# --- 1Ô∏è‚É£ Encodage bool√©en ---
df["has_dependance"] = df["has_dependance"].astype(int)

# --- 2Ô∏è‚É£ D√©finir features de base et target ---
FEATURES_BASE = [
    "surface_reelle_bati",
    "nombre_pieces_principales",
    "latitude",
    "longitude",
    "has_dependance",
]

TARGET = "prix_m2"

X = df[FEATURES_BASE + ["nom_commune"]].copy()
y = df[TARGET]

# --- 3Ô∏è‚É£ Split train/test AVANT feature engineering ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# --- 4Ô∏è‚É£ Cr√©er feature nb_ventes_commune sur TRAIN uniquement ---
commune_sales = (
    X_train.groupby("nom_commune")
           .size()
           .rename("nb_ventes_commune")
)

# --- 5Ô∏è‚É£ Appliquer au train et au test ---
X_train = X_train.merge(
    commune_sales,
    on="nom_commune",
    how="left"
)

X_test = X_test.merge(
    commune_sales,
    on="nom_commune",
    how="left"
)

# --- 6Ô∏è‚É£ G√©rer communes jamais vues dans le test ---
median_sales = commune_sales.median()

X_train["nb_ventes_commune"] = X_train["nb_ventes_commune"].fillna(median_sales)
X_test["nb_ventes_commune"] = X_test["nb_ventes_commune"].fillna(median_sales)

# --- 7Ô∏è‚É£ S√©lection finale des features ---
FEATURES_FINAL = FEATURES_BASE + ["nb_ventes_commune"]
X_train_final = X_train[FEATURES_FINAL]
X_test_final = X_test[FEATURES_FINAL]

# --- 8Ô∏è‚É£ Entra√Ænement Gradient Boosting ---
gbr_model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=20,
    learning_rate=0.1,
    random_state=42
)

gbr_model.fit(X_train_final, y_train)

# --- 9Ô∏è‚É£ Pr√©dictions et √©valuation ---
y_pred = gbr_model.predict(X_test_final)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Gradient Boosting Regressor")
print("RMSE :", rmse)
print("R2   :", r2)

# --- 10Ô∏è‚É£ Importance des features ---
importances = pd.Series(
    gbr_model.feature_importances_,
    index=FEATURES_FINAL
).sort_values(ascending=False)

print("\nImportance des features :")
print(importances)

Gradient Boosting Regressor
RMSE : 1115.2213351384414
R2   : 0.8140241543399761

Importance des features :
latitude                     0.650219
longitude                    0.166097
nb_ventes_commune            0.148249
surface_reelle_bati          0.028423
nombre_pieces_principales    0.003620
has_dependance               0.003393
dtype: float64


**LightGBM**

In [51]:
# --- 1Ô∏è‚É£ Encodage bool√©en ---
df["has_dependance"] = df["has_dependance"].astype(int)

# --- 2Ô∏è‚É£ D√©finir features de base et target ---
FEATURES_BASE = [
    "surface_reelle_bati",
    "nombre_pieces_principales",
    "latitude",
    "longitude",
    "has_dependance",
]

TARGET = "prix_m2"

X = df[FEATURES_BASE + ["nom_commune"]].copy()
y = df[TARGET]

# --- 3Ô∏è‚É£ Split train/test AVANT feature engineering ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# --- 4Ô∏è‚É£ Cr√©er feature nb_ventes_commune sur TRAIN uniquement ---
commune_sales = (
    X_train.groupby("nom_commune")
           .size()
           .rename("nb_ventes_commune")
)

# --- 5Ô∏è‚É£ Appliquer au train et au test ---
X_train = X_train.merge(
    commune_sales,
    on="nom_commune",
    how="left"
)

X_test = X_test.merge(
    commune_sales,
    on="nom_commune",
    how="left"
)

# --- 6Ô∏è‚É£ G√©rer communes jamais vues dans le test ---
median_sales = commune_sales.median()

X_train["nb_ventes_commune"] = X_train["nb_ventes_commune"].fillna(median_sales)
X_test["nb_ventes_commune"] = X_test["nb_ventes_commune"].fillna(median_sales)

# --- 7Ô∏è‚É£ S√©lection finale des features ---
FEATURES_FINAL = FEATURES_BASE + ["nb_ventes_commune"]
X_train_final = X_train[FEATURES_FINAL]
X_test_final = X_test[FEATURES_FINAL]

# --- 8Ô∏è‚É£ Entra√Ænement LightGBM ---
lgb_model = LGBMRegressor(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    verbose=-1  # üîπ pas de logs LightGBM
)

lgb_model.fit(X_train_final, y_train)

# √âvaluation
y_pred = lgb_model.predict(X_test_final)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("LightGBM Regressor")
print("RMSE :", rmse)
print("R2   :", r2)

# Importances utiles seulement
importances = pd.Series(
    lgb_model.feature_importances_,
    index=FEATURES_FINAL
).sort_values(ascending=False)
print("\nImportance des features :")
print(importances[importances > 0])

LightGBM Regressor
RMSE : 1105.0476386270318
R2   : 0.8174018364370954

Importance des features :
latitude                     8114
longitude                    7510
nb_ventes_commune            5299
surface_reelle_bati          4333
nombre_pieces_principales     915
has_dependance                557
dtype: int32


**Pr√©diction uniquement sur Paris**

In [68]:
df_paris = df[
    df["nom_commune"].str.contains("Paris", case=False, na=False)
    & ~df["nom_commune"].str.contains("Seyssinet-Pariset|Le Touquet-Paris-Plage|Villeparisis|Fontenay-en-Parisis|Cormeilles-en-Parisis", case=False, na=False)
]["nom_commune"].unique()
print(df_paris)

['Paris 3e Arrondissement' 'Paris 1er Arrondissement'
 'Paris 8e Arrondissement' 'Paris 20e Arrondissement'
 'Paris 17e Arrondissement' 'Paris 10e Arrondissement'
 'Paris 18e Arrondissement' 'Paris 4e Arrondissement'
 'Paris 7e Arrondissement' 'Paris 5e Arrondissement'
 'Paris 6e Arrondissement' 'Paris 11e Arrondissement'
 'Paris 9e Arrondissement' 'Paris 12e Arrondissement'
 'Paris 13e Arrondissement' 'Paris 14e Arrondissement'
 'Paris 15e Arrondissement' 'Paris 16e Arrondissement'
 'Paris 19e Arrondissement' 'Paris 2e Arrondissement']


In [69]:
df_paris = df[
    df["nom_commune"].str.contains("Paris", case=False, na=False)
    & ~df["nom_commune"].str.contains("Seyssinet-Pariset|Le Touquet-Paris-Plage|Villeparisis|Fontenay-en-Parisis|Cormeilles-en-Parisis", case=False, na=False)
].copy()

df_paris["has_dependance"] = df_paris["has_dependance"].astype(int)

X_paris = df_paris[FEATURES_BASE + ["nom_commune"]].copy()

X_paris = X_paris.merge(commune_sales, on="nom_commune", how="left")
X_paris["nb_ventes_commune"] = X_paris["nb_ventes_commune"].fillna(median_sales)

X_paris_final = X_paris[FEATURES_FINAL]

y_pred_paris = rf_model.predict(X_paris_final)

# ============================================================
# 8Ô∏è‚É£ R√âSULTATS PARIS
# ============================================================

df_result_paris = df_paris.copy()
df_result_paris["prix_m2_pred"] = y_pred_paris

print("\nParis (toutes variantes) ‚Äì aper√ßu")
print(df_result_paris[[
    "nom_commune",
    "surface_reelle_bati",
    "nombre_pieces_principales",
    "prix_m2",
    "prix_m2_pred"
]].head())

# (Optionnel) m√©triques Paris
print("\nParis uniquement")
print("RMSE :", np.sqrt(mean_squared_error(df_result_paris["prix_m2"], y_pred_paris)))
print("R2   :", r2_score(df_result_paris["prix_m2"], y_pred_paris))


Paris (toutes variantes) ‚Äì aper√ßu
                     nom_commune  surface_reelle_bati  \
178396   Paris 3e Arrondissement                 12.0   
178397  Paris 1er Arrondissement                 27.0   
178398  Paris 1er Arrondissement                 84.0   
178399  Paris 1er Arrondissement                120.0   
178400  Paris 1er Arrondissement                 24.0   

        nombre_pieces_principales       prix_m2  prix_m2_pred  
178396                        1.0  12333.333008  10375.472789  
178397                        2.0  10000.000000  11186.707361  
178398                        4.0  13238.095703  10146.789198  
178399                        5.0  13916.666992  10911.069535  
178400                        1.0  13089.583008  11340.654078  

Paris uniquement
RMSE : 2110.5979904202354
R2   : 0.21953957545657254
