In [21]:
# Feature Engineering




In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [3]:
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

print(" Dossier courant :", os.getcwd())


 Dossier courant : c:\Users\selma\Desktop\projet_seattle


In [5]:
# --- Charger le dataset nettoyé
file_path = "data/data_cleaned.csv"
df = pd.read_csv(file_path)

print(" Données nettoyées chargées :", df.shape)
display(df.head())

# --- Suppression des colonnes inutiles ou redondantes

colonnes_a_supprimer = [
    "DataYear",                # année fixe (2016)
    "Comments",                # texte non pertinent
    "OSEBuildingID",           # identifiant unique sans valeur explicative
    "PropertyName",            # nom du bâtiment (non généralisable)
    "Address", "City", "State", "ZipCode",  # localisation détaillée inutile
    "Latitude", "Longitude",               # doublon géographique
    "CouncilDistrictCode", "TaxParcelIdentificationNumber",  # identifiants administratifs
    "ListOfAllPropertyUseTypes",           # information déjà dans LargestPropertyUseType
    "SteamUse(kBtu)", "Electricity(kBtu)", "NaturalGas(kBtu)"  # doublons en d'autres unités
]

# Vérifier et supprimer uniquement celles présentes
colonnes_presentes = [col for col in colonnes_a_supprimer if col in df.columns]
df.drop(columns=colonnes_presentes, inplace=True)

print(f" Colonnes supprimées : {len(colonnes_presentes)}")
print(colonnes_presentes)
print(" Données prêtes pour sélection des features.")


 Données nettoyées chargées : (1514, 43)


Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,Address,City,State,ZipCode,TaxParcelIdentificationNumber,...,SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),Comments,TotalGHGEmissions,GHGEmissionsIntensity
0,1,2016,NonResidential,Hotel,Mayflower park hotel,405 Olive way,Seattle,WA,98101.0,659000030,...,7226362.5,7456910.0,2003882.0,1156514.0,3946027.0,12764.5293,1276453.0,,249.98,2.83
1,2,2016,NonResidential,Hotel,Paramount Hotel,724 Pine street,Seattle,WA,98101.0,659000220,...,8387933.0,8664479.0,0.0,950425.2,3242851.0,51450.81641,5145082.0,,295.86,2.86
2,3,2016,NonResidential,Hotel,5673-The Westin Seattle,1900 5th Avenue,Seattle,WA,98101.0,659000475,...,72587024.0,73937112.0,21566554.0,14515440.0,49526664.0,14938.0,1493800.0,,2089.28,2.19
3,5,2016,NonResidential,Hotel,HOTEL MAX,620 STEWART ST,Seattle,WA,98101.0,659000640,...,6794584.0,6946800.5,2214446.25,811525.3,2768924.0,18112.13086,1811213.0,,286.43,4.67
4,8,2016,NonResidential,Hotel,WARWICK SEATTLE HOTEL (ID8),401 LENORA ST,Seattle,WA,98121.0,659000970,...,14172606.0,14656503.0,0.0,1573449.0,5368607.0,88039.98438,8803998.0,,505.01,2.88


 Colonnes supprimées : 16
['DataYear', 'Comments', 'OSEBuildingID', 'PropertyName', 'Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude', 'CouncilDistrictCode', 'TaxParcelIdentificationNumber', 'ListOfAllPropertyUseTypes', 'SteamUse(kBtu)', 'Electricity(kBtu)', 'NaturalGas(kBtu)']
 Données prêtes pour sélection des features.


In [13]:
# --- Simplification de la localisation : garder uniquement 'Neighborhood'

colonnes_geo = ["City", "State", "ZipCode", "Latitude", "Longitude"]
colonnes_geo_presentes = [col for col in colonnes_geo if col in df.columns]

df.drop(columns=colonnes_geo_presentes, inplace=True)
print(f" Colonnes géographiques supprimées : {colonnes_geo_presentes}")

# Vérification que 'Neighborhood' existe
if "Neighborhood" in df.columns:
    print(" Colonne 'Neighborhood' conservée pour représenter la localisation.")
else:
    print(" 'Neighborhood' non trouvée — vérifie le dataset.")


 Colonnes géographiques supprimées : []
 Colonne 'Neighborhood' conservée pour représenter la localisation.


In [15]:
# --- Suppression des colonnes énergétiques redondantes

colonnes_redondantes = [
    "Electricity(kBtu)",
    "NaturalGas(kBtu)",
    "SteamUse(kBtu)",
    "SiteEnergyUseWN(kBtu)",   # version 'weather-normalized' redondante
    "SourceEUIWN(kBtu/sf)",    # redondante avec SourceEUI
    "SiteEUIWN(kBtu/sf)"       # redondante avec SiteEUI
]

# Supprimer uniquement celles présentes
colonnes_presentes = [col for col in colonnes_redondantes if col in df.columns]
df.drop(columns=colonnes_presentes, inplace=True)

print(f" Colonnes énergétiques redondantes supprimées : {colonnes_presentes}")


 Colonnes énergétiques redondantes supprimées : []


In [16]:
# --- Transformation des relevés énergétiques en indicateurs binaires

colonnes_energie = [
    "Electricity(kWh)",
    "NaturalGas(therms)",
    "SteamUse(kBtu)"
]

for col in colonnes_energie:
    if col in df.columns:
        df[f"{col}_used"] = df[col].apply(lambda x: 1 if x > 0 else 0)
        df.drop(columns=[col], inplace=True)

print(" Transformation des relevés énergétiques en variables binaires terminée.")
print("Colonnes ajoutées :", [c for c in df.columns if c.endswith("_used")])


 Transformation des relevés énergétiques en variables binaires terminée.
Colonnes ajoutées : ['Electricity(kWh)_used', 'NaturalGas(therms)_used']


In [6]:
# --- Choix de la cible (target)
target = "SiteEnergyUse(kBtu)"  # prédiction de la consommation d'énergie

y = df[target]                     # variable cible
X = df.drop(columns=[target])      # variables explicatives

print(" Target choisie :", target)
print(" Dimensions du dataset :", X.shape)


 Target choisie : SiteEnergyUse(kBtu)
 Dimensions du dataset : (1514, 26)


In [7]:
# --- Identifier les variables numériques et catégorielles
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

print(f" Colonnes numériques : {len(num_cols)}")
print(f" Colonnes catégorielles : {len(cat_cols)}")


 Colonnes numériques : 19
 Colonnes catégorielles : 7


In [8]:
# --- Pipeline de transformation
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),  # standardisation des valeurs numériques
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)  # encodage des catégories
])

print("  Pipeline de prétraitement créé avec succès.")


  Pipeline de prétraitement créé avec succès.


In [9]:
# --- Application de la transformation
X_encoded = preprocessor.fit_transform(X)
X_encoded = pd.DataFrame(X_encoded)

print(" Données transformées :", X_encoded.shape)
display(X_encoded.head())


 Données transformées : (1514, 270)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260,261,262,263,264,265,266,267,268,269
0,-1.052909,-0.060733,1.122296,-0.125091,-0.318963,-0.058583,-0.01503,,,-0.119035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.050332,-0.060733,0.976157,-0.045573,0.022815,-0.058174,-0.044332,-0.321727,-0.313274,-0.084309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.227325,-0.060733,5.36032,4.434511,4.144256,3.974572,4.283422,,,-0.709375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.08339,-0.060733,0.830018,-0.267574,-0.318963,-0.221566,-0.189488,,,-0.257938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.562624,-0.060733,1.999128,0.332858,1.087719,0.09257,0.210239,0.522046,-0.477906,0.401853,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# --- Sélection des cibles principales (consommation & émissions)

# Cible de consommation
if "SiteEnergyUse(kBtu)" in df.columns:
    target_energy = "SiteEnergyUse(kBtu)"
else:
    # Si absente, on cherche une alternative
    target_energy = [c for c in df.columns if "EnergyUse" in c][0]

# Cible d’émissions
if "TotalGHGEmissions" in df.columns:
    target_emission = "TotalGHGEmissions"
else:
    target_emission = [c for c in df.columns if "GHG" in c][0]

# Suppression des autres colonnes similaires
colonnes_a_supprimer = [
    "SiteEnergyUseWN(kBtu)", "SourceEUI(kBtu/sf)", "SiteEUI(kBtu/sf)",
    "SourceEUIWN(kBtu/sf)", "GHGEmissionsIntensity"
]

colonnes_presentes = [col for col in colonnes_a_supprimer if col in df.columns]
df.drop(columns=colonnes_presentes, inplace=True)

print(" Cibles sélectionnées :")
print(f" - Consommation : {target_energy}")
print(f" - Émissions : {target_emission}")
print(" Colonnes similaires supprimées :", colonnes_presentes)


 Cibles sélectionnées :
 - Consommation : SiteEnergyUse(kBtu)
 - Émissions : TotalGHGEmissions
 Colonnes similaires supprimées : []


In [10]:
# --- Découpage en train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

print(f" X_train : {X_train.shape}")
print(f" X_test  : {X_test.shape}")
print(f" y_train : {y_train.shape}")
print(f" y_test  : {y_test.shape}")


 X_train : (1211, 270)
 X_test  : (303, 270)
 y_train : (1211,)
 y_test  : (303,)


In [11]:
# --- Sauvegarde des datasets dans le dossier /data/
os.makedirs("data", exist_ok=True)

X_train.to_csv("data/X_train.csv", index=False)
X_test.to_csv("data/X_test.csv", index=False)
y_train.to_csv("data/y_train.csv", index=False)
y_test.to_csv("data/y_test.csv", index=False)

print(" Données sauvegardées dans le dossier /data/")


 Données sauvegardées dans le dossier /data/


In [12]:
print(" Préparation des features terminée avec succès !")
print(" Dossier courant :", os.getcwd())


 Préparation des features terminée avec succès !
 Dossier courant : c:\Users\selma\Desktop\projet_seattle
