In [7]:
# Feature Engineering




In [8]:
#  Feature Engineering : préparation des données pour la modélisation

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

print(" Dossier courant :", os.getcwd())



 Dossier courant : c:\Users\selma\Desktop\projet_seattle


In [9]:
# --- Charger le dataset nettoyé
df = pd.read_csv("data/data_cleaned.csv")
print(" Données nettoyées chargées :", df.shape)
display(df.head())


 Données nettoyées chargées : (1514, 43)


Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,Address,City,State,ZipCode,TaxParcelIdentificationNumber,...,SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),Comments,TotalGHGEmissions,GHGEmissionsIntensity
0,1,2016,NonResidential,Hotel,Mayflower park hotel,405 Olive way,Seattle,WA,98101.0,659000030,...,7226362.5,7456910.0,2003882.0,1156514.0,3946027.0,12764.5293,1276453.0,,249.98,2.83
1,2,2016,NonResidential,Hotel,Paramount Hotel,724 Pine street,Seattle,WA,98101.0,659000220,...,8387933.0,8664479.0,0.0,950425.2,3242851.0,51450.81641,5145082.0,,295.86,2.86
2,3,2016,NonResidential,Hotel,5673-The Westin Seattle,1900 5th Avenue,Seattle,WA,98101.0,659000475,...,72587024.0,73937112.0,21566554.0,14515440.0,49526664.0,14938.0,1493800.0,,2089.28,2.19
3,5,2016,NonResidential,Hotel,HOTEL MAX,620 STEWART ST,Seattle,WA,98101.0,659000640,...,6794584.0,6946800.5,2214446.25,811525.3,2768924.0,18112.13086,1811213.0,,286.43,4.67
4,8,2016,NonResidential,Hotel,WARWICK SEATTLE HOTEL (ID8),401 LENORA ST,Seattle,WA,98121.0,659000970,...,14172606.0,14656503.0,0.0,1573449.0,5368607.0,88039.98438,8803998.0,,505.01,2.88


In [10]:
# --- Suppression des colonnes inutiles ou redondantes
colonnes_a_supprimer = [
    "DataYear", "Comments", "OSEBuildingID", "PropertyName",
    "Address", "City", "State", "ZipCode",
    "Latitude", "Longitude", "CouncilDistrictCode", "TaxParcelIdentificationNumber",
    "ListOfAllPropertyUseTypes",
    "Electricity(kBtu)", "NaturalGas(kBtu)"  # on garde SteamUse(kBtu)
]

colonnes_presentes = [col for col in colonnes_a_supprimer if col in df.columns]
df.drop(columns=colonnes_presentes, inplace=True)

print(f" Colonnes supprimées : {len(colonnes_presentes)}")


 Colonnes supprimées : 15


In [11]:
# --- Définir la cible principale
target = "SiteEnergyUse(kBtu)"
y = df[target]
X = df.drop(columns=[target])

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

print(f" Colonnes numériques : {len(num_cols)}")
print(f" Colonnes catégorielles : {len(cat_cols)}")



 Colonnes numériques : 20
 Colonnes catégorielles : 7


In [12]:
# --- Pipeline de transformation
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

X_encoded = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()
X_encoded = pd.DataFrame(X_encoded, columns=feature_names)

print(" Données transformées :", X_encoded.shape)
display(X_encoded.head())



 Données transformées : (1514, 271)


Unnamed: 0,num__YearBuilt,num__NumberofBuildings,num__NumberofFloors,num__PropertyGFATotal,num__PropertyGFAParking,num__PropertyGFABuilding(s),num__LargestPropertyUseTypeGFA,num__SecondLargestPropertyUseTypeGFA,num__ThirdLargestPropertyUseTypeGFA,num__ENERGYSTARScore,...,cat__YearsENERGYSTARCertified_20172016,cat__YearsENERGYSTARCertified_201720162012,cat__YearsENERGYSTARCertified_201720162014201320122011201020092007,cat__YearsENERGYSTARCertified_201720162015,cat__YearsENERGYSTARCertified_20172016201520092008,cat__YearsENERGYSTARCertified_2017201620152012200920072006,cat__YearsENERGYSTARCertified_201720162015201420132008,cat__YearsENERGYSTARCertified_2017201620152014201320112008,cat__YearsENERGYSTARCertified_2017201620152014201320122011201020092008,cat__YearsENERGYSTARCertified_nan
0,-1.052909,-0.060733,1.122296,-0.125091,-0.318963,-0.058583,-0.01503,,,-0.119035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.050332,-0.060733,0.976157,-0.045573,0.022815,-0.058174,-0.044332,-0.321727,-0.313274,-0.084309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.227325,-0.060733,5.36032,4.434511,4.144256,3.974572,4.283422,,,-0.709375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.08339,-0.060733,0.830018,-0.267574,-0.318963,-0.221566,-0.189488,,,-0.257938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.562624,-0.060733,1.999128,0.332858,1.087719,0.09257,0.210239,0.522046,-0.477906,0.401853,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# --- Découpage train/test et sauvegarde
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

os.makedirs("data", exist_ok=True)
X_train.to_csv("data/X_train.csv", index=False)
X_test.to_csv("data/X_test.csv", index=False)
y_train.to_csv("data/y_train.csv", index=False)
y_test.to_csv("data/y_test.csv", index=False)

print(" Données sauvegardées dans /data/")



 Données sauvegardées dans /data/
