In [48]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub
import os, pandas as pd
from joblib import dump, load

In [49]:
# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files.
dataset_handle = "andreylss/residential-and-commercial-energy-cost-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Echo the download location so the reader can see where the files live.
print("Path to dataset files:", path)

Path to dataset files: /home/rohnak.agarwal/.cache/kagglehub/datasets/andreylss/residential-and-commercial-energy-cost-dataset/versions/1


In [50]:
# Read the energy-consumption CSV from the downloaded dataset directory.
energy_csv = os.path.join(path, "energy_consumption.csv")
df = pd.read_csv(energy_csv)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,customer_id,customer_type,regions,building_size_m2,occupants,energy_cost_brl
0,CUSTOMER_0001,residential,Northeast,24,2,64.51
1,CUSTOMER_0002,commercial,Midwest,24,1,55.26
2,CUSTOMER_0003,commercial,Southeast,24,1,74.54
3,CUSTOMER_0004,residential,Northeast,45,4,147.06
4,CUSTOMER_0005,residential,Southeast,45,4,143.06


In [51]:
# Remove the customer identifier column before building features.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" makes
# this cell safe to re-run: a second execution no longer raises KeyError
# because "customer_id" has already been removed.
df = df.drop(columns=["customer_id"], errors="ignore")
df.head()

Unnamed: 0,customer_type,regions,building_size_m2,occupants,energy_cost_brl
0,residential,Northeast,24,2,64.51
1,commercial,Midwest,24,1,55.26
2,commercial,Southeast,24,1,74.54
3,residential,Northeast,45,4,147.06
4,residential,Southeast,45,4,143.06


In [52]:
# Feature matrix: the four descriptive columns used as model inputs.
feature_cols = ["customer_type", "regions", "building_size_m2", "occupants"]
X = df[feature_cols]

# Target vector: the energy cost column.
target_col = "energy_cost_brl"
y = df[target_col]

In [53]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,customer_type,regions,building_size_m2,occupants
0,residential,Northeast,24,2
1,commercial,Midwest,24,1
2,commercial,Southeast,24,1
3,residential,Northeast,45,4
4,residential,Southeast,45,4


In [54]:
# Preview the first five values of the target series.
y.head(n=5)

0     64.51
1     55.26
2     74.54
3    147.06
4    143.06
Name: energy_cost_brl, dtype: float64

In [55]:
# Separate categorical and numerical columns.
# Object-dtype columns are treated as categorical features.
cat_cols = X.select_dtypes(include='object').columns
# 'number' matches every numeric dtype (int32, float32, unsigned, ...), not only
# int64/float64, so a numeric feature is not silently left out of `num_cols`
# if the CSV is ever loaded with narrower dtypes.
num_cols = X.select_dtypes(include='number').columns

# Show both column groups for a quick sanity check.
print(cat_cols, num_cols)

Index(['customer_type', 'regions'], dtype='object') Index(['building_size_m2', 'occupants'], dtype='object')


In [56]:
# One-hot encode the categorical columns.
# drop_first=True omits the first level of each categorical column, and
# dtype=float emits the indicator columns as floats.
X_encoded = pd.get_dummies(
    data=X,
    columns=cat_cols,
    drop_first=True,
    dtype=float,
)
X_encoded.head()

Unnamed: 0,building_size_m2,occupants,customer_type_residential,regions_North,regions_Northeast,regions_South,regions_Southeast
0,24,2,1.0,0.0,1.0,0.0,0.0
1,24,1,0.0,0.0,0.0,0.0,0.0
2,24,1,0.0,0.0,0.0,0.0,1.0
3,45,4,1.0,0.0,1.0,0.0,0.0
4,45,4,1.0,0.0,0.0,0.0,1.0


In [57]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=327)
# Take explicit copies so the column assignments below write to independent
# frames; assigning into the frames returned by train_test_split can otherwise
# raise pandas' SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Fit the scaler on the training rows only, then apply the same fitted
# statistics to the test rows.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


(4000, 7) (1000, 7) (4000,) (1000,)


In [58]:
# Create the output directory for model artifacts. os.makedirs with
# exist_ok=True behaves like `mkdir -p` but does not depend on a POSIX shell,
# so the cell also works on Windows.
os.makedirs("./model", exist_ok=True)

In [59]:
# Persist the fitted scaler to disk with joblib.
dump(scaler, "./model/stdscaler.joblib")

# Tabulate the scaler's fitted parameters, one row per input feature.
scaler_rows = list(zip(scaler.feature_names_in_, scaler.mean_, scaler.scale_))
pd.DataFrame(scaler_rows, columns=["param", "mean", "scale"])

Unnamed: 0,param,mean,scale
0,building_size_m2,39.6635,17.650305
1,occupants,2.30225,1.03121


In [60]:
# Read the scaler back from the joblib file written earlier in this notebook.
# NOTE: joblib.load unpickles its input, so only load files you trust.
scaler = load("./model/stdscaler.joblib")

# Tabulate the reloaded scaler's parameters, one row per input feature.
reloaded_rows = zip(scaler.feature_names_in_, scaler.mean_, scaler.scale_)
pd.DataFrame(list(reloaded_rows), columns=["param", "mean", "scale"])

Unnamed: 0,param,mean,scale
0,building_size_m2,39.6635,17.650305
1,occupants,2.30225,1.03121


In [61]:
# Create the output directory for the exported splits. os.makedirs with
# exist_ok=True behaves like `mkdir -p` but does not depend on a POSIX shell,
# so the cell also works on Windows.
os.makedirs("./data/", exist_ok=True)

In [62]:
# Write each train/test split to its own CSV file.
# index=False (the documented boolean form of the original index=0) leaves the
# row index out of the files.
split_outputs = {
    "./data/x_train.csv": X_train,
    "./data/x_test.csv": X_test,
    "./data/y_train.csv": y_train,
    "./data/y_test.csv": y_test,
}
for out_file, split in split_outputs.items():
    split.to_csv(out_file, index=False)