# Milestone 3: Feature Scaling and Data Splitting

This notebook prepares the energy consumption dataset
for machine learning by scaling features and splitting
the data into training and testing sets.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


In [2]:
df = pd.read_csv(
    "../data/household_power_consumption.txt",
    sep=";",
    na_values="?",
    low_memory=False
)

df.head()


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [3]:
df["datetime"] = pd.to_datetime(
    df["Date"] + " " + df["Time"],
    dayfirst=True
)

df.set_index("datetime", inplace=True)
df.drop(["Date", "Time"], axis=1, inplace=True)

df.fillna(method="ffill", inplace=True)

df.head()


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [4]:
features = [
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3"
]

data = df[features]
data.head()


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [5]:
scaler = MinMaxScaler()

scaled_data = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index
)

scaled_data.head()


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,0.374796,0.300719,0.37609,0.377593,0.0,0.0125,0.548387
2006-12-16 17:25:00,0.478363,0.313669,0.336995,0.473029,0.0,0.0125,0.516129
2006-12-16 17:26:00,0.479631,0.358273,0.32601,0.473029,0.0,0.025,0.548387
2006-12-16 17:27:00,0.480898,0.361151,0.340549,0.473029,0.0,0.0125,0.548387
2006-12-16 17:28:00,0.325005,0.379856,0.403231,0.323651,0.0,0.0125,0.548387


In [6]:
train_size = int(len(scaled_data) * 0.8)

train_data = scaled_data.iloc[:train_size]
test_data = scaled_data.iloc[train_size:]

print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)


Train shape: (1660207, 7)
Test shape: (415052, 7)


In [7]:
train_data.describe()


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,1660207.0,1660207.0,1660207.0,1660207.0,1660207.0,1660207.0,1660207.0
mean,0.09363339,0.08789877,0.566629,0.09351436,0.01316743,0.01689285,0.2053034
std,0.09877642,0.08051198,0.1083199,0.09515078,0.0712991,0.07504451,0.2710862
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.02064096,0.03309353,0.5033926,0.02489627,0.0,0.0,0.0
50%,0.0468948,0.07194245,0.5725363,0.04979253,0.0,0.0,0.03225806
75%,0.133442,0.1381295,0.6358643,0.1286307,0.0,0.0125,0.5483871
max,1.0,1.0,1.0,1.0,0.9545455,1.0,1.0


In [8]:
test_data.describe()


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,415052.0,415052.0,415052.0,415052.0,415052.0,415052.0,415052.0
mean,0.082722,0.092031,0.583501,0.082813,0.010455,0.012926,0.213717
std,0.079563,0.082575,0.086514,0.076605,0.061728,0.060192,0.273527
min,0.005613,0.0,0.068498,0.008299,0.0,0.0,0.0
25%,0.022995,0.03741,0.530856,0.024896,0.0,0.0,0.032258
50%,0.048705,0.070504,0.58546,0.049793,0.0,0.0,0.032258
75%,0.124932,0.142446,0.634895,0.120332,0.0,0.0125,0.580645
max,0.873438,0.808633,0.979968,0.887967,1.0,0.975,1.0
