# **Feature Engineering Predicting Indoor Air Quality**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install nbformat nbclient



In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def load_data(filename, resample_interval='h'):
    """
    Load, scale, and resample data from a CSV file.

    Parameters:
    - filename (str): The path to the CSV file.
    - resample_interval (str): The resampling interval, e.g., 'H' for hourly, '15T' for 15 minutes.

    Returns:
    - pd.DataFrame: The processed data with scaled values and resampled to the specified interval.
    """
    try:
        # Load data
        data = pd.read_csv(
            filename,
            parse_dates=["ts"],
            index_col="ts"
        )
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None

    # Scale specified columns
    cols_to_scale = data.columns
    scaler = MinMaxScaler()
    data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])

    # Resample data to specified interval and sort index
    data = data.resample(resample_interval).mean()
    data.sort_index(inplace=True)

    return data

# Load hourly data
hourly_data = load_data('/content/drive/MyDrive/ML & Deep Learning/Airquality/gams_indoor.csv', 'h')

# Load 15-minute data
data_15min = load_data('/content/drive/MyDrive/ML & Deep Learning/Airquality/gams_preprocessed_hourly.csv', 'min')


In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def load_hourly_data():
    # Update this path to the correct location in Google Drive
    filename = '/content/drive/MyDrive/ML & Deep Learning/Airquality/gams_indoor.csv'

    try:
        # Load data:
        data = pd.read_csv(
            filename,
            parse_dates=["ts"],
            index_col="ts"
        )
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None

    # Scale specified columns
    cols_to_scale = data.columns
    scaler = MinMaxScaler()
    data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])

    # Resample to hourly data
    data = data.resample('h').mean()
    data.sort_index(inplace=True)

    return data

# Load data
data = load_hourly_data()
if data is not None:
    data.head()


In [5]:
data.head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399


In [6]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Initialize empty list to store results
results = []
df=data.dropna()
# Run ADF test for each column and append results to the list
for col in df.columns:

    result = adfuller(df[col])
    results.append([col, result[0], result[1], result[4]['5%']])

# Convert results list to pandas DataFrame
results_df = pd.DataFrame(results, columns=['Variable', 'ADF Statistic', 'p-value', '5% Critical Value'])

# Print results table
results_df

Unnamed: 0,Variable,ADF Statistic,p-value,5% Critical Value
0,co2,-6.493665,1.209143e-08,-2.862514
1,humidity,-5.658085,9.518321e-07,-2.862513
2,pm10,-9.695736,1.105668e-16,-2.862513
3,pm25,-9.919901,3.010287e-17,-2.862513
4,temperature,-6.93491,1.060737e-09,-2.862514
5,voc,-8.727667,3.255272e-14,-2.862514


In [7]:
import numpy as np  # Import numpy for np.where function

# Extract date and time features
data["Month"] = data.index.month
data["Week"] = data.index.isocalendar().week
data["Day"] = data.index.day
data["Day_of_week"] = data.index.day_of_week
data["Hour"] = data.index.hour

# Determine if it's a weekend
data["is_weekend"] = np.where(data["Day_of_week"] > 4, 1, 0)

# Show new variables
data.head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc,Month,Week,Day,Day_of_week,Hour,is_weekend
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0,11,47,21,0,0,0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459,11,47,21,0,1,0
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619,11,47,21,0,2,0
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179,11,47,21,0,3,0
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399,11,47,21,0,4,0


In [8]:
# Define pollutants list with the column names you want to use
pollutants = ["co2", "humidity", "pm10", "pm25", "temperature", "voc"]
# raw time series
variables = pollutants

# Shift the data forward 1 Hr.
tmp = data[variables].shift(freq="1H")

# Names for the new variables.
tmp.columns = [v + "_lag_1" for v in variables]

# Add the variables to the original data.
print("data size before")
print(data.shape)

data = data.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(data.shape)

data.head()

data size before
(3058, 12)
data size after
(3058, 18)


  tmp = data[variables].shift(freq="1H")


Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc,Month,Week,Day,Day_of_week,Hour,is_weekend,co2_lag_1,humidity_lag_1,pm10_lag_1,pm25_lag_1,temperature_lag_1,voc_lag_1
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0,11,47,21,0,0,0,,,,,,
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459,11,47,21,0,1,0,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619,11,47,21,0,2,0,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179,11,47,21,0,3,0,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399,11,47,21,0,4,0,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179


In [9]:
data[variables].head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399


In [10]:
data.isnull().sum()

Unnamed: 0,0
co2,60
humidity,60
pm10,60
pm25,60
temperature,60
voc,60
Month,0
Week,0
Day,0
Day_of_week,0


In [11]:
# Move forward 24 hrs.
tmp = data[variables].shift(freq="24H")

# Rename the variables.
tmp.columns = [v + "_lag_24" for v in variables]

# Add the features to the original data.
print("data size before")
print(data.shape)

data = data.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(data.shape)

data[variables].head(25)

data size before
(3058, 18)
data size after
(3058, 24)


  tmp = data[variables].shift(freq="24H")


Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399
2016-11-21 05:00:00,0.510435,0.740581,0.069299,0.107438,0.900539,0.000462
2016-11-21 06:00:00,0.596438,0.742474,0.073075,0.113483,0.921834,0.001445
2016-11-21 07:00:00,0.746419,0.747353,0.047713,0.072885,0.950472,0.014052
2016-11-21 08:00:00,0.907359,0.732932,0.028885,0.04143,0.95765,0.007928
2016-11-21 09:00:00,0.97537,0.733449,0.022391,0.03016,0.936674,0.013557


In [12]:
data.isnull().sum()

Unnamed: 0,0
co2,60
humidity,60
pm10,60
pm25,60
temperature,60
voc,60
Month,0
Week,0
Day,0
Day_of_week,0


In [13]:
# Use the mean of the 3 previous hours as input variables.
tmp = (
    data[variables]
    .rolling(window="3H")
    .mean()  # Average the last 3 hr values.
    .shift(freq="1H")  # Move the average 1 hour forward
)

# Rename the columns
tmp.columns = [v + "_window" for v in variables]


# view of the result
tmp.head(10)

  data[variables]


Unnamed: 0_level_0,co2_window,humidity_window,pm10_window,pm25_window,temperature_window,voc_window
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 01:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 02:00:00,0.158667,0.86071,0.046774,0.07114,0.504606,0.000229
2016-11-21 03:00:00,0.224697,0.826709,0.064708,0.09599,0.602004,0.000359
2016-11-21 04:00:00,0.328654,0.769456,0.093299,0.138362,0.759124,0.001086
2016-11-21 05:00:00,0.425147,0.749633,0.108418,0.162453,0.850279,0.001066
2016-11-21 06:00:00,0.476373,0.743591,0.097992,0.149702,0.884858,0.001013
2016-11-21 07:00:00,0.523252,0.739865,0.074957,0.116036,0.904235,0.000768
2016-11-21 08:00:00,0.617764,0.743469,0.063362,0.097935,0.924282,0.00532
2016-11-21 09:00:00,0.750072,0.74092,0.049891,0.075932,0.943318,0.007808
2016-11-21 10:00:00,0.876382,0.737911,0.032996,0.048158,0.948265,0.011845


In [14]:
# Join the new variables to the original data.
print("data size before")
print(data.shape)

data = data.merge(tmp, left_index=True, right_index=True, how="left")

print("data size after")
print(data.shape)

data[ variables].head()

data size before
(3058, 24)
data size after
(3058, 30)


Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,0.143928,0.92541,0.056407,0.087367,0.392345,0.0
2016-11-21 01:00:00,0.173406,0.79601,0.037141,0.054913,0.616867,0.000459
2016-11-21 02:00:00,0.356757,0.758707,0.100577,0.145691,0.7968,0.000619
2016-11-21 03:00:00,0.4558,0.75365,0.142179,0.214482,0.863704,0.002179
2016-11-21 04:00:00,0.462883,0.736541,0.082496,0.127187,0.890333,0.000399


In [15]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.2-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.2-py2.py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.0/375.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.2


In [16]:
from feature_engine.creation import CyclicalFeatures
import numpy as np

# Create features that capture the cyclical representation
cyclical = CyclicalFeatures(
    variables=["Month", "Hour", "Day"],  # The features we want to transform
    drop_original=False  # Whether to drop the original features
)

data = cyclical.fit_transform(data)

In [17]:
cyclical_vars = [var for var in data.columns if "sin" in var or "cos" in var]

data[cyclical_vars].head()

Unnamed: 0_level_0,Month_sin,Month_cos,Hour_sin,Hour_cos,Day_sin,Day_cos
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-11-21 00:00:00,-0.5,0.866025,0.0,1.0,-0.897805,-0.440394
2016-11-21 01:00:00,-0.5,0.866025,0.269797,0.962917,-0.897805,-0.440394
2016-11-21 02:00:00,-0.5,0.866025,0.519584,0.854419,-0.897805,-0.440394
2016-11-21 03:00:00,-0.5,0.866025,0.730836,0.682553,-0.897805,-0.440394
2016-11-21 04:00:00,-0.5,0.866025,0.887885,0.460065,-0.897805,-0.440394


In [18]:
# Determine fraction of missing data.
data.isnull().sum() / len(data)

Unnamed: 0,0
co2,0.019621
humidity,0.019621
pm10,0.019621
pm25,0.019621
temperature,0.019621
voc,0.019621
Month,0.0
Week,0.0
Day,0.0
Day_of_week,0.0


**Imputation**

There is not a lot of data missing

In [19]:
print("data size before")
print(data.shape)

data.dropna(inplace=True)

print("data size after")
print(data.shape)

data size before
(3058, 36)
data size after
(2927, 36)


In [20]:
data.head()

Unnamed: 0_level_0,co2,humidity,pm10,pm25,temperature,voc,Month,Week,Day,Day_of_week,...,pm10_window,pm25_window,temperature_window,voc_window,Month_sin,Month_cos,Hour_sin,Hour_cos,Day_sin,Day_cos
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-11-22 00:00:00,0.125142,0.681239,0.022233,0.033495,0.592737,0.000287,11,47,22,1,...,0.026882,0.041626,0.62421,0.000309,-0.5,0.866025,0.0,1.0,-0.968077,-0.250653
2016-11-22 01:00:00,0.159592,0.659048,0.006791,0.007556,0.613171,0.000258,11,47,22,1,...,0.028853,0.044576,0.607663,0.000319,-0.5,0.866025,0.269797,0.962917,-0.968077,-0.250653
2016-11-22 02:00:00,0.316681,0.633366,0.008445,0.009361,0.645052,0.000405,11,47,22,1,...,0.023795,0.03616,0.604318,0.000239,-0.5,0.866025,0.519584,0.854419,-0.968077,-0.250653
2016-11-22 03:00:00,0.443831,0.611913,0.018425,0.024901,0.666608,0.000563,11,47,22,1,...,0.012489,0.016804,0.616987,0.000317,-0.5,0.866025,0.730836,0.682553,-0.968077,-0.250653
2016-11-22 04:00:00,0.532989,0.624371,0.025304,0.033161,0.695313,0.011083,11,47,22,1,...,0.01122,0.013939,0.64161,0.000409,-0.5,0.866025,0.887885,0.460065,-0.968077,-0.250653


In [21]:
# store new dataset
data.to_csv("gams_preprocessed.csv", index=True)