In [1]:
from os import chdir
# Run the rest of the notebook from /home/jovyan.
# NOTE(review): presumably needed so that the project-local `lib` package
# imported in the next cell resolves from this directory — confirm.
chdir('/home/jovyan')

In [2]:
from lib import postgres as pg

# 1. Feature engineering

Create new features to be used by the model

In [None]:
# Sanity check: fetch the first ten rows (ordered by the first column) of the
# clean IHPC table that the feature queries read from.
_ihpc_preview_sql = """
SELECT main.* FROM
(
SELECT
    *
FROM 
    clean.individual_household_power_consumption
) main
ORDER BY 1 ASC
LIMIT 10;
"""
check_target = pg.load_query_to_df(_ihpc_preview_sql)
check_target.head()

In [None]:
# Same sanity check for the `_h` variant of the clean IHPC table: first ten
# rows, ordered by the first column.
_ihpc_h_preview_sql = """
SELECT main.* FROM
(
SELECT
    *
FROM 
    clean.individual_household_power_consumption_h
) main
ORDER BY 1 ASC
LIMIT 10;
"""
check_target = pg.load_query_to_df(_ihpc_h_preview_sql)
check_target.head()

## 1.2. Time features

The central objective of load forecasting is to predict the power consumption at an instant **t**, given information about its past behavior and additional information about other variables that may be known not only for the past but also for the future, such as *calendar information*. The Individual Household Power Consumption Data Set, referred to here as **IHPC**, contains different *electric signals* measured in the household together with the *timestamp* of each instant, from which it is possible to extract calendar information. The following features will be generated for each record of the dataset:

Feature | Type | Description
:---: | :---: | :---
**is_workday** | *BOOLEAN* | day is between monday and friday
**is_morning** | *BOOLEAN* | hour is between 07:00h and 11:59h
**is_afternoon** | *BOOLEAN* | hour is between 12:00h and 17:59h
**is_night** | *BOOLEAN* | hour is between 18:00h and 23:59h
**is_dawn** | *BOOLEAN* | hour is between 00:00h and 06:59h
**is_monday** | *BOOLEAN* | day is monday
**is_tuesday** | *BOOLEAN* | day is tuesday
**is_wednesday** | *BOOLEAN* | day is wednesday
**is_thursday** | *BOOLEAN* | day is thursday
**is_friday** | *BOOLEAN* | day is friday
**is_saturday** | *BOOLEAN* | day is saturday
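**is_sunday** | *BOOLEAN* | day is sunday (not stored as its own column by the queries and tables below; it is implied when every other weekday flag is false)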
**is_winter** | *BOOLEAN* | season is winter
**is_summer** | *BOOLEAN* | season is summer
**is_autumn** | *BOOLEAN* | season is autumn
**is_spring** | *BOOLEAN* | season is spring

In [19]:
# Preview of the calendar flags derived from each record's timestamp
# (first ten rows, ordered by _id).
#
# * EXTRACT(DOW ...) in PostgreSQL returns 0 for Sunday through 6 for Saturday,
#   so (0, 6) is the weekend.
# * The time-of-day buckets use half-open ranges [start, end) instead of
#   BETWEEN ... AND 'hh:59:59', so a timestamp carrying fractional seconds
#   (e.g. 11:59:59.5) still lands in exactly one bucket.
# * Seasons are assigned by calendar month (Dec-Feb winter, Mar-May spring,
#   Jun-Aug summer, Sep-Nov autumn).
#   NOTE(review): this is a northern-hemisphere convention — confirm it
#   matches the household's location.
# * CASE ... ELSE FALSE is kept on purpose: a NULL timestamp yields FALSE
#   rather than NULL for these flags.
time_features = pg.load_query_to_df("""
SELECT main.* FROM
(
SELECT
    _id,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) IN (0,6) THEN FALSE ELSE TRUE END AS is_workday,
    CASE WHEN CAST(present.datetime AS time) >= '07:00:00' AND CAST(present.datetime AS time) < '12:00:00' THEN TRUE ELSE FALSE END AS is_morning,
    CASE WHEN CAST(present.datetime AS time) >= '12:00:00' AND CAST(present.datetime AS time) < '18:00:00' THEN TRUE ELSE FALSE END AS is_afternoon,
    CASE WHEN CAST(present.datetime AS time) >= '18:00:00' THEN TRUE ELSE FALSE END AS is_night,
    CASE WHEN CAST(present.datetime AS time) < '07:00:00' THEN TRUE ELSE FALSE END AS is_dawn,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 1 THEN TRUE ELSE FALSE END AS is_monday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 2 THEN TRUE ELSE FALSE END AS is_tuesday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 3 THEN TRUE ELSE FALSE END AS is_wednesday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 4 THEN TRUE ELSE FALSE END AS is_thursday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 5 THEN TRUE ELSE FALSE END AS is_friday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 6 THEN TRUE ELSE FALSE END AS is_saturday,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (12,1,2) THEN TRUE ELSE FALSE END AS is_winter,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (6,7,8) THEN TRUE ELSE FALSE END AS is_summer,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (9,10,11) THEN TRUE ELSE FALSE END AS is_autumn,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (3,4,5) THEN TRUE ELSE FALSE END AS is_spring
FROM
    clean.individual_household_power_consumption present
) main
ORDER BY 1 ASC
LIMIT 10;
""")
time_features.head()

Unnamed: 0,_id,is_workday,is_morning,is_afternoon,is_night,is_dawn,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_winter,is_summer,is_autumn,is_spring
0,1,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
1,2,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
2,3,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
3,4,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
4,5,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False


In [18]:
# DDL for the two time-feature tables: features.IHPC_time and
# features.IHPCh_time. Both share the same layout — an integer _id primary key
# plus one NOT NULL boolean per calendar flag.
#
# Every statement uses IF NOT EXISTS so the cell can be re-run (e.g. after a
# kernel restart) without failing on "schema/table already exists".
# Note that IF NOT EXISTS leaves an existing table untouched: if the column
# list below changes, drop the old table manually before re-running.
cmd = """
CREATE SCHEMA IF NOT EXISTS features;

CREATE TABLE IF NOT EXISTS features.IHPC_time (
    _id INT PRIMARY KEY,
    is_workday BOOLEAN NOT NULL,
    is_morning BOOLEAN NOT NULL,
    is_afternoon BOOLEAN NOT NULL,
    is_night BOOLEAN NOT NULL,
    is_dawn BOOLEAN NOT NULL,
    is_monday BOOLEAN NOT NULL,
    is_tuesday BOOLEAN NOT NULL,
    is_wednesday BOOLEAN NOT NULL,
    is_thursday BOOLEAN NOT NULL,
    is_friday BOOLEAN NOT NULL,
    is_saturday BOOLEAN NOT NULL,
    is_winter BOOLEAN NOT NULL,
    is_summer BOOLEAN NOT NULL,
    is_autumn BOOLEAN NOT NULL,
    is_spring BOOLEAN NOT NULL
);

CREATE TABLE IF NOT EXISTS features.IHPCh_time (
    _id INT PRIMARY KEY,
    is_workday BOOLEAN NOT NULL,
    is_morning BOOLEAN NOT NULL,
    is_afternoon BOOLEAN NOT NULL,
    is_night BOOLEAN NOT NULL,
    is_dawn BOOLEAN NOT NULL,
    is_monday BOOLEAN NOT NULL,
    is_tuesday BOOLEAN NOT NULL,
    is_wednesday BOOLEAN NOT NULL,
    is_thursday BOOLEAN NOT NULL,
    is_friday BOOLEAN NOT NULL,
    is_saturday BOOLEAN NOT NULL,
    is_winter BOOLEAN NOT NULL,
    is_summer BOOLEAN NOT NULL,
    is_autumn BOOLEAN NOT NULL,
    is_spring BOOLEAN NOT NULL
);
"""

pg.run_command(cmd)

None


In [20]:
# Compute the calendar flags for every row of the clean IHPC table, ordered by
# _id; the resulting frame is what gets loaded into features.IHPC_time.
#
# * EXTRACT(DOW ...) in PostgreSQL returns 0 for Sunday through 6 for Saturday,
#   so (0, 6) is the weekend.
# * The time-of-day buckets use half-open ranges [start, end) instead of
#   BETWEEN ... AND 'hh:59:59', so a timestamp carrying fractional seconds
#   (e.g. 11:59:59.5) still lands in exactly one bucket.
# * Seasons are assigned by calendar month (Dec-Feb winter, Mar-May spring,
#   Jun-Aug summer, Sep-Nov autumn).
# * CASE ... ELSE FALSE is kept on purpose: a NULL timestamp yields FALSE
#   rather than NULL for these flags, which the NOT NULL target columns need.
time_features_data = pg.load_query_to_df("""
SELECT main.* FROM
(
SELECT
    _id,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) IN (0,6) THEN FALSE ELSE TRUE END AS is_workday,
    CASE WHEN CAST(present.datetime AS time) >= '07:00:00' AND CAST(present.datetime AS time) < '12:00:00' THEN TRUE ELSE FALSE END AS is_morning,
    CASE WHEN CAST(present.datetime AS time) >= '12:00:00' AND CAST(present.datetime AS time) < '18:00:00' THEN TRUE ELSE FALSE END AS is_afternoon,
    CASE WHEN CAST(present.datetime AS time) >= '18:00:00' THEN TRUE ELSE FALSE END AS is_night,
    CASE WHEN CAST(present.datetime AS time) < '07:00:00' THEN TRUE ELSE FALSE END AS is_dawn,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 1 THEN TRUE ELSE FALSE END AS is_monday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 2 THEN TRUE ELSE FALSE END AS is_tuesday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 3 THEN TRUE ELSE FALSE END AS is_wednesday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 4 THEN TRUE ELSE FALSE END AS is_thursday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 5 THEN TRUE ELSE FALSE END AS is_friday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 6 THEN TRUE ELSE FALSE END AS is_saturday,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (12,1,2) THEN TRUE ELSE FALSE END AS is_winter,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (6,7,8) THEN TRUE ELSE FALSE END AS is_summer,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (9,10,11) THEN TRUE ELSE FALSE END AS is_autumn,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (3,4,5) THEN TRUE ELSE FALSE END AS is_spring
FROM
    clean.individual_household_power_consumption present
) main
ORDER BY 1 ASC
""")
time_features_data.head()

Unnamed: 0,_id,is_workday,is_morning,is_afternoon,is_night,is_dawn,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_winter,is_summer,is_autumn,is_spring
0,1,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
1,2,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
2,3,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
3,4,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
4,5,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False


In [21]:
# Persist the flags computed above into features.IHPC_time.
# Per the recorded output, the helper truncates the target table before
# loading, so re-running this cell replaces the table's contents.
pg.bulk_load_df(
    time_features_data,
    "features",
    "IHPC_time",
)

Connecting to Database
Starting DataFrame CSV export...
CSV File has been created
The table features.IHPC_time has been successfully truncated.
It wasn't possible to reset serial _id.
The data has been succesfully loaded to table features.IHPC_time
DB connection closed.
Removing temporary files...
Done.
Elapsed time: 33.88196897506714 seconds


In [22]:
# Read back the first five rows of features.IHPC_time (ordered by the first
# column) to confirm the bulk load wrote what was computed.
_ihpc_time_check_sql = """
SELECT main.* FROM
(
SELECT
    *
FROM 
    features.IHPC_time
) main
ORDER BY 1 ASC
LIMIT 5;
"""
check = pg.load_query_to_df(_ihpc_time_check_sql)
check.head()

Unnamed: 0,_id,is_workday,is_morning,is_afternoon,is_night,is_dawn,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_winter,is_summer,is_autumn,is_spring
0,1,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
1,2,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
2,3,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
3,4,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
4,5,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False


In [23]:
# Compute the calendar flags for every row of the `_h` variant of the clean
# IHPC table, ordered by _id; the resulting frame is what gets loaded into
# features.IHPCh_time.
#
# * EXTRACT(DOW ...) in PostgreSQL returns 0 for Sunday through 6 for Saturday,
#   so (0, 6) is the weekend.
# * The time-of-day buckets use half-open ranges [start, end) instead of
#   BETWEEN ... AND 'hh:59:59', so a timestamp carrying fractional seconds
#   (e.g. 11:59:59.5) still lands in exactly one bucket.
# * Seasons are assigned by calendar month (Dec-Feb winter, Mar-May spring,
#   Jun-Aug summer, Sep-Nov autumn).
# * CASE ... ELSE FALSE is kept on purpose: a NULL timestamp yields FALSE
#   rather than NULL for these flags, which the NOT NULL target columns need.
time_features_data = pg.load_query_to_df("""
SELECT main.* FROM
(
SELECT
    _id,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) IN (0,6) THEN FALSE ELSE TRUE END AS is_workday,
    CASE WHEN CAST(present.datetime AS time) >= '07:00:00' AND CAST(present.datetime AS time) < '12:00:00' THEN TRUE ELSE FALSE END AS is_morning,
    CASE WHEN CAST(present.datetime AS time) >= '12:00:00' AND CAST(present.datetime AS time) < '18:00:00' THEN TRUE ELSE FALSE END AS is_afternoon,
    CASE WHEN CAST(present.datetime AS time) >= '18:00:00' THEN TRUE ELSE FALSE END AS is_night,
    CASE WHEN CAST(present.datetime AS time) < '07:00:00' THEN TRUE ELSE FALSE END AS is_dawn,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 1 THEN TRUE ELSE FALSE END AS is_monday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 2 THEN TRUE ELSE FALSE END AS is_tuesday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 3 THEN TRUE ELSE FALSE END AS is_wednesday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 4 THEN TRUE ELSE FALSE END AS is_thursday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 5 THEN TRUE ELSE FALSE END AS is_friday,
    CASE WHEN EXTRACT( DOW FROM present.datetime ) = 6 THEN TRUE ELSE FALSE END AS is_saturday,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (12,1,2) THEN TRUE ELSE FALSE END AS is_winter,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (6,7,8) THEN TRUE ELSE FALSE END AS is_summer,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (9,10,11) THEN TRUE ELSE FALSE END AS is_autumn,
    CASE WHEN EXTRACT( MONTH FROM present.datetime ) IN (3,4,5) THEN TRUE ELSE FALSE END AS is_spring
FROM
    clean.individual_household_power_consumption_h present
) main
ORDER BY 1 ASC
""")
time_features_data.head()

Unnamed: 0,_id,is_workday,is_morning,is_afternoon,is_night,is_dawn,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_winter,is_summer,is_autumn,is_spring
0,1,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
1,2,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
2,3,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
3,4,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
4,5,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False


In [24]:
# Persist the flags computed above into features.IHPCh_time.
# Per the recorded output, the helper truncates the target table before
# loading, so re-running this cell replaces the table's contents.
pg.bulk_load_df(
    time_features_data,
    "features",
    "IHPCh_time",
)

Connecting to Database
Starting DataFrame CSV export...
CSV File has been created
The table features.IHPCh_time has been successfully truncated.
It wasn't possible to reset serial _id.
The data has been succesfully loaded to table features.IHPCh_time
DB connection closed.
Removing temporary files...
Done.
Elapsed time: 0.5748617649078369 seconds


In [25]:
# Read back the first five rows of features.IHPCh_time (ordered by the first
# column) to confirm the bulk load wrote what was computed.
_ihpch_time_check_sql = """
SELECT main.* FROM
(
SELECT
    *
FROM 
    features.IHPCh_time
) main
ORDER BY 1 ASC
LIMIT 5;
"""
check = pg.load_query_to_df(_ihpch_time_check_sql)
check.head()

Unnamed: 0,_id,is_workday,is_morning,is_afternoon,is_night,is_dawn,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_winter,is_summer,is_autumn,is_spring
0,1,False,False,True,False,False,False,False,False,False,False,True,True,False,False,False
1,2,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
2,3,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
3,4,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
4,5,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False
