In [1]:
from os import chdir
# Switch the kernel's working directory before the project imports run.
# NOTE(review): presumably /home/jovyan is the project root that contains the
# `lib` package imported in the next cell — confirm. The absolute path ties the
# notebook to this particular environment.
chdir('/home/jovyan')

In [2]:
from lib import postgres as pg

# 1. How to deal with the missing values?

Replace each missing value with the value recorded exactly 7 days earlier; if that value is also missing, use the value recorded exactly 7 days later.
- The power consumption of an individual household is very dependent on time variables such as **hour of the day** and **day of the week**.
- **1 day = 1440 timesteps** (one reading per minute), so 7 days = 7 × 1440 = 10,080 rows.

In [18]:
# Dry run of the gap-filling rule on a small sample. The inner sub-select picks
# 10 rows whose global_active_power is NULL; every measurement column is then
# taken from the row itself, else from the row at _id - 7*1440, else from the
# row at _id + 7*1440 (COALESCE returns its first non-NULL argument).
# The LEFT JOINs keep a row even when no past/future counterpart exists.
preview_sql = """
SELECT main.* FROM
(
SELECT
    present.datetime,
    coalesce(present.global_active_power, past.global_active_power, future.global_active_power)  as global_active_power,
    coalesce(present.global_reactive_power, past.global_reactive_power, future.global_reactive_power)  as global_reactive_power,
    coalesce(present.voltage, past.voltage, future.voltage)  as voltage,
    coalesce(present.current, past.current, future.current)  as current,
    coalesce(present.sub_metering_1, past.sub_metering_1, future.sub_metering_1)  as sub_metering_1,
    coalesce(present.sub_metering_2, past.sub_metering_2, future.sub_metering_2)  as sub_metering_2,
    coalesce(present.sub_metering_3, past.sub_metering_3, future.sub_metering_3)  as sub_metering_3
FROM 
    (
        SELECT 
            * 
        FROM 
            staging.individual_household_power_consumption 
        WHERE 
            global_active_power is null 
        ORDER BY 1 ASC 
        LIMIT 10
    ) present
LEFT JOIN
    staging.individual_household_power_consumption past
ON 
    past._id = present._id - 7*1440 
LEFT JOIN
    staging.individual_household_power_consumption future
ON
    future._id = present._id + 7*1440
) main
ORDER BY 1 ASC
LIMIT 10
"""

data = pg.load_query_to_df(preview_sql)
data.head()

Unnamed: 0,datetime,global_active_power,global_reactive_power,voltage,current,sub_metering_1,sub_metering_2,sub_metering_3
0,2006-12-21 11:23:00,0.332,0.192,244.66,1.6,0.0,0.0,0.0
1,2006-12-21 11:24:00,0.328,0.188,243.92,1.6,0.0,0.0,0.0
2,2006-12-30 10:08:00,2.73,0.062,240.21,11.2,0.0,0.0,0.0
3,2006-12-30 10:09:00,2.718,0.06,239.38,11.2,0.0,0.0,0.0
4,2007-01-14 18:36:00,4.934,0.34,230.63,22.2,0.0,29.0,16.0


In [19]:
# DDL for the target of the imputed data set: a `clean` schema holding a table
# with a surrogate `_id` key plus the timestamp and the seven measurement
# columns produced by the imputation query.
#
# Both statements use IF NOT EXISTS so the cell is idempotent: without it, a
# second execution (e.g. Restart Kernel -> Run All) aborts with
# "schema "clean" already exists" before the rest of the notebook can run.
# Note that IF NOT EXISTS leaves an existing table untouched, even if its
# definition differs from the one below.
cmd = """
CREATE SCHEMA IF NOT EXISTS clean;
CREATE TABLE IF NOT EXISTS clean.individual_household_power_consumption (
    _id SERIAL PRIMARY KEY,
    datetime TIMESTAMP,
    global_active_power FLOAT,
    global_reactive_power FLOAT,
    voltage FLOAT,
    current FLOAT,
    sub_metering_1 FLOAT,
    sub_metering_2 FLOAT,
    sub_metering_3 FLOAT
);
"""

pg.run_command(cmd)

None


In [5]:
# Apply the gap-filling rule to the whole staging table. For every row, each
# measurement column is the row's own value when present, otherwise the value
# from the row at _id - 7*1440, otherwise the value from the row at
# _id + 7*1440 (COALESCE returns its first non-NULL argument).
# `_id` is not selected; the result is ordered by its first column, datetime.
impute_sql = """
SELECT main.* FROM
(
SELECT
    present.datetime,
    coalesce(present.global_active_power, past.global_active_power, future.global_active_power)  as global_active_power,
    coalesce(present.global_reactive_power, past.global_reactive_power, future.global_reactive_power)  as global_reactive_power,
    coalesce(present.voltage, past.voltage, future.voltage)  as voltage,
    coalesce(present.current, past.current, future.current)  as current,
    coalesce(present.sub_metering_1, past.sub_metering_1, future.sub_metering_1)  as sub_metering_1,
    coalesce(present.sub_metering_2, past.sub_metering_2, future.sub_metering_2)  as sub_metering_2,
    coalesce(present.sub_metering_3, past.sub_metering_3, future.sub_metering_3)  as sub_metering_3
FROM 
    staging.individual_household_power_consumption present
LEFT JOIN
    staging.individual_household_power_consumption past
ON 
    past._id = present._id - 7*1440 
LEFT JOIN
    staging.individual_household_power_consumption future
ON
    future._id = present._id + 7*1440
) main
ORDER BY 1 ASC;
"""

data = pg.load_query_to_df(impute_sql)
data.head()

Unnamed: 0,datetime,global_active_power,global_reactive_power,voltage,current,sub_metering_1,sub_metering_2,sub_metering_3
0,2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [6]:
# Write the imputed frame into clean.individual_household_power_consumption.
# Per the recorded output of this cell, the helper exports the frame to a
# temporary CSV, truncates the target table and then loads the file, so a
# re-run appears to replace the table contents rather than append to them.
target_schema = 'clean'
target_table = 'individual_household_power_consumption'
pg.bulk_load_df(data, target_schema, target_table)

Connecting to Database
Starting DataFrame CSV export...
CSV File has been created
Truncated individual_household_power_consumption
Loaded data into individual_household_power_consumption
DB connection closed.
Removing temporary files...
Done.
Elapsed time: 27.292452812194824 seconds


In [10]:
# Sanity check of the load: read back 10 rows of the clean table, ordered by
# its first column, and show the first few of them.
check_sql = """
SELECT main.* FROM
(
SELECT
    *
FROM 
    clean.individual_household_power_consumption
) main
ORDER BY 1 ASC
LIMIT 10;
"""

check_target = pg.load_query_to_df(check_sql)
check_target.head()

Unnamed: 0,_id,datetime,global_active_power,global_reactive_power,voltage,current,sub_metering_1,sub_metering_2,sub_metering_3
0,1,2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,2,2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,3,2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,4,2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,5,2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
