In [1]:
from os import chdir
chdir('/home/jovyan')

In [2]:
import pandas as pd
from lib import postgres as pg

# 1. How many rows with missing data?

In [3]:
sql_df = pg.load_query_to_df("""
SELECT count(*) as rows_with_missing_data FROM
(
SELECT 
    TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS') as dt,
    CASE WHEN Global_active_power = '?' THEN null ELSE CAST(Global_active_power AS NUMERIC) END as global_active_power,
    CASE WHEN Global_reactive_power = '?' THEN null ELSE  CAST(Global_reactive_power AS NUMERIC) END as global_reactive_power,
    CASE WHEN Voltage = '?' THEN null ELSE  CAST(Voltage AS NUMERIC) END as voltage,
    CASE WHEN Global_intensity = '?' THEN null ELSE  CAST(Global_intensity AS NUMERIC) END as current,
    CASE WHEN Sub_metering_1 = '?' THEN null ELSE  CAST(Sub_metering_1 AS NUMERIC) END as sub_metering_1, 
    CASE WHEN Sub_metering_2 = '?' THEN null ELSE  CAST(Sub_metering_2 AS NUMERIC) END as sub_metering_2,
    CASE WHEN Sub_metering_3 = '?' THEN null ELSE  CAST(Sub_metering_3 AS NUMERIC) END as sub_metering_3
FROM raw.individual_household_power_consumption
) Z
WHERE Z.global_active_power is null
LIMIT 10;
""")
sql_df.head()

Unnamed: 0,rows_with_missing_data
0,25979


# 2. How many dozens with missing timesteps?

In [4]:
sql_df = pg.load_query_to_df("""
SELECT count(*) as dozens_with_missing_timesteps FROM
(
SELECT 
    DATE(TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS')) as date,
    EXTRACT(HOUR FROM TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS')) as hour,
    -- EXTRACT(MINUTES FROM TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS')) as minute,
    FLOOR(EXTRACT(MINUTES FROM TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS'))/10) as m,
    count(*) as count
FROM raw.individual_household_power_consumption
GROUP BY 1,2,3
ORDER BY 1,2,3
) Z
WHERE count != 10
LIMIT 10;
""")
sql_df.head()

Unnamed: 0,dozens_with_missing_timesteps
0,2


# 3. Move raw data to appropriate schema

In [5]:
sql_df = pg.load_query_to_df("""
SELECT main.* FROM
(
SELECT 
    TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS') as datetime,
    CASE WHEN Global_active_power = '?' THEN null ELSE CAST(Global_active_power AS NUMERIC) END as global_active_power,
    CASE WHEN Global_reactive_power = '?' THEN null ELSE  CAST(Global_reactive_power AS NUMERIC) END as global_reactive_power,
    CASE WHEN Voltage = '?' THEN null ELSE  CAST(Voltage AS NUMERIC) END as voltage,
    CASE WHEN Global_intensity = '?' THEN null ELSE  CAST(Global_intensity AS NUMERIC) END as current,
    CASE WHEN Sub_metering_1 = '?' THEN null ELSE  CAST(Sub_metering_1 AS NUMERIC) END as sub_metering_1, 
    CASE WHEN Sub_metering_2 = '?' THEN null ELSE  CAST(Sub_metering_2 AS NUMERIC) END as sub_metering_2,
    CASE WHEN Sub_metering_3 = '?' THEN null ELSE  CAST(Sub_metering_3 AS NUMERIC) END as sub_metering_3
FROM raw.individual_household_power_consumption
) main
ORDER BY 1 ASC
LIMIT 10;
""")
sql_df.head()

Unnamed: 0,datetime,global_active_power,global_reactive_power,voltage,current,sub_metering_1,sub_metering_2,sub_metering_3
0,2006-12-16 17:24:00+00:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,2006-12-16 17:25:00+00:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,2006-12-16 17:26:00+00:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,2006-12-16 17:27:00+00:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,2006-12-16 17:28:00+00:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [6]:
sql_df.dtypes

datetime                 datetime64[ns, UTC]
global_active_power                  float64
global_reactive_power                float64
voltage                              float64
current                              float64
sub_metering_1                       float64
sub_metering_2                       float64
sub_metering_3                       float64
dtype: object

CREATE TABLE statement:

`
CREATE SCHEMA staging;
CREATE TABLE staging.individual_household_power_consumption (
    _id SERIAL PRIMARY KEY,
    datetime TIMESTAMP,
    global_active_power FLOAT,
    global_reactive_power FLOAT,
    voltage FLOAT,
    current FLOAT,
    sub_metering_1 FLOAT,
    sub_metering_2 FLOAT,
    sub_metering_3 FLOAT
);`

In [7]:
cmd = """
CREATE SCHEMA staging;
CREATE TABLE staging.individual_household_power_consumption (
    _id SERIAL PRIMARY KEY,
    datetime TIMESTAMP,
    global_active_power FLOAT,
    global_reactive_power FLOAT,
    voltage FLOAT,
    current FLOAT,
    sub_metering_1 FLOAT,
    sub_metering_2 FLOAT,
    sub_metering_3 FLOAT
);
"""

pg.run_command(cmd)

None


In [8]:
data = pg.load_query_to_df("""
SELECT main.* FROM
(
SELECT 
    TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS') as datetime,
    CASE WHEN Global_active_power = '?' THEN null ELSE CAST(Global_active_power AS NUMERIC) END as global_active_power,
    CASE WHEN Global_reactive_power = '?' THEN null ELSE  CAST(Global_reactive_power AS NUMERIC) END as global_reactive_power,
    CASE WHEN Voltage = '?' THEN null ELSE  CAST(Voltage AS NUMERIC) END as voltage,
    CASE WHEN Global_intensity = '?' THEN null ELSE  CAST(Global_intensity AS NUMERIC) END as current,
    CASE WHEN Sub_metering_1 = '?' THEN null ELSE  CAST(Sub_metering_1 AS NUMERIC) END as sub_metering_1, 
    CASE WHEN Sub_metering_2 = '?' THEN null ELSE  CAST(Sub_metering_2 AS NUMERIC) END as sub_metering_2,
    CASE WHEN Sub_metering_3 = '?' THEN null ELSE  CAST(Sub_metering_3 AS NUMERIC) END as sub_metering_3
FROM raw.individual_household_power_consumption
) main
ORDER BY 1 ASC;
""")
data.head()

Unnamed: 0,datetime,global_active_power,global_reactive_power,voltage,current,sub_metering_1,sub_metering_2,sub_metering_3
0,2006-12-16 17:24:00+00:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,2006-12-16 17:25:00+00:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,2006-12-16 17:26:00+00:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,2006-12-16 17:27:00+00:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,2006-12-16 17:28:00+00:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [9]:
pg.bulk_load_df(data, 'staging', 'individual_household_power_consumption')

Connecting to Database
Starting DataFrame CSV export...
CSV File has been created
Truncated individual_household_power_consumption
Loaded data into individual_household_power_consumption
DB connection closed.
Removing temporary files...
Done.
Elapsed time: 38.70808172225952 seconds
