In [None]:
from os import chdir
chdir('/home/jovyan')

# 1. [Individual household electric power consumption Data Set](https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption)

### Abstract 

Measurements of electric power consumption in one household with a one-minute sampling rate over a period of almost 4 years. Different electrical quantities and some sub-metering values are available.

Property | Description
:---: | :--- 
Type | Multivariate, Time-Series
Area | Physical
Attribute Characteristics | Real
Number of Attributes | 9
Date Donated | 2012-08-30
Associated Tasks | Regression, Clustering
Missing Values | Yes

### Source

Georges Hebrail (georges.hebrail '@' edf.fr), Senior Researcher, EDF R&D, Clamart, France
Alice Berard, TELECOM ParisTech Master of Engineering Internship at EDF R&D, Clamart, France


### Data Set Information

This archive contains **2075259 measurements** gathered in a house located in Sceaux (7km of Paris, France) between December 2006 and November 2010 (47 months).

#### **Notes**:

1. (global_active_power*1000/60 - sub_metering_1 - sub_metering_2 - sub_metering_3) represents the active energy consumed every minute (in watt hour) in the household by electrical equipment not measured in sub-meterings 1, 2 and 3.


2. The dataset contains some missing values in the measurements (nearly 1,25% of the rows). All calendar timestamps are present in the dataset but for some timestamps, the measurement values are missing: a missing value is represented by the absence of value between two consecutive semi-colon attribute separators. For instance, the dataset shows missing values on April 28, 2007.



Attribute | Information
:---: | :---
date | Date in format dd/mm/yyyy
time | time in format hh:mm:ss
global_active_power | household global minute-averaged active power (in kilowatt)
global_reactive_power | household global minute-averaged reactive power (in kilowatt)
voltage | minute-averaged voltage (in volt)
global_intensity | household global minute-averaged current intensity (in ampere)
sub_metering_1 | energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
sub_metering_2 | energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
sub_metering_3 | energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.


## 1.1. Download and unzip data

In [2]:
!wget -P data/ \
https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip

--2019-10-08 18:25:02--  https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20640916 (20M) [application/x-httpd-php]
Saving to: ‘data/household_power_consumption.zip’


2019-10-08 18:25:04 (13.3 MB/s) - ‘data/household_power_consumption.zip’ saved [20640916/20640916]



In [3]:
%%bash
unzip -u data/household_power_consumption.zip -d data/
rm -rf data/*.zip*
sed '/^\s*$/d' data/household_power_consumption.txt > data/household_power_consumption.csv
rm -rf data/*.txt

Archive:  data/household_power_consumption.zip
  inflating: data/household_power_consumption.txt  


## 1.2. Check Dataset Schema

In [4]:
import pandas as pd

dataset = pd.read_csv('data/household_power_consumption.csv',
                      sep=";")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
dataset.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [6]:
dataset.dtypes

Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object

CREATE TABLE statement:

`
CREATE TABLE raw.individual_household_power_consumption (
    _id SERIAL PRIMARY KEY,
    Date TEXT,
    Time TEXT,
    Global_active_power TEXT,
    Global_reactive_power TEXT,
    Voltage TEXT,
    Global_intensity TEXT,
    Sub_metering_1 TEXT,
    Sub_metering_2 TEXT,
    Sub_metering_3 TEXT
);`

## 1.3. Create postgres functions

In [14]:
import psycopg2
import os
import sys
from lib import postgres
from datetime import datetime

### 1.3.1. Function to execute scripts:

In [None]:
def run_command(command):
    conn, cur = postgres.connect_to_postgres()
    cur.execute('BEGIN;')
    print(cur.execute(command))
    cur.execute('COMMIT;')
    conn.close()

In [None]:
cmd = """
CREATE TABLE raw.individual_household_power_consumption (
    _id SERIAL PRIMARY KEY,
    Date TEXT,
    Time TEXT,
    Global_active_power TEXT,
    Global_reactive_power TEXT,
    Voltage TEXT,
    Global_intensity TEXT,
    Sub_metering_1 TEXT,
    Sub_metering_2 TEXT,
    Sub_metering_3 TEXT
);"""

run_command(cmd)

### 1.3.2. Function to perform bulk_load on Postgres:

In [41]:
def bulk_load_df(df, schema, table, temp_path = 'data/tmp/'):
    '''
    This function creates a csv file from PostgreSQL with query
    '''
    try:
        # Connect to DB
        conn, cur = postgres.connect_to_postgres()
        print("Connecting to Database")
        
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)
        
        # Write to CSV file
        temp_csv_name = '{}.{}_{}.csv'.format( schema, table, datetime.now() )
        temp_csv_path = temp_path + temp_csv_name
        df.to_csv(temp_csv_path, encoding='utf-8', header = True, doublequote = True, sep=',', index=False)
        print("CSV File has been created")
        
        # Truncate the target table
        cur.execute('BEGIN;')
        cur.execute("TRUNCATE {}.{};".format(schema,table))
        cur.execute('COMMIT;')
        print("Truncated {}".format(table))
        
        # Load table from the file with header
        cur.execute("BEGIN;")
        f = open(temp_csv_path, "r")
        cur.copy_expert("COPY {}.{}({}) FROM STDIN CSV HEADER QUOTE '\"'".format(schema,table,','.join(dataset.columns.values)), f)
        cur.execute("COMMIT;")
        print("Loaded data into {}".format(table))
        
        # Closing the connection
        conn.close()
        print("DB connection closed.")
        
        # Remove temp CSV file
        os.remove(temp_csv_path)

    except Exception as e:
        print("Error: {}".format(str(e)))
        sys.exit(1)

In [42]:
bulk_load_df(dataset,'raw','individual_household_power_consumption')

Connecting to Database
CSV File has been created
Truncated individual_household_power_consumption
Loaded data into individual_household_power_consumption
DB connection closed.


### 1.3.3. Function to read SQL querys to pandas DataFrame

In [43]:
def load_query_to_df(sql_command):
    '''
    This function loads the results from a query to a dataframe.
    '''
    # Connect to DB
    conn, cur = postgres.connect_to_postgres()

    # Load the data
    data = pd.read_sql(sql_command, conn)
    conn.close()
    
    return data

In [53]:
sql_df = load_data("""
SELECT 
    TO_TIMESTAMP(CONCAT(Date,' ',Time),'DD-MM-YYYY HH24:MI:SS') as dt,
    Global_active_power as global_active_power,
    Global_reactive_power as global_reactive_power,
    Voltage as voltage,
    Global_intensity as current,
    Sub_metering_1 as sub_metering_1 
    Sub_metering_2
     Sub_metering_3
FROM raw.individual_household_power_consumption
LIMIT 10;
""")
sql_df.head()

Unnamed: 0,dt
0,2006-12-16 17:24:00+00:00
1,2006-12-16 17:25:00+00:00
2,2006-12-16 17:26:00+00:00
3,2006-12-16 17:27:00+00:00
4,2006-12-16 17:28:00+00:00


In [55]:
sql_df.values

array([[Timestamp('2006-12-16 17:24:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:25:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:26:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:27:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:28:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:29:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:30:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:31:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:32:00+0000', tz='UTC')],
       [Timestamp('2006-12-16 17:33:00+0000', tz='UTC')]], dtype=object)