# Data Warehouse with Redshift - ETL
Use this notebook to develop and test the ETL process for each of your tables before completing the `etl.py` file, which loads the full datasets.

>
> **Stephanie Anderton**  
> DEND Project \#3  
> May 29, 2019
>

In [1]:
import configparser
import psycopg2
import pandas as pd
import json
import time
import mylib
from   mylib import logger
import re
from   sql_queries import copy_table_queries, insert_table_queries
from   sql_queries import count_table_queries

In [2]:
# Controls whether the staging tables are loaded before the final tables.
# On the first run through, this MUST be set to True
# UNLESS you created the tables from the command line!

b_staging_tables = False

## Function Definitions

In [3]:
def load_staging_tables(cur, conn):
    """Run every query in ``copy_table_queries`` to populate the staging tables.

    Each query is executed and committed on its own.  A failing query is
    reported (log entry, error text and the query itself) and the loop moves
    on to the next one.

    Parameters
    ----------
    cur : psycopg2 cursor
        Cursor used to execute the queries.
    conn : psycopg2 connection
        Connection that owns ``cur``; used to commit or roll back.
    """
    logger.info('Load staging tables...')

    for query in copy_table_queries:
        # The table name is the 2nd word in the query string.  It is only
        # used for log messages, so fall back to a placeholder rather than
        # raising IndexError on an unexpectedly short query.
        words = re.findall(r'\w+', query)
        table = words[1] if len(words) > 1 else 'unknown'
        logger.info('load staging table [ {} ]'.format(table))

        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            logger.info('Error :  Staging table [ {} ]'.format(table))
            print(e)
            print(query)
            # A failed statement leaves the transaction aborted; roll back so
            # the remaining queries are not rejected as well.
            try:
                conn.rollback()
            except psycopg2.Error as rollback_error:
                print(rollback_error)


In [4]:
def insert_tables(cur, conn):
    """Run every query in ``insert_table_queries`` to populate the final tables.

    Each query is executed and committed on its own.  A failing query is
    reported (log entry, error text and the query itself) and the loop moves
    on to the next one.

    Parameters
    ----------
    cur : psycopg2 cursor
        Cursor used to execute the queries.
    conn : psycopg2 connection
        Connection that owns ``cur``; used to commit or roll back.
    """
    logger.info('Load final tables...')

    for query in insert_table_queries:
        # The table name is the 3rd word in the query string.  It is only
        # used for log messages, so fall back to a placeholder rather than
        # raising IndexError on an unexpectedly short query.
        words = re.findall(r'\w+', query)
        table = words[2] if len(words) > 2 else 'unknown'
        logger.info('insert to table [ {} ]'.format(table))

        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            logger.info('Error :  Inserting to table [ {} ]'.format(table))
            print(e)
            print(query)
            # A failed statement leaves the transaction aborted; roll back so
            # the remaining queries are not rejected as well.
            try:
                conn.rollback()
            except psycopg2.Error as rollback_error:
                print(rollback_error)


In [5]:
def count_table_rows(cur, conn):
    """Run every query in ``count_table_queries`` and log each table's row count.

    The first column of the first returned row is taken as the count.  A
    failing query is reported (log entry, error text and the query itself)
    and the loop moves on to the next one.

    Parameters
    ----------
    cur : psycopg2 cursor
        Cursor used to execute the queries and fetch their results.
    conn : psycopg2 connection
        Connection that owns ``cur``; used to commit or roll back.
    """
    logger.info('Check table counts...')

    for query in count_table_queries:
        # The table name is the last word in the query string.  It is only
        # used for log messages, so fall back to a placeholder rather than
        # raising IndexError on a query with no words at all.
        words = re.findall(r'\w+', query)
        table = words[-1] if words else 'unknown'

        try:
            cur.execute(query)
            conn.commit()
            # the query returns the row count
            result = cur.fetchall()
            rows = result[0][0]
            logger.info("table [ {} ] count:  {}".format(table, rows))

        except psycopg2.Error as e:
            logger.info('Error :  Issue counting table [ {} ]'.format(table))
            print(e)
            print(query)
            # A failed statement leaves the transaction aborted; roll back so
            # the remaining count queries are not rejected as well.
            try:
                conn.rollback()
            except psycopg2.Error as rollback_error:
                print(rollback_error)


In [6]:
def disable_result_cache(cur, conn):
    """Turn the result cache off for the current database session.

    Issues ``SET enable_result_cache_for_session TO OFF;`` and commits.  A
    failure is reported (log entry and error text) but not re-raised.

    Parameters
    ----------
    cur : psycopg2 cursor
        Cursor used to execute the statement.
    conn : psycopg2 connection
        Connection that owns ``cur``; used to commit or roll back.
    """
    try:
        cur.execute("SET enable_result_cache_for_session TO OFF;")
        conn.commit()
        logger.info('Disable cache for session')
    except psycopg2.Error as e:
        logger.info("Error :  setting result cache to OFF")
        print(e)
        # A failed statement leaves the transaction aborted; roll back so
        # later statements on this connection are not rejected.
        try:
            conn.rollback()
        except psycopg2.Error as rollback_error:
            print(rollback_error)


In [7]:
def log_config_params(config):
    """Write the three S3 source settings from ``config`` to the log.

    Parameters
    ----------
    config : mapping (e.g. ``configparser.ConfigParser``)
        Must provide an ``'S3'`` section containing the keys ``LOG_DATA``,
        ``LOG_JSONPATH`` and ``SONG_DATA``.
    """
    # Resolve all three settings up front, in order, so a missing key raises
    # before anything has been written to the log.
    log_data, log_jsonpath, song_data = (
        config['S3'][option]
        for option in ('LOG_DATA', 'LOG_JSONPATH', 'SONG_DATA')
    )

    logger.info('LOG_DATA:  {}'.format(log_data))
    logger.info('LOG_JSONPATH:  {}'.format(log_jsonpath))
    logger.info('SONG_DATA:  {}'.format(song_data))


## Main()
### Setup DB connection

In [8]:
logger.info('---[ Begin ETL ]---')
mylib.log_timestamp()
print("Logfile:  " + mylib.get_log_file_name())

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, so fail here with
# a clear message instead of a KeyError on the first section lookup.
if not config.read('dwh.cfg'):
    raise FileNotFoundError("Could not read config file 'dwh.cfg'")

log_config_params(config)

try:
    # The [CLUSTER] values are applied positionally, so dwh.cfg is expected
    # to list them in this order: host, dbname, user, password, port.
    conn_template = "host={} dbname={} user={} password={} port={}"
    cluster_values = list(config['CLUSTER'].values())
    conn_string = conn_template.format(*cluster_values)
    conn = psycopg2.connect( conn_string )
    cur = conn.cursor()

    # Echo the connection settings with the password masked, so the secret
    # does not end up in the saved notebook output.
    masked_values = list(cluster_values)
    if len(masked_values) > 3:
        masked_values[3] = '********'
    print(conn_template.format(*masked_values))
    logger.info('DB connection :  open')

except Exception as e:
    print("Error :  Could not make connection to the sparkify DB")
    print(e)
    # Nothing below can work without a connection; re-raise instead of
    # failing on the next line with a NameError for `cur`.
    raise

disable_result_cache(cur, conn)

Logfile:  ./logs/etl-20190530.log
host=dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com dbname=sparkify user=dwhuser password=******** port=5439


### Load Tables

In [9]:
# Staging tables are only loaded when the flag is set; the final tables are
# always (re)inserted and counted.
if b_staging_tables:
    load_staging_tables(cur, conn)

    # Now we can set flag to **False**, so on the next run through
    # I can then work on the dimensional tables only
    b_staging_tables = False

insert_tables(cur, conn)
count_table_rows(cur, conn)

In [10]:
# All done...
# Close the database connection opened in the setup cell and record it in the log.
conn.close()
logger.info('DB connection :  closed')