# Data Warehouse with Redshift - ETL
Use this notebook to develop the ETL process for each of your tables before completing the `etl.py` file to load the whole datasets.

>
> **Stephanie Anderton**  
> DEND Project \#3  
> May 30, 2019
>

In [1]:
import configparser
import psycopg2
import pandas as pd
import json
import time
from mylib import logger
from sql_queries import copy_table_queries, insert_table_queries
%load_ext sql

## Function Definitions

In [2]:
def load_staging_tables(cur, conn):
    i = 0
    for query in copy_table_queries:
        i += 1
        try:
            print("testing...")
            print(query)
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e: 
            logger.info('Error: Issue staging table {}'.format(i))
            print(e)
        finally:
            logger.info('load staging table {}'.format(i))


In [3]:
def insert_tables(cur, conn):
    i = 0
    for query in insert_table_queries:
        i += 1
        try:
            print("testing...")
            print(query)
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e: 
            logger.info('Error: Issue inserting to table {}'.format(i))
            print(e)
        finally:
            logger.info('insert to table {}'.format(i))


## Read Config File & Open Database Connection

In [4]:
logger.info('---[ Begin ETL ]---')
logger.info(time.strftime('%Y-%m-%d  %I:%M:%S %p'))

config = configparser.ConfigParser()
config.read('dwh.cfg')

HOST         = config['CLUSTER']['HOST']
DB_NAME      = config['CLUSTER']['DB_NAME']
DB_USER      = config['CLUSTER']['DB_USER']
DB_PASSWORD  = config['CLUSTER']['DB_PASSWORD']
DB_PORT      = config['CLUSTER']['DB_PORT']

ARN          = config['IAM_ROLE']['ARN']

LOG_DATA     = config['S3']['LOG_DATA']
LOG_JSONPATH = config['S3']['LOG_JSONPATH']
SONG_DATA    = config['S3']['SONG_DATA']

logger.info('LOG_DATA:  {}'.format(LOG_DATA))
logger.info('LOG_JSONPATH:  {}'.format(LOG_JSONPATH))
logger.info('SONG_DATA:  {}'.format(SONG_DATA))

In [5]:
try:
    conn_string = "host={} dbname={} user={} password={} port={}"
    conn_string = conn_string.format(*config['CLUSTER'].values())
    conn = psycopg2.connect( conn_string )
    cur = conn.cursor()
    print(conn_string)

except Exception as e:
    print("Error: Could not make connection to the sparkify DWH")
    print(e)

host=dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com dbname=sparkify user=dwhuser password=Passw0rd port=5439


## Disable Cache

In [6]:
try:
    cur.execute("SET enable_result_cache_for_session TO OFF;")
    conn.commit()
except psycopg2.Error as e: 
    print("Error: setting cache to OFF")
    print(e)
finally:
    logger.info('Disable cache for session')

## Load Staging Tables

In [7]:
load_staging_tables(cur, conn)

testing...

    COPY     staging_songs
    FROM     's3://udacity-dend/song_data/A/B/C'
    IAM_ROLE 'arn:aws:iam::376450510082:role/dwhRole'
    JSON     'auto'



## Insert to Final Tables

In [None]:
insert_tables(cur, conn)

## Check Table Counts

In [8]:
conn_string_2 = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, 
                                                     HOST, DB_PORT, DB_NAME)
print(conn_string_2)
%sql $conn_string_2

postgresql://dwhuser:Passw0rd@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify


'Connected: dwhuser@sparkify'

In [9]:
%sql SELECT COUNT(*) AS staging_events FROM staging_events;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


staging_events
0


In [10]:
%sql SELECT COUNT(*) AS staging_songs  FROM staging_songs;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


staging_songs
23


In [12]:
%%sql
SELECT * FROM staging_songs
WHERE  song_id IS NOT NULL
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


song_id,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,title,duration,year
SODREIN12A58A7F2E5,ARLTWXK1187FB5A3F8,32.74863,-97.32925,"Fort Worth, TX",King Curtis,A Whiter Shade Of Pale (Live @ Fillmore West),326.00771,0
SOMAPYF12A6D4FEC3E,AR5S9OB1187B9931E3,34.05349,-118.24532,"Los Angeles, CA",Bullet Boys,All Day & All Of The Night,156.62975,0
SOEVPWF12A58A7D254,ARCJWLU1187B9ADD36,,,,Theory In Practice,Astral eyes,274.54649,1997
SODWBIK12AB017F87D,ARSMG8X1187B99CA99,,,,Macaco,Aüita,210.1024,2009
SOGHHXH12A8C13EBCB,ARCWVUK1187FB3C71A,,,,Brigitte Bardot,C'Est Une Bossa Nova,130.2722,1993
SOUPIRU12A6D4FA1E1,ARJIE2Y1187B994AB7,,,,Line Renaud,Der Kleine Dompfaff,152.92036,0
SOIKLJM12A8C136355,AR7AE0W1187B98E40E,,,,Intocable,Eso Duele,196.25751,2003
SOCEMJV12A6D4F7667,ARIOZCU1187FB3A3DC,,,"Hamlet, NC",JOHN COLTRANE,Giant Steps (Alternate Version_ Take 5_ Alternate),220.44689,0
SORRZGD12A6310DBC3,ARVBRGZ1187FB4675A,,,,Gwen Stefani,Harajuku Girls,290.55955,2004
SOTDCIR12AB0184574,ARZGTK71187B9AC7F5,,,"California, USA",Eels,I Need A Mother,158.01424,2010


In [None]:
%sql SELECT COUNT(*) AS songplays      FROM songplays;

In [None]:
%sql SELECT COUNT(*) AS users          FROM users;

In [None]:
%sql SELECT COUNT(*) AS songs          FROM songs;

In [None]:
%sql SELECT COUNT(*) AS artists        FROM artists;

In [None]:
%sql SELECT COUNT(*) AS time           FROM time;

In [None]:
%%sql

SELECT TIMESTAMP 'epoch' + 1541122241796/1000 * INTERVAL '1 second' AS date

## Close Database Connection

In [None]:
conn.close()