# Data Warehouse with Redshift - ETL
Use this notebook to develop and verify the ETL process for each of your tables before completing the `etl.py` file, which loads the full datasets.

>
> **Stephanie Anderton**  
> DEND Project \#3  
> May 30, 2019
>

In [1]:
import configparser
import json
import time

import pandas as pd
import psycopg2

import mylib
from mylib import logger
from sql_queries import (copy_table_queries, count_table_queries,
                         insert_table_queries)
%load_ext sql

## Function Definitions

In [2]:
def load_staging_tables(cur, conn):
    """Execute each query in ``copy_table_queries``, committing after each one.

    Args:
        cur: Open database cursor used to execute the queries.
        conn: Connection that owns ``cur``; committed after every successful
            query and rolled back after a failed one.

    A ``psycopg2.Error`` raised by a query is logged and printed, and the loop
    then moves on to the next query (best-effort; nothing is re-raised).
    """
    for i, query in enumerate(copy_table_queries, start=1):
        try:
            logger.info('load staging table {}'.format(i))
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            logger.info('Error: Issue staging table {}'.format(i))
            print(e)
            # A failed statement leaves the transaction aborted; until it is
            # rolled back, every later execute() on this connection is rejected.
            try:
                conn.rollback()
            except psycopg2.Error as rollback_error:
                # Keep the original best-effort behaviour even if the
                # connection itself is unusable.
                print(rollback_error)


In [3]:
def insert_tables(cur, conn):
    """Execute each query in ``insert_table_queries``, committing after each one.

    Args:
        cur: Open database cursor used to execute the queries.
        conn: Connection that owns ``cur``; committed after every successful
            query and rolled back after a failed one.

    A ``psycopg2.Error`` raised by a query is logged, the error and the
    offending query text are printed, and the loop continues with the next
    query (best-effort; nothing is re-raised).
    """
    for i, query in enumerate(insert_table_queries, start=1):
        try:
            logger.info('insert to table {}'.format(i))
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            logger.info('Error: Issue inserting to table {}'.format(i))
            print(e)
            print(query)
            # A failed statement leaves the transaction aborted; until it is
            # rolled back, every later execute() on this connection is rejected.
            try:
                conn.rollback()
            except psycopg2.Error as rollback_error:
                # Keep the original best-effort behaviour even if the
                # connection itself is unusable.
                print(rollback_error)


In [4]:
def count_table_rows(cur, conn):
    """Execute each query in ``count_table_queries`` and log the value returned.

    For every query the first column of the first result row is logged as the
    count for that table (tables are identified by their 1-based position in
    ``count_table_queries``).

    Args:
        cur: Open database cursor used to execute the queries.
        conn: Connection that owns ``cur``; committed after every successful
            query and rolled back after a failed one.

    A ``psycopg2.Error`` raised by a query is logged and printed, and the loop
    continues with the next query (best-effort; nothing is re-raised).
    """
    for i, query in enumerate(count_table_queries, start=1):
        try:
            cur.execute(query)
            conn.commit()
            result = cur.fetchall()
            rows = result[0][0]
            logger.info('table {} count:  {}'.format(i, rows))
        except psycopg2.Error as e:
            logger.info('Error: Issue counting table {}'.format(i))
            print(e)
            # A failed statement leaves the transaction aborted; until it is
            # rolled back, every later execute() on this connection is rejected.
            try:
                conn.rollback()
            except psycopg2.Error as rollback_error:
                # Keep the original best-effort behaviour even if the
                # connection itself is unusable.
                print(rollback_error)


## Read Config File

In [5]:
logger.info('---[ Begin ETL ]---')
logger.info(time.strftime('%Y-%m-%d  %I:%M:%S %p'))

# Load settings from dwh.cfg. read_file() on an opened file is used instead of
# read(), because read() silently skips a missing file and the problem would
# only surface below as a KeyError on the first section lookup.
config = configparser.ConfigParser()
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# [CLUSTER] section: connection settings for the database.
HOST         = config['CLUSTER']['HOST']
DB_NAME      = config['CLUSTER']['DB_NAME']
DB_USER      = config['CLUSTER']['DB_USER']
DB_PASSWORD  = config['CLUSTER']['DB_PASSWORD']
DB_PORT      = config['CLUSTER']['DB_PORT']

# [IAM_ROLE] section.
ARN          = config['IAM_ROLE']['ARN']

# [S3] section: source data locations (logged below for the run record).
LOG_DATA     = config['S3']['LOG_DATA']
LOG_JSONPATH = config['S3']['LOG_JSONPATH']
SONG_DATA    = config['S3']['SONG_DATA']

logger.info('LOG_DATA:  {}'.format(LOG_DATA))
logger.info('LOG_JSONPATH:  {}'.format(LOG_JSONPATH))
logger.info('SONG_DATA:  {}'.format(SONG_DATA))

## Open Database Connection

In [6]:
try:
    # Fill the DSN from the named settings instead of
    # config['CLUSTER'].values(), whose order depends on how dwh.cfg is laid out.
    dsn_template = "host={} dbname={} user={} password={} port={}"
    conn_string = dsn_template.format(HOST, DB_NAME, DB_USER, DB_PASSWORD, DB_PORT)
    conn = psycopg2.connect( conn_string )
    cur = conn.cursor()
    # Show where we connected without writing the password into the cell output.
    print(dsn_template.format(HOST, DB_NAME, DB_USER, '***', DB_PORT))

except Exception as e:
    print("Error: Could not make connection to the sparkify DWH")
    print(e)
finally:
    # Logged whether or not the connection attempt succeeded.
    logger.info('Open database connection')

host=dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com dbname=sparkify user=dwhuser password=*** port=5439


## Disable Cache

In [7]:
# Turn off the result cache for this session.
try:
    cur.execute("SET enable_result_cache_for_session TO OFF;")
    conn.commit()
except psycopg2.Error as e: 
    print("Error: setting cache to OFF")
    print(e)
    # A failed statement leaves the transaction aborted; roll back so the
    # load cells that follow can still run on this connection.
    try:
        conn.rollback()
    except psycopg2.Error as rollback_error:
        print(rollback_error)
finally:
    # Logged whether or not the SET statement succeeded.
    logger.info('Disable cache for session')

## Load Staging Tables

In [8]:
# Execute every query in copy_table_queries through load_staging_tables,
# using the cursor and connection opened earlier in the notebook.
logger.info('Load staging tables...')
load_staging_tables(cur, conn)

## Insert to Final Tables

In [9]:
# Execute every query in insert_table_queries through insert_tables,
# using the cursor and connection opened earlier in the notebook.
logger.info('Load final tables...')
insert_tables(cur, conn)

## Check Table Counts

In [10]:
conn_string_2 = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, 
                                                     HOST, DB_PORT, DB_NAME)
print(conn_string_2)
%sql $conn_string_2

postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify


'Connected: dwhuser@sparkify'

In [11]:
# count_table_queries is imported in the notebook's top import cell, so the
# count_table_rows helper has its dependency available from a fresh kernel.
logger.info('Check table counts...')
count_table_rows(cur, conn)

In [12]:
%%sql
-- Spot check: show up to 10 rows of staging_events (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM staging_events
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


event_key,artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
45,Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796.0,139,Quem Quiser Encontrar O Amor,200,1541106496796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36""",8
109,M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541016707796.0,169,Mango Pickle Down River (With The Wilcannia Mob),200,1541109325796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",26
31,Tracy Gang Pussy,Logged In,Stefany,F,2,White,221.33506,free,"Lubbock, TX",PUT,NextSong,1540708070796.0,82,I Have A Wish,200,1541122457796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",83
95,The Decemberists,Logged In,Lily,F,1,Koch,242.59873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541048010796.0,172,Everything I Try to Do_ Nothing Seems to Turn Out Right,200,1541149456796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",15
159,Tiziano Ferro,Logged In,Lily,F,7,Koch,251.42812,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541048010796.0,172,Ed Ero Contentissimo,200,1541150809796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",15
223,Marques Houston,Logged In,Lily,F,15,Koch,264.98567,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541048010796.0,172,Naked,200,1541152676796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",15
287,She & Him,Logged In,Lily,F,23,Koch,167.83628,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541048010796.0,172,Got Me,200,1541154085796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",15
351,Avantasia,Logged In,Lily,F,31,Koch,368.97914,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541048010796.0,172,Shelter From The Rain,200,1541155991796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",15
415,Calle 13 Featuring CafÃÂ© Tacuba,Logged In,Lily,F,37,Koch,293.32853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541048010796.0,172,No Hay Nadie Como TÃÂº,200,1541157637796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",15
479,Washed Out,Logged In,Tegan,F,3,Levine,168.6722,free,"Portland-South Portland, ME",PUT,NextSong,1540794356796.0,165,New Theory,200,1541158225796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",80


In [13]:
%%sql
-- Spot check: show up to 10 rows of staging_songs (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM staging_songs
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


song_id,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,title,duration,year
SOUCGZB12AB01827CB,ARSQDRW1187FB38AE6,,,,volcano!,$40_000 Plus Interest,410.33098,2005
SOBGTOG12AB0183D03,ARYN6H41187B98AA0F,32.77815,-96.7954,"Dallas, TX",Michael Martin Murphey,'Twas In The Moon Of Wintertime,148.79302,0
SOCYHQT12A58A79F35,ARTM8L21187B99F8F9,53.4796,-2.24881,"Manchester, England",Any Trouble,(Get You Off) The Hook,181.89016,0
SOBUFHA12A6702065F,ARP29T31187B98DD5F,37.80506,-122.27302,"Oakland, CA",Keyshia Cole,(I Just Want It) To Be Over,242.28526,2005
SONQVJV12A8C132754,ARKTJAT1187B9B3D8A,,,,The Nectarine No. 9,(Sic),223.03302,0
SOFDEJA12A8C13A055,ARI5E6U1187B9B525A,,,"Chillicothe, OH",Nancy Wilson,(You Don't Know) How Glad I Am (2001 Digital Remaster),159.60771,0
SOCYLNE12A81C223ED,ARCACMY11C8A42C870,,,,John Travolta / Christopher Walken,"(You're) Timeless To Me (""Hairspray"")",287.16363,0
SOBUGJO12A81C20840,ARZPDAD1187B98D940,,,,Fattburger,100 Ways,293.09342,2004
SOZTRCD12AB0185F85,ARMB9SG1187B98DA02,,,,The Flaming Sideburns,13 Women,220.26404,2006
SOFFQQJ12AAF3B285D,ARN092T1187B9928A5,,,,Neon Heights featuring Zed J,16 Again,450.87302,2001


In [14]:
%%sql
-- Spot check: show up to 10 rows of songplays (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM songplays
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


sp_songplay_id,sp_start_time,sp_user_id,sp_level,sp_song_id,sp_artist_id,sp_session_id,sp_location,sp_user_agent
11,2018-11-05 17:00:27,73,paid,SOBANHD12A58A7BB7C,ARSUFX91187FB3B73E,255,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
44,2018-11-06 20:12:11,97,paid,SODCQYZ12A6D4F9B26,ARYJ7KN1187B98CC73,293,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
453,2018-11-07 15:41:10,15,paid,SOWEUOO12A6D4F6D0C,ARQUMH41187B9AF699,221,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
351,2018-11-13 17:28:33,97,paid,SOIBHYW12AB0188F49,ARWNARC122BCFCAFEB,537,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
86,2018-11-14 06:19:41,80,paid,SOACRBY12AB017C757,ARVGCRM11F50C496F4,548,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
104,2018-11-14 15:47:47,80,paid,SOHOVIP12A6D4F9267,ARQQ5B61187B9B4F61,574,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
437,2018-11-14 23:38:06,49,paid,SOWEUOO12A6D4F6D0C,ARQUMH41187B9AF699,576,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
121,2018-11-15 07:31:55,49,paid,SOCUITT12AB0187A32,ARKS2FE1187B99325D,606,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
48,2018-11-18 19:24:51,29,paid,SOCOWCL12A8C1415F7,AR8K3HD1187B9B9CA9,589,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
42,2018-11-19 17:41:51,52,free,SOFPDCU12A6D4FD0DC,AR8P6CT1187FB458AB,601,"Houston-The Woodlands-Sugar Land, TX",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0


In [15]:
%%sql
-- Spot check: show up to 10 rows of users (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM users
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


u_user_id,u_first_name,u_last_name,u_gender,u_level
2,Jizelle,Benjamin,F,free
3,Isaac,Valdez,M,free
4,Alivia,Terrell,F,free
5,Elijah,Davis,M,free
6,Cecilia,Owens,F,free
7,Adelyn,Jordan,F,free
8,Kaylee,Summers,F,free
9,Wyatt,Scott,M,free
10,Sylvie,Cruz,F,free
11,Christian,Porter,F,free


In [16]:
%%sql
-- Spot check: show up to 10 rows of songs (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM songs
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


s_song_id,s_title,s_artist_id,s_year,s_duration
SOAACFC12A8C140567,Supernatural Pt. II,ARNHTE41187B99289A,0,343.09179
SOAADJH12AB018BD30,Black Light (Album Version),AR3FKJ61187B990357,1975,385.90649
SOAAMWQ12A8C144DF1,Happy Nation,AR2IKF71187FB4D0C2,1992,255.08526
SOABVWY12AB018274B,Honeysuckle Dog,ARAT5YT1187B9AE90B,1993,245.99465
SOACBGF12AC9097E79,O'Malley & Delacey,ARQFV881187FB3C24C,0,220.02893
SOACMKO12A8C1448C1,Border Girl,ARCLPWN11F4C83DF42,0,232.88118
SOADFRL12AB01858A6,Techno Powers,ARR3Y511187B9B2932,1997,514.2722
SOADFSR12AB018C17E,Gunman,ARVG8YI1187FB4CBA7,1990,233.40363
SOADMFM12A8C131911,Hard As Steel,AR3K1T51187FB4042A,0,309.4722
SOADZEU12AB018AAFD,This Must Be Love,ARQ35ML1187B99BB8D,0,168.64608


In [17]:
%%sql
-- Spot check: show up to 10 rows of artists (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM artists
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


a_artist_id,a_name,a_location,a_latitude,a_longitude
AR00MQ31187B9ACD8F,Chris Carrier,,,
AR039B11187B9B30D0,John Williams,"NEW YORK, New York",,
AR048JZ1187B9AEB85,Yellowcard,"Jacksonville, FL",30.33138,-81.6558
AR04S8J1187FB48358,Clifford Brown / Max Roach Quintet,"Wilmington, DE",39.74023,-75.55084
AR04S8J1187FB48358,Clifford Brown,"Wilmington, DE",39.74023,-75.55084
AR050VJ1187B9B13A7,Dead Kennedys,,,
AR05OG01187B9B8A98,Todd Rundgren,"Upper Darby, PA",,
AR06RZX1187B9B9864,Zélia Duncan,,,
AR0737R1187B99F0E2,Julee Cruise,"Creston, IA",,
AR086UB1187B994281,Czech Philharmonic Orchestra,,,


In [18]:
%%sql
-- Spot check: show up to 10 rows of time (no ORDER BY, so which rows appear is arbitrary).
SELECT * FROM time
LIMIT  10;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
10 rows affected.


t_start_time,t_hour,t_day,t_week,t_month,t_year,t_weekday
2018-11-01 21:52:05,21,1,44,11,2018,4
2018-11-02 05:52:29,5,2,44,11,2018,5
2018-11-02 09:22:43,9,2,44,11,2018,5
2018-11-02 10:13:59,10,2,44,11,2018,5
2018-11-02 10:53:11,10,2,44,11,2018,5
2018-11-02 11:04:31,11,2,44,11,2018,5
2018-11-02 11:56:43,11,2,44,11,2018,5
2018-11-02 12:14:55,12,2,44,11,2018,5
2018-11-02 12:19:04,12,2,44,11,2018,5
2018-11-02 12:39:30,12,2,44,11,2018,5


## Close Database Connection

In [19]:
# Close the psycopg2 connection opened in the "Open Database Connection" cell.
conn.close()