# Data Warehouse with Redshift - ETL
Use this notebook to develop the ETL process for each of your tables before completing the `etl.py` file to load the whole datasets.

>
> **Stephanie Anderton**  
> DEND Project \#3  
> May 30, 2019
>

In [1]:
import configparser
import psycopg2
import pandas as pd
import json
import time

from sql_queries import copy_table_queries, insert_table_queries
%load_ext sql

## Function Definitions

In [2]:
def load_staging_tables(cur, conn):
    for query in copy_table_queries:
        try:
            print("testing...")
            print(query)
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e: 
            print("Error: Issue staging table")
            print(e)


In [3]:
def insert_tables(cur, conn):
    for query in insert_table_queries:
        try:
            print("testing...")
            print(query)
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e: 
            print("Error: Issue inserting to table")
            print(e)


## Main()
### Read Config File & Open Database Connection

In [4]:
config = configparser.ConfigParser()
config.read('dwh.cfg')

HOST         = config['CLUSTER']['HOST']
DB_NAME      = config['CLUSTER']['DB_NAME']
DB_USER      = config['CLUSTER']['DB_USER']
DB_PASSWORD  = config['CLUSTER']['DB_PASSWORD']
DB_PORT      = config['CLUSTER']['DB_PORT']

ARN          = config['IAM_ROLE']['ARN']
print(ARN)

'arn:aws:iam::376450510082:role/dwhRole'


In [5]:
try:
    conn_string = "host={} dbname={} user={} password={} port={}"
    conn_string = conn_string.format(*config['CLUSTER'].values())
    conn = psycopg2.connect( conn_string )
    cur = conn.cursor()
    print(conn_string)

except Exception as e:
    print("Error: Could not make connection to the sparkify DWH")
    print(e)

host=dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com dbname=sparkify user=dwhuser password=Passw0rd port=5439


### Disable Cache

In [6]:
try:
    cur.execute("SET enable_result_cache_for_session TO OFF;")
    conn.commit()
except psycopg2.Error as e: 
    print("Error: setting cache to OFF")
    print(e)

## Load Staging Tables

In [None]:
load_staging_tables(cur, conn)

## Insert to Final Tables

In [10]:
insert_tables(cur, conn)

testing...

    INSERT INTO songs (
        s_song_id, s_title, s_artist_id, s_year, s_duration
    )
    SELECT  DISTINCT stg_s_song_id,
            stg_s_title, stg_s_artist_id, stg_s_year, stg_s_duration
    FROM    staging_songs

Error: Issue inserting to table
Cannot insert a NULL value into column s_song_id
DETAIL:  
  -----------------------------------------------
  error:  Cannot insert a NULL value into column s_song_id
  code:      8007
  context:   query execution
  query:     1176
  location:  column:1
  process:   query0_126_1176 [pid=25588]
  -----------------------------------------------


testing...

    INSERT INTO artists (
        a_artist_id, a_name, a_location, a_latitude, a_longitude
    )
    SELECT  DISTINCT stg_s_artist_id,
            stg_s_artist_name, stg_s_artist_location,
            stg_s_artist_latitude, stg_s_artist_longitude
    FROM    staging_songs

Error: Issue inserting to table
current transaction is aborted, commands ignored until end of transa

## Check Table Counts

In [8]:
conn_string_2 = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, 
                                                     HOST, DB_PORT, DB_NAME)
print(conn_string_2)
%sql $conn_string_2

postgresql://dwhuser:Passw0rd@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify


'Connected: dwhuser@sparkify'

In [None]:
%%sql
SELECT COUNT(*) AS staging_events FROM staging_events;

In [9]:
%%sql
SELECT COUNT(*) AS staging_songs  FROM staging_songs;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


staging_songs
604


In [None]:
%%sql
SELECT COUNT(*) AS songplays      FROM songplays;

In [None]:
%%sql
SELECT COUNT(*) AS users          FROM users;

In [11]:
%%sql
SELECT COUNT(*) AS songs          FROM songs;

 * postgresql://dwhuser:***@dwhcluster.cbsjbxldkge8.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


songs
0


In [None]:
%%sql
SELECT COUNT(*) AS artists        FROM artists;

In [None]:
%%sql
SELECT COUNT(*) AS time           FROM time;

## Close Database Connection

In [12]:
conn.close()