In [1]:
import configparser
import psycopg2
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import boto3
import json
import sql

In [2]:
# CONFIG
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

DB_NAME                 = config.get("CLUSTER","DB_NAME")
DB_USER                 = config.get("CLUSTER","DB_USER")
DB_PASSWORD             = config.get("CLUSTER","DB_PASSWORD")
DB_PORT                 = config.get("CLUSTER","DB_PORT")
HOST                    =config.get("CLUSTER", "HOST")

pd.DataFrame({"Param":
                  ["DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT"],
              "Value":
                  [DB_NAME, DB_USER, DB_PASSWORD, DB_PORT]
             })

Unnamed: 0,Param,Value
0,DB_NAME,dwh
1,DB_USER,dwhuser
2,DB_PASSWORD,Passw0rd
3,DB_PORT,5439


### Creating IAM User, EC2, S3 and Redshift

In [3]:
ec2 = boto3.resource('ec2',
                       region_name="us-west-2"
                    )

s3 = boto3.resource('s3',
                       region_name="us-west-2"
                   )

iam = boto3.client('iam', 
                     region_name='us-west-2'
                  )

redshift = boto3.client('redshift',
                       region_name="us-west-2"
                       )

#### Getting a sample song data

In [4]:
# S3 path
# s3://udacity-dend/log_json_path.json

file = s3.Object("udacity-dend", 'log_json_path.json')
content = json.loads(file.get()['Body'].read())
print(json.dumps(content, indent=4, sort_keys=True))


{
    "jsonpaths": [
        "$['artist']",
        "$['auth']",
        "$['firstName']",
        "$['gender']",
        "$['itemInSession']",
        "$['lastName']",
        "$['length']",
        "$['level']",
        "$['location']",
        "$['method']",
        "$['page']",
        "$['registration']",
        "$['sessionId']",
        "$['song']",
        "$['status']",
        "$['ts']",
        "$['userAgent']",
        "$['userId']"
    ]
}


#### Getting s3 objects using S3 Resource

In [5]:
obj_col = s3.Bucket("udacity-dend").objects.filter(Prefix = 'song_data/')

# print(list(obj_col)[0])
for i, obj in enumerate(obj_col):
    print(obj.key)
    if i > 10:
        break


song_data/
song_data/A/A/A/TRAAAAK128F9318786.json
song_data/A/A/A/TRAAAAV128F421A322.json
song_data/A/A/A/TRAAABD128F429CF47.json
song_data/A/A/A/TRAAACN128F9355673.json
song_data/A/A/A/TRAAAEA128F935A30D.json
song_data/A/A/A/TRAAAED128E0783FAB.json
song_data/A/A/A/TRAAAEM128F93347B9.json
song_data/A/A/A/TRAAAEW128F42930C0.json
song_data/A/A/A/TRAAAFD128F92F423A.json
song_data/A/A/A/TRAAAGR128F425B14B.json
song_data/A/A/A/TRAAAHD128F42635A5.json


Example of getting list of objects paths in S3 using Client (just another way of doing it)

In [6]:
s3_client = boto3.client('s3',
                       region_name="us-west-2"
                   )

lst_contents = s3_client.list_objects(Bucket="udacity-dend", Prefix='song_data')['Contents']

print("list length: {}".format(len(lst_contents)))

print(lst_contents[0])
print(lst_contents[1])

list length: 1000
{'Key': 'song_data/', 'LastModified': datetime.datetime(2019, 4, 17, 3, 12, 52, tzinfo=tzutc()), 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"', 'Size': 0, 'StorageClass': 'STANDARD'}
{'Key': 'song_data/A/A/A/TRAAAAK128F9318786.json', 'LastModified': datetime.datetime(2019, 4, 17, 3, 20, 41, tzinfo=tzutc()), 'ETag': '"48e073986610c4997c26f8a394b8fd6e"', 'Size': 225, 'StorageClass': 'STANDARD'}


##### Extracting the content from 1 file in S3 bucket as a sample data

Extracting song_data sample

In [15]:
# files = s3.Bucket("udacity-dend").objects.filter(Prefix = 'song_data')

# getting a sample data from the original dataset
file = s3.Object("udacity-dend", 'song_data/A/A/A/TRAAAAV128F421A322.json')

# printing in a pretty JSON format
content = json.loads(file.get()['Body'].read())
print(json.dumps(content, indent=4))

{
    "artist_id": "AR73AIO1187B9AD57B",
    "artist_latitude": 37.77916,
    "artist_location": "San Francisco, CA",
    "artist_longitude": -122.42005,
    "artist_name": "Western Addiction",
    "duration": 118.07302,
    "num_songs": 1,
    "song_id": "SOQPWCR12A6D4FB2A3",
    "title": "A Poor Recipe For Civic Cohesion",
    "year": 2005
}


Extracting song_data sample

In [9]:
obj_col = s3.Bucket("udacity-dend").objects.filter(Prefix = 'log_data/')

# print(list(obj_col)[0])
for i, obj in enumerate(obj_col):
    print(obj.key)
    if i > 10:
        break

log_data/
log_data/2018/11/2018-11-01-events.json
log_data/2018/11/2018-11-02-events.json
log_data/2018/11/2018-11-03-events.json
log_data/2018/11/2018-11-04-events.json
log_data/2018/11/2018-11-05-events.json
log_data/2018/11/2018-11-06-events.json
log_data/2018/11/2018-11-07-events.json
log_data/2018/11/2018-11-08-events.json
log_data/2018/11/2018-11-09-events.json
log_data/2018/11/2018-11-10-events.json
log_data/2018/11/2018-11-11-events.json


In [18]:
obj_log = s3.Object("udacity-dend", 'log_data/2018/11/2018-11-03-events.json')

content_log = obj_log.get()['Body'].read().decode("utf-8")

df_log = pd.read_json(path_or_buf=content_log, orient='records', lines=True)
df_log[df_log.page == 'NextSong'].head(10)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
2,Mynt,Logged In,Celeste,F,2,Williams,166.94812,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,Playa Haters,200,1541207150796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53
3,Taylor Swift,Logged In,Celeste,F,3,Williams,230.47791,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,You Belong With Me,200,1541207316796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53
4,Amy Winehouse,Logged In,Celeste,F,4,Williams,229.85098,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,Valerie,200,1541207546796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53
5,Jimmy Eat World,Logged In,Celeste,F,5,Williams,285.83138,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,Dizzy,200,1541207775796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53
7,Maldita Nerea,Logged In,Anabelle,F,0,Simpson,241.162,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1541044000000.0,158,Supelicula,200,1541254670796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",69
8,Fluke,Logged In,Connar,M,0,Moreno,478.92853,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1540824000000.0,168,Bermuda,200,1541257880796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10)...",62
9,Habib KoitÃÂ©,Logged In,Jayden,M,0,Fox,285.1522,free,"New Orleans-Metairie, LA",PUT,NextSong,1541034000000.0,185,Din Din Wo,200,1541259368796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",101
10,The Kooks,Logged In,Sara,F,0,Johnson,132.25751,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,152,Eddie's Gun,200,1541260356796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
11,Blues Traveler,Logged In,Sara,F,1,Johnson,290.24608,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,152,Hook,200,1541260488796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95
12,Coldplay,Logged In,Sara,F,2,Johnson,298.762,paid,"Winston-Salem, NC",PUT,NextSong,1540809000000.0,152,Shiver,200,1541260778796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",95


In [11]:
# original
print("Original Extraction")
print("Obj type (Bytes): {0}".format(type(obj_log.get()['Body'].read())))
print("Obj type (Converted to string): {0}".format(type(obj_log.get()['Body'].read().decode("utf-8"))))

# print("Sample")
print("Sample string \n {0}".format(obj_log.get()['Body'].read().decode("utf-8")[:1000])) # print the first 1000 characters

print()
print("Converted Pandas Dataframe")
df_log.head(5)

Original Extraction
Obj type (Bytes): <class 'bytes'>
Obj type (Converted to string): <class 'str'>
Sample string 
 {"artist":null,"auth":"Logged Out","firstName":null,"gender":null,"itemInSession":0,"lastName":null,"length":null,"level":"free","location":null,"method":"PUT","page":"Login","registration":null,"sessionId":52,"song":null,"status":307,"ts":1541207073796,"userAgent":null,"userId":""}
{"artist":null,"auth":"Logged In","firstName":"Celeste","gender":"F","itemInSession":1,"lastName":"Williams","length":null,"level":"free","location":"Klamath Falls, OR","method":"GET","page":"Home","registration":1541077528796.0,"sessionId":52,"song":null,"status":200,"ts":1541207123796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/37.0.2062.103 Safari\/537.36\"","userId":"53"}
{"artist":"Mynt","auth":"Logged In","firstName":"Celeste","gender":"F","itemInSession":2,"lastName":"Williams","length":166.94812,"level":"free","location":"Klamath 

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged Out,,,0,,,free,,PUT,Login,,52,,307,1541207073796,,
1,,Logged In,Celeste,F,1,Williams,,free,"Klamath Falls, OR",GET,Home,1541078000000.0,52,,200,1541207123796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53.0
2,Mynt,Logged In,Celeste,F,2,Williams,166.94812,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,Playa Haters,200,1541207150796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53.0
3,Taylor Swift,Logged In,Celeste,F,3,Williams,230.47791,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,You Belong With Me,200,1541207316796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53.0
4,Amy Winehouse,Logged In,Celeste,F,4,Williams,229.85098,free,"Klamath Falls, OR",PUT,NextSong,1541078000000.0,52,Valerie,200,1541207546796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",53.0


### Connect to Redshift Cluster

In [14]:
%load_ext sql

The sql module is not an IPython extension.


In [12]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT,DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.csmamz5zxmle.us-west-2.redshift.amazonaws.com:5439/dwh


UsageError: Line magic function `%sql` not found.


### Create tables