# AWS Redshift Schema Pipeline & ETL Pipeline

In [1]:
# Make the sibling ``etl`` directory importable from this notebook.
import os
import sys

# abspath('') resolves to the current working directory; abspath also
# normalises the result, so the '..' segment is collapsed here.
notebook_dir = os.path.abspath('')
etl_path = os.path.abspath(os.path.join(notebook_dir, os.path.pardir, 'etl'))

# Append only once so re-running the cell does not grow sys.path.
if etl_path not in sys.path:
    sys.path.append(etl_path)

In [2]:
# ETL libs
from connection import Connection
from create_tables import SchemaPipeline
from etl import ETLPipeline

## 1. Initialize Connection and Pipeline Objects

- Create the AWS Redshift connection instance.
- Create the schema pipeline executor.
- Create the ETL pipeline executor.

In [3]:
# A single connection object is shared by both pipelines below.
# NOTE(review): redshift=True presumably selects the AWS Redshift backend — confirm in connection.py.
connection = Connection(redshift=True)
# Schema pipeline executor (its run() drops and recreates the tables).
schema_pipeline = SchemaPipeline(connection)
# ETL pipeline executor (its run() loads the staging tables and fills the DW tables).
etl_pipeline = ETLPipeline(connection)

## 2. Run the Schema Pipeline

- Drop all tables.
- Create staging tables.
- Create Data Warehouse tables.

In [4]:
# Destructive: drops the existing database tables before recreating the schema,
# so any data already loaded is lost.
schema_pipeline.run()

-----------------------------------------------------
AWS Redshift Schema Pipeline
-----------------------------------------------------
INFO: Droping the database tables...
INFO: Database tables droped.
INFO: Creating the database schema...
INFO: Database schema created.
-----------------------------------------------------
Time Statistics
-----------------------------------------------------
Drop tables time: 7.42 seconds
Create tables time: 19.34 seconds


## 2. Run the ETL Pipeline

- Copy JSON data from S3 buckets to staging tables.
- Select and transform data from staging tables.
- Insert transformed data into Data Warehouse tables.

In [5]:
# Loads the S3 data into the staging tables, then inserts into the DW tables.
# The staging load dominates the runtime (about 895 s vs 8 s in the recorded run).
etl_pipeline.run()

-----------------------------------------------------
AWS Redshift ETL Pipeline
-----------------------------------------------------
INFO: Loading S3 data into staging tables...
INFO: Staging tables loaded.
INFO: Inserting data into DW tables...
INFO: DW tables loaded.
-----------------------------------------------------
Time Statistics
-----------------------------------------------------
Staging tables time: 895.05 seconds
Insert tables time: 7.77 seconds


## 3. Fetch Data from All Tables to Validate the Process

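Define the ```fetch_dataframe``` function, which queries every row of a table, exports the result to a CSV file (by default under ```../../data```), and returns the data so it can be previewed in the notebook.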

In [6]:
def fetch_dataframe(table, folder='../../data'):
    """Fetch every row of a table and export it to ``<folder>/<table>.csv``.

    Runs the query on the notebook-level ``connection`` object and commits
    afterwards.

    Args:
        table: Name of the table to read. It is interpolated directly into
            the SQL text, so only pass trusted, hard-coded table names.
        folder: Directory that receives the CSV export. It is created on
            demand when it does not exist yet.

    Returns:
        The object returned by ``connection.cursor.fetch_dataframe()``
        (presumably a pandas DataFrame, given the ``to_csv`` call).
    """
    # query table
    connection.cursor.execute(f'SELECT * FROM {table}')
    # fetch data frame
    data = connection.cursor.fetch_dataframe()
    # commit
    connection.commit()
    # make sure the export directory exists: to_csv does not create missing
    # directories (an empty folder string is left untouched, as makedirs('') fails)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # export CSV data
    data.to_csv(f'{folder}/{table}.csv')
    # return data frame
    return data

### 3.1 Fetch ```staging_events``` table

In [7]:
# Side effect: also writes ../../data/staging_events.csv (fetch_dataframe's default folder).
staging_events = fetch_dataframe('staging_events')
# Preview only the first two rows rather than rendering the whole table.
staging_events.head(2)

Unnamed: 0,userid,firstname,lastname,gender,level,artist,song,length,sessionid,auth,iteminsession,location,registration,ts,page,useragent,status,method
0,39,Walter,Frye,M,free,,,,38,Logged In,0,"San Francisco-Oakland-Hayward, CA",1540919166796,1541105830796,Home,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",200,GET
1,8,Kaylee,Summers,F,free,,,,139,Logged In,0,"Phoenix-Mesa-Scottsdale, AZ",1540344794796,1541106106796,Home,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",200,GET


### 3.2 Fetch ```staging_songs``` table

In [8]:
# Side effect: also writes ../../data/staging_songs.csv (fetch_dataframe's default folder).
staging_songs = fetch_dataframe('staging_songs')
# Preview only the first two rows rather than rendering the whole table.
staging_songs.head(2)

Unnamed: 0,song_id,title,duration,year,num_songs,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,SOCIWDW12A8C13D406,Soul Deep,148.03546,1969,1,ARMJAGH1187FB546F3,The Box Tops,"Memphis, TN",35.14968,-90.04892
1,SOIGICF12A8C141BC5,Game & Watch,580.54485,2004,1,AREWD471187FB49873,Son Kite,,,


### 3.3 Fetch ```users``` table

In [9]:
# Side effect: also writes ../../data/users.csv (fetch_dataframe's default folder).
users = fetch_dataframe('users')
# Preview only the first two rows rather than rendering the whole table.
users.head(2)

Unnamed: 0,user_id,first_name,last_name,gender,level
0,2,Jizelle,Benjamin,F,free
1,4,Alivia,Terrell,F,free


### 3.4 Fetch ```songs``` table

In [10]:
# Side effect: also writes ../../data/songs.csv (fetch_dataframe's default folder).
songs = fetch_dataframe('songs')
# Preview only the first two rows rather than rendering the whole table.
songs.head(2)

Unnamed: 0,song_id,title,year,duration,artist_id
0,SORKDUM12A6D4FA813,A Little Bit Of Love,2007,190.17098,ARYKVZI1187B9AE8A9
1,SOIKPOX12A58A7BA0B,A Lo Clasico,2007,235.15383,ARBNXYA1187FB51C50


### 3.5 Fetch ```artists``` table

In [11]:
# Side effect: also writes ../../data/artists.csv (fetch_dataframe's default folder).
artists = fetch_dataframe('artists')
# Preview only the first two rows rather than rendering the whole table.
artists.head(2)

Unnamed: 0,artist_id,name,location,latitude,longitude
0,AROS1ML1187FB4CF35,12 Stones,"Mandeville, Louisiana",30.37251,-90.0791
1,ARHO39G1187FB4E31B,38 Special,"Jacksonville, FL",,


### 3.6 Fetch ```time``` table

In [12]:
# Side effect: also writes ../../data/time.csv (fetch_dataframe's default folder).
# NOTE(review): the variable name `time` would shadow the stdlib `time` module
# if that module were ever imported in this notebook.
time = fetch_dataframe('time')
# Preview only the first two rows rather than rendering the whole table.
time.head(2)

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-02 17:31:45.796,17,2,44,11,2018,5
1,2018-11-02 18:02:42.796,18,2,44,11,2018,5


### 3.7 Fetch ```songplays``` table

In [13]:
# Side effect: also writes ../../data/songplays.csv (fetch_dataframe's default folder).
songplays = fetch_dataframe('songplays')
# Preview only the first two rows rather than rendering the whole table.
songplays.head(2)

Unnamed: 0,songplay_id,level,location,user_agent,session_id,user_id,song_id,artist_id,start_time
0,101823,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",182,10,SOHTKMO12AB01843B0,AR5EYTL1187B98EDA0,2018-11-02 17:31:45.796
1,502072,free,"New Haven-Milford, CT","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",207,50,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,2018-11-02 18:02:42.796


## 4. Close Connection

In [14]:
# Close the shared connection; fetch_dataframe and both pipelines use this
# same object, so they cannot be called again after this cell.
connection.close()