# ETL Processes

In [7]:
import os
import json
import glob
import psycopg2
import pandas as pd
from sql_queries import *

In [8]:
# Connection settings are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD) when they are set, so real
# credentials never have to be written into the notebook. The fallbacks are
# the original local-development values, so behaviour is unchanged when the
# variables are absent.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "127.0.0.1"),
    dbname=os.environ.get("PGDATABASE", "sparkifydb"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "******"),
)
# Single cursor reused by every insert/select cell below.
cur = conn.cursor()

In [9]:
def get_files(filepath):
    """Recursively collect the absolute paths of all ``*.json`` files under a directory.

    Parameters
    ----------
    filepath : str
        Root directory to search. Every sub-directory is visited.

    Returns
    -------
    list of str
        Absolute paths of every matching file, sorted so that the result
        (and therefore ``get_files(...)[0]``) is the same on every run.
        Empty when the directory does not exist or holds no JSON files.
    """
    all_files = []
    for root, _dirs, _names in os.walk(filepath):
        # Escape the directory part: without this, glob metacharacters in a
        # folder name (e.g. "data[1]") are read as a pattern and the folder's
        # files are silently skipped.
        pattern = os.path.join(glob.escape(root), '*.json')
        all_files.extend(os.path.abspath(match) for match in glob.glob(pattern))

    # os.walk/glob order depends on the file system; sort for reproducibility.
    return sorted(all_files)

# Process `song_data`
In this first part, let's perform ETL on the first dataset, `song_data`, to create the `songs` and `artists` dimensional tables.

Let's perform ETL on a single song file and load a single record into each table to start.

In [10]:
# Build the path with os.path.join so the separator matches the host OS;
# a hard-coded "\\" only resolves on Windows.
filepath = os.path.join("data", "song_data")

In [11]:
# Collect the absolute path of every *.json file under the song data directory.
song_files = get_files(filepath)

In [12]:
# Show the first discovered song file; it is the one loaded in the next cell.
song_files[0]

'C:\\Users\\sravg\\Downloads\\2020 Learning\\Data Engineer Nanodegree\\Projects\\DataModeling_Relational\\project-template\\data\\song_data\\A\\A\\A\\TRAAAAW128F429D538.json'

In [13]:
# Load the first song file into a DataFrame and display it.
df = pd.read_json(song_files[0], lines=True)   # lines=True: parse the file as one JSON object per line
df

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARD7TVE1187B99BFB1,,California - LA,,Casual,218.93179,1,SOMZWCG12A8C13C480,I Didn't Mean To,0


## #1: `songs` Table
#### Extract Data for Songs Table

In [14]:
# Pull the songs-table columns out of the first row as a plain Python list,
# in the column order listed here.
song_data = (
    df[['song_id', 'title', 'artist_id', 'year', 'duration']]
    .values
    .tolist()[0]
)
song_data

['SOMZWCG12A8C13C480', "I Didn't Mean To", 'ARD7TVE1187B99BFB1', 0, 218.93179]

#### Insert Record into Song Table

In [9]:
# Insert the single extracted song record and commit it immediately.
# `song_table_insert` is not defined in this notebook; presumably it comes
# from the `sql_queries` star import.
cur.execute(song_table_insert, song_data)
conn.commit()

Run `test.ipynb` to see if records are successfully added to this table.

## #2: `artists` Table
#### Extract Data for Artists Table

In [15]:
# Pull the artists-table columns out of the first row as a plain Python list.
# Missing latitude/longitude values stay as NaN in the result.
artist_data = (
    df[['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']]
    .values
    .tolist()[0]
)
artist_data

['ARD7TVE1187B99BFB1', 'Casual', 'California - LA', nan, nan]

#### Insert Record into Artist Table

In [11]:
# Display the INSERT statement used for the artists table.
artist_table_insert

'\nINSERT INTO artists\n(artist_ID, name, location, latitude, longitude) VALUES (%s, %s, %s, %s, %s);\n\n'

In [12]:
# Insert the single extracted artist record and commit it immediately.
cur.execute(artist_table_insert, artist_data)
conn.commit()

Run `test.ipynb` to see if records are successfully added to this table.

# Process `log_data`
In this part, let's perform ETL on the second dataset, `log_data`, to create the `time` and `users` dimensional tables, as well as the `songplays` fact table.

Let's perform ETL on a single log file and load a single record into each table.

In [16]:
# Build the path with os.path.join so the separator matches the host OS;
# a hard-coded "\\" only resolves on Windows.
filepath = os.path.join("data", "log_data")

In [17]:
# Collect every log file path, then show the first one.
log_files = get_files(filepath)
log_files[0]

'C:\\Users\\sravg\\Downloads\\2020 Learning\\Data Engineer Nanodegree\\Projects\\DataModeling_Relational\\project-template\\data\\log_data\\2018\\11\\2018-11-01-events.json'

In [18]:
# Load the first log file (one JSON object per line) and display it.
# Note: this rebinds `df`, which previously held the song record.
df = pd.read_json(log_files[0], lines=True)
df

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919166796,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",39
1,,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1540344794796,139,,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
2,Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
3,,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1540344794796,139,,200,1541106132796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
4,Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
5,Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Quem Quiser Encontrar O Amor,200,1541106496796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
6,The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Eriatarka,200,1541106673796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
7,Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Becoming Insane,200,1541107053796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
8,Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Congratulations,200,1541107493796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
9,Girl Talk,Logged In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Once again,200,1541107734796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8


## #3: `time` Table
#### Extract Data for Time Table by following the steps below:
- Filter records by `NextSong` action
- Convert the `ts` timestamp column to datetime
- Extract the timestamp, hour, day, week of year, month, year, and weekday from the `ts` column and set `time_data` to a list containing these values in order
- Specify labels for these columns and set to `column_labels`
- Create a dataframe, `time_df`, containing the time data for this file by combining `column_labels` and `time_data` into a dictionary and converting this into a dataframe

In [19]:
# Keep only song-play events, i.e. rows whose `page` is 'NextSong'.
df = df[df['page'].eq('NextSong')]
df.head(15)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
2,Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
4,Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
5,Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Quem Quiser Encontrar O Amor,200,1541106496796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
6,The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Eriatarka,200,1541106673796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
7,Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Becoming Insane,200,1541107053796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
8,Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Congratulations,200,1541107493796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
9,Girl Talk,Logged In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Once again,200,1541107734796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
10,Black Eyed Peas,Logged In,Sylvie,F,0,Cruz,214.93506,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1540266185796,9,Pump It,200,1541108520796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",10
12,Fall Out Boy,Logged In,Ryan,M,1,Smith,200.72444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541016707796,169,Nobody Puts Baby In The Corner,200,1541109125796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
13,M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541016707796,169,Mango Pickle Down River (With The Wilcannia Mob),200,1541109325796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26


In [20]:
# Convert the epoch-millisecond `ts` column into pandas datetimes.
t = pd.to_datetime(df['ts'], unit='ms')
t.head()

2   2018-11-01 21:01:46.796
4   2018-11-01 21:05:52.796
5   2018-11-01 21:08:16.796
6   2018-11-01 21:11:13.796
7   2018-11-01 21:17:33.796
Name: ts, dtype: datetime64[ns]

In [21]:
# One array per `time` column, in the same order as `column_labels`.
# `Series.dt.weekday_name` and `Series.dt.weekofyear` were removed from pandas,
# so use `day_name()` and the ISO calendar week instead (needs pandas >= 1.1).
# The week is cast from its nullable UInt32 dtype to int64 so that `.values`
# is a plain NumPy array like the other columns.
time_data = [
    df.ts.values,                                     # start_time: raw `ts` values (epoch milliseconds)
    t.dt.hour.values,
    t.dt.day.values,
    t.dt.isocalendar().week.astype('int64').values,   # ISO week of year
    t.dt.month.values,
    t.dt.year.values,
    t.dt.day_name().values,                           # e.g. 'Thursday'
]
column_labels = ['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']

In [22]:
# Inspect the extracted arrays (one per time-table column).
time_data

[array([1541106106796, 1541106352796, 1541106496796, 1541106673796,
        1541107053796, 1541107493796, 1541107734796, 1541108520796,
        1541109125796, 1541109325796, 1541110994796], dtype=int64),
 array([21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22], dtype=int64),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array([44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], dtype=int64),
 array([11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11], dtype=int64),
 array([2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018],
       dtype=int64),
 array(['Thursday', 'Thursday', 'Thursday', 'Thursday', 'Thursday',
        'Thursday', 'Thursday', 'Thursday', 'Thursday', 'Thursday',
        'Thursday'], dtype=object)]

In [23]:
# Pair each label with its array of values to form the columns of the time DataFrame.
time_df = pd.DataFrame(
    {label: values for label, values in zip(column_labels, time_data)}
)
time_df.head()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,1541106106796,21,1,44,11,2018,Thursday
1,1541106352796,21,1,44,11,2018,Thursday
2,1541106496796,21,1,44,11,2018,Thursday
3,1541106673796,21,1,44,11,2018,Thursday
4,1541107053796,21,1,44,11,2018,Thursday


#### Insert Records into Time Table

In [23]:
# Insert the time records one at a time, committing after every row.
for i, row in time_df.iterrows():
    # Each row mixes integers with the weekday string, so it is an
    # object-dtype Series and tolist() returns its values unchanged.
    cur.execute(time_table_insert, row.tolist())
    conn.commit()

Run `test.ipynb` to see if records are successfully added to this table.

## #4: `users` Table
#### Extract Data for Users Table

In [24]:
# Select the user attribute columns from the filtered log events.
# The same user appears once per event, so rows are repeated.
user_df = df.loc[:, ['userId', 'firstName', 'lastName', 'gender', 'level']]
user_df.head(15)

Unnamed: 0,userId,firstName,lastName,gender,level
2,8,Kaylee,Summers,F,free
4,8,Kaylee,Summers,F,free
5,8,Kaylee,Summers,F,free
6,8,Kaylee,Summers,F,free
7,8,Kaylee,Summers,F,free
8,8,Kaylee,Summers,F,free
9,8,Kaylee,Summers,F,free
10,10,Sylvie,Cruz,F,free
12,26,Ryan,Smith,M,free
13,26,Ryan,Smith,M,free


#### Insert Records into Users Table

In [53]:
# Insert the user records one at a time, committing after every row.
# `user_df` repeats each user once per event; this presumably relies on
# `user_table_insert` handling the duplicate keys (not visible here) — TODO confirm.
for i, row in user_df.iterrows():
    # Pass a plain list, as the time-table insert does. Handing psycopg2 the
    # label-indexed Series makes it look parameters up by integer position,
    # which depends on pandas' deprecated positional fallback.
    # Assumes positional %s placeholders, like `artist_table_insert`.
    cur.execute(user_table_insert, list(row))
    conn.commit()

Run `test.ipynb` to see if records are successfully added to this table.

## #5: `songplays` Table
#### Extract Data for Songplays Table
The log file does not specify an ID for either the song or the artist, so we need to get the song ID and artist ID by querying the songs and artists tables to find matches based on song title, artist name, and song duration.

#### Insert Records into Songplays Table

In [25]:
for index, row in df.iterrows():

    # look up the song and artist IDs for this event by title, artist name and duration
    cur.execute(song_select, (row.song, row.artist, row.length))
    results = cur.fetchone()

    # no matching row -> both IDs stay None
    songid, artistid = results if results else (None, None)

    # build the songplay record; its first field is the event's DataFrame index label plus one
    songplay_data = [
        index + 1,
        row.ts,
        row.userId,
        row.level,
        songid,
        artistid,
        row.sessionId,
        row.location,
        row.userAgent,
    ]
    cur.execute(songplay_table_insert, songplay_data)
    conn.commit()

Run `test.ipynb` to see if records are successfully added to this table.

# Close Connection to Sparkify Database

In [26]:
# Close the database connection opened at the top of the notebook.
conn.close()

# Implement `etl.py`
Use what has been completed in this notebook to implement `etl.py`.