# ETL Processes

In [4]:
# import necessary packages
import os
import glob
import psycopg2
import pandas as pd
from sql_queries import *

In [5]:
# connect to the database
conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
cur = conn.cursor()

In [6]:
# create get_files function to get files from directory
def get_files(filepath):
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

# Process `song_data`

In [4]:
# folder with song files set as variable
song_files = 'data/song_data'

In [5]:
# use the get_files function get the list of all files
filepath = get_files(song_files)
# check the number of files in the list
len(filepath)

74

In [6]:
# create dataframe from song files 
df = pd.DataFrame()
for file in filepath:
    data = pd.read_json(file, lines=True)
    df = df.append(data, ignore_index = True)
# display first 5 records
df.head(5)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARD7TVE1187B99BFB1,,California - LA,,Casual,218.93179,1,SOMZWCG12A8C13C480,I Didn't Mean To,0
1,ARNTLGG11E2835DDB9,,,,Clp,266.39628,1,SOUDSGM12AC9618304,Insatiable (Instrumental Version),0
2,AR8ZCNI1187B9A069B,,,,Planet P Project,269.81832,1,SOIAZJW12AB01853F1,Pink World,1984
3,AR10USD1187B99F3F1,,"Burlington, Ontario, Canada",,Tweeterfriendly Music,189.57016,1,SOHKNRJ12A6701D1F8,Drop of Rain,0
4,ARMJAGH1187FB546F3,35.14968,"Memphis, TN",-90.04892,The Box Tops,148.03546,1,SOCIWDW12A8C13D406,Soul Deep,1969


#### Extract Data for Songs Table

In [8]:
# extract one record for the songs table from the dataframe
song_data = list(df[['song_id', 'title', 'artist_id', 'year', 'duration']].values[0])
song_data

['SOMZWCG12A8C13C480', "I Didn't Mean To", 'ARD7TVE1187B99BFB1', 0, 218.93179]

#### Insert Record into Song Table

In [9]:
# load song data to the song table
cur.execute(song_table_insert, song_data)
conn.commit()

Run `test.ipynb` to see if you've successfully added a record to this table.

#### Extract Data for Artists Table

In [11]:
# extract one record for the artist table from the dataframe
artist_data = list(df[['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude']].values[0])
artist_data

['ARD7TVE1187B99BFB1', 'Casual', 'California - LA', nan, nan]

#### Insert Record into Artist Table

In [12]:
# load artist data to the artist table
cur.execute(artist_table_insert, artist_data)
conn.commit()

Run `test.ipynb` to see if you've successfully added a record to this table.

# Process `log_data`

In [13]:
# folder with log files set as variable
log_files = 'data/log_data'

In [14]:
# use the get_files function get the list of all files
filepath = get_files(log_files)
# check the number of files in the list
len(filepath)

30

In [15]:
# create dataframe from log files 
df1 = pd.DataFrame()
for file in filepath:
    data = pd.read_json(file, lines=True)
    df1 = df1.append(data, ignore_index = True)
# display last 5 records
df1.tail(5)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
8051,38 Special,Logged In,Jaleah,F,16,Hayes,280.76363,paid,"San Antonio-New Braunfels, TX",PUT,NextSong,1541003000000.0,113,Caught Up In You,200,1542498585796,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,70
8052,Creedence Clearwater Revival,Logged In,Chloe,F,4,Cuevas,186.51383,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1540941000000.0,680,Proud Mary,200,1542498768796,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,49
8053,Ill Nino,Logged In,Jaleah,F,17,Hayes,175.22893,paid,"San Antonio-New Braunfels, TX",PUT,NextSong,1541003000000.0,113,My Resurrection (Album Version),200,1542498865796,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,70
8054,The Police,Logged In,Chloe,F,5,Cuevas,289.85424,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1540941000000.0,680,So Lonely,200,1542498954796,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,49
8055,Bitter:Sweet,Logged In,Jaleah,F,18,Hayes,160.78322,paid,"San Antonio-New Braunfels, TX",PUT,NextSong,1541003000000.0,113,Take 2 Blue,200,1542499040796,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,70


#### Extract Data for Time Table

In [1]:
# check unique values in the page column
df1['page'].unique()

NameError: name 'df1' is not defined

In [17]:
# filter by NextSong
df1 = df1[df1['page'] == 'NextSong']

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Stephen Lynch,Logged In,Jayden,M,0,Bell,182.85669,free,"Dallas-Fort Worth-Arlington, TX",PUT,NextSong,1540992000000.0,829,Jim Henson's Dead,200,1543537327796,Mozilla/5.0 (compatible; MSIE 10.0; Windows NT...,91
1,Manowar,Logged In,Jacob,M,0,Klein,247.562,paid,"Tampa-St. Petersburg-Clearwater, FL",PUT,NextSong,1540558000000.0,1049,Shell Shock,200,1543540121796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",73
2,Morcheeba,Logged In,Jacob,M,1,Klein,257.41016,paid,"Tampa-St. Petersburg-Clearwater, FL",PUT,NextSong,1540558000000.0,1049,Women Lose Weight (Feat: Slick Rick),200,1543540368796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",73
3,Maroon 5,Logged In,Jacob,M,2,Klein,231.23546,paid,"Tampa-St. Petersburg-Clearwater, FL",PUT,NextSong,1540558000000.0,1049,Won't Go Home Without You,200,1543540625796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",73
4,Train,Logged In,Jacob,M,3,Klein,216.76363,paid,"Tampa-St. Petersburg-Clearwater, FL",PUT,NextSong,1540558000000.0,1049,Hey_ Soul Sister,200,1543540856796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",73


In [18]:
# convert to date/time column 'ts'
t = pd.to_datetime(df1['ts'], unit='ms')
t.head(5)

0   2018-11-30 00:22:07.796
1   2018-11-30 01:08:41.796
2   2018-11-30 01:12:48.796
3   2018-11-30 01:17:05.796
4   2018-11-30 01:20:56.796
Name: ts, dtype: datetime64[ns]

In [19]:
# create dataframe from 'ts' column by sliting timestamp into 'hour', 'day', 'week', 'month', 'year', 'weekday'
time_data = (t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday)
column_labels = ('ts', 'hour', 'day', 'week', 'month', 'year', 'weekday')

In [20]:
time_df = pd.DataFrame(
            { 'ts' : time_data[0],
              'hour' : time_data[1],
              'day' : time_data[2],
              'week' : time_data[3],
              'month' : time_data[4],
              'year' : time_data[5],
              'weekday' : time_data[6]       
             }, columns=column_labels)

time_df.head(5)

Unnamed: 0,ts,hour,day,week,month,year,weekday
0,2018-11-30 00:22:07.796,0,30,48,11,2018,4
1,2018-11-30 01:08:41.796,1,30,48,11,2018,4
2,2018-11-30 01:12:48.796,1,30,48,11,2018,4
3,2018-11-30 01:17:05.796,1,30,48,11,2018,4
4,2018-11-30 01:20:56.796,1,30,48,11,2018,4


#### Insert Records into Time Table

In [21]:
for i, row in time_df.iterrows():
    cur.execute(time_table_insert, list(row))
    conn.commit()

#### Extract Data for Users Table

In [22]:
user_df = df1[['userId', 'firstName', 'lastName', 'gender', 'level']]
user_df.head(5)

Unnamed: 0,userId,firstName,lastName,gender,level
0,91,Jayden,Bell,M,free
1,73,Jacob,Klein,M,paid
2,73,Jacob,Klein,M,paid
3,73,Jacob,Klein,M,paid
4,73,Jacob,Klein,M,paid


#### Insert Records into Users Table

In [23]:
for i, row in user_df.iterrows():
    cur.execute(user_table_insert, row)
    conn.commit()

#### Extract Data and Songplays Table

#### Insert Records into Songplays Table


In [24]:
for index, row in df1.iterrows():

    # get songid and artistid from song and artist tables
    cur.execute(song_select, (row.song, row.artist, row.length))
    results = cur.fetchone()
    
    if results:
        songid, artistid = results
    else:
        songid, artistid = None, None

    # insert songplay record
    songplay_data = [row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent]
    cur.execute(songplay_table_insert, songplay_data)

# Close Connection to Sparkify Database

In [1]:
conn.close()

NameError: name 'conn' is not defined