In [1]:
# import from etl dir
import sys, os
etl_path = os.path.normpath(os.path.abspath(os.path.join(os.path.abspath(''), os.path.pardir, 'etl')))
if (not (etl_path in sys.path)) :
    sys.path.append(etl_path)

In [2]:
# AWS libs
from s3 import S3Loader
# config libs
from config import Config
# data libs
import pandas as pd

In [3]:
def describe_types(row, columns):
    sizes = []
    for column in columns:
        value = row[column]
        size = len(value) if type(value) == 'object' else len(str(value))
        sizes.append(size)
    return sizes

In [4]:
config = Config()
s3Loader = S3Loader(config)

In [5]:
log_data = s3Loader.load_data(config.get('S3', 'LOG_DATA'), -1)

In [6]:
staging_events = pd.read_csv('../../data/staging_events.csv')
staging_events = staging_events.drop(staging_events.columns[[0]], axis=1)

In [7]:
log_data.columns = log_data.columns.str.lower()

staging_events = staging_events[staging_events.columns].sort_values(by=['ts'])
staging_events.index = log_data.index

matches = staging_events[staging_events[staging_events.columns].sort_values(
    by=['ts']) == log_data[staging_events.columns].sort_values(by=['ts'])]
matches.head(2)

Unnamed: 0,userid,firstname,lastname,gender,level,artist,song,length,sessionid,auth,iteminsession,location,registration,ts,page,useragent,status,method
0,39.0,Walter,Frye,M,free,,,,38.0,Logged In,0.0,"San Francisco-Oakland-Hayward, CA",1540919000000.0,1541105830796,Home,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",200.0,GET
1,8.0,Kaylee,Summers,F,free,,,,139.0,Logged In,0.0,"Phoenix-Mesa-Scottsdale, AZ",1540345000000.0,1541106106796,Home,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",200.0,GET


In [8]:
staging_events_filtered = staging_events[staging_events['page'] == 'NextSong'].copy()
print(staging_events_filtered.shape)
staging_events_filtered.isnull().sum()

(6820, 18)


userid           0
firstname        0
lastname         0
gender           0
level            0
artist           0
song             0
length           0
sessionid        0
auth             0
iteminsession    0
location         0
registration     0
ts               0
page             0
useragent        0
status           0
method           0
dtype: int64

In [9]:
staging_events_filtered['songplay_id'] = staging_events_filtered.apply(
    lambda x: str(x['userid']) + str(x['sessionid']) + str(x['iteminsession']), axis=1)
len(staging_events_filtered['songplay_id'].unique())

6820

In [10]:
columns = staging_events.columns
sizes = staging_events.apply(lambda x: describe_types(x, columns), axis=1)
pd.DataFrame(sizes.tolist(), columns=columns).max()

userid             5
firstname         10
lastname           9
gender             3
level              4
artist            89
song             151
length            10
sessionid          4
auth              10
iteminsession      3
location          46
registration      15
ts                13
page              16
useragent        139
status             3
method             3
dtype: int64

In [11]:
log_path = s3Loader.load_path(config.get('S3', 'LOG_JSON_PATH'))
log_path

{'jsonpaths': ["$['artist']",
  "$['auth']",
  "$['firstName']",
  "$['gender']",
  "$['itemInSession']",
  "$['lastName']",
  "$['length']",
  "$['level']",
  "$['location']",
  "$['method']",
  "$['page']",
  "$['registration']",
  "$['sessionId']",
  "$['song']",
  "$['status']",
  "$['ts']",
  "$['userAgent']",
  "$['userId']"]}

In [12]:
song_data = s3Loader.load_data(config.get('S3', 'SONG_DATA'), 100)
song_data.head(2)

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARJNIUY12298900C91,,,,Adelitas Way,213.9424,1,SOBLFFE12AF72AA5BA,Scream,2009
1,AR73AIO1187B9AD57B,37.77916,"San Francisco, CA",-122.42005,Western Addiction,118.07302,1,SOQPWCR12A6D4FB2A3,A Poor Recipe For Civic Cohesion,2005


In [13]:
columns = song_data.columns
sizes = song_data.apply(lambda x: describe_types(x, columns), axis=1)
pd.DataFrame(sizes.tolist(), columns=columns).max()

artist_id           18
artist_latitude      9
artist_location     33
artist_longitude    19
artist_name         33
duration             9
num_songs            1
song_id             18
title               54
year                 4
dtype: int64

In [14]:
staging_songs = pd.read_csv('../../data/staging_songs.csv')
staging_songs = staging_songs.drop(staging_songs.columns[[0]], axis=1)
staging_songs[['title', 'artist_name']] = staging_songs[['title', 'artist_name']].apply(lambda x: x.str.strip())
staging_songs.head(2)

Unnamed: 0,song_id,title,duration,year,num_songs,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,SOBLFFE12AF72AA5BA,Scream,213.9424,2009,1,ARJNIUY12298900C91,Adelitas Way,,,
1,SOEKAZG12AB018837E,I'll Slap Your Face (Entertainment USA Theme),129.85424,2001,1,ARSVTNL1187B992A91,Jonathan King,"London, England",51.50632,-0.12714


In [15]:
staging_events_filtered[['song', 'artist']] = staging_events_filtered[['song', 'artist']].apply(lambda x: x.str.strip())

In [16]:
staging_songplays = pd.merge(
    staging_events_filtered,
    staging_songs,
    how="inner",
    left_on=['song', 'artist'],
    right_on=['title', 'artist_name']
)
print(staging_songplays.shape) 
staging_songplays.head(2)

(333, 29)


Unnamed: 0,userid,firstname,lastname,gender,level,artist,song,length,sessionid,auth,...,song_id,title,duration,year,num_songs,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,8.0,Kaylee,Summers,F,free,The Mars Volta,Eriatarka,380.42077,139,Logged In,...,SOEIQUY12AF72A086A,Eriatarka,380.42077,2003,1,ARHUC691187B9AD27F,The Mars Volta,"Long Beach, California",,
1,50.0,Ava,Robinson,F,free,Dwight Yoakam,You're The One,239.3073,156,Logged In,...,SOBONKR12A58A7A7E0,You're The One,239.3073,1990,1,AR5E44Z1187B9A1D74,Dwight Yoakam,"Pikeville, KY",37.4817,-82.51887


In [17]:
users = pd.read_csv('../../data/users.csv')
users.head(2)

Unnamed: 0.1,Unnamed: 0,user_id,first_name,last_name,gender,level
0,0,2,Jizelle,Benjamin,F,free
1,1,4,Alivia,Terrell,F,free


In [18]:
songs = pd.read_csv('../../data/songs.csv')
songs.head(2)

Unnamed: 0.1,Unnamed: 0,song_id,title,year,duration,artist_id
0,0,SOSQIHH12A8C13370B,15 Step,2007,237.21751,ARH6W4X1187B99274F
1,1,SOKOGIP12AB0182FCD,Adrenaline,2007,200.9073,AROS1ML1187FB4CF35


In [19]:
artists = pd.read_csv('../../data/artists.csv')
artists.head(2)

Unnamed: 0.1,Unnamed: 0,artist_id,name,location,latitude,longitude
0,0,AROS1ML1187FB4CF35,12 Stones,"Mandeville, Louisiana",30.37251,-90.0791
1,1,ARHO39G1187FB4E31B,38 Special,"Jacksonville, FL",,


In [20]:
time = pd.read_csv('../../data/time.csv')
time.head(2)

Unnamed: 0.1,Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,0,2018-11-02 17:31:45.796,17,2,44,11,2018,5
1,1,2018-11-02 18:02:42.796,18,2,44,11,2018,5


In [21]:
songplays = pd.read_csv('../../data/songplays.csv')
songplays.head(2)

Unnamed: 0.1,Unnamed: 0,songplay_id,level,location,user_agent,session_id,user_id,song_id,artist_id,start_time
0,0,101823,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",182,10,SOHTKMO12AB01843B0,AR5EYTL1187B98EDA0,2018-11-02 17:31:45.796
1,1,502072,free,"New Haven-Milford, CT","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",207,50,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,2018-11-02 18:02:42.796


In [22]:
pd.DataFrame({
    'songplays': [songplays.shape[0]],
    'users': [users.shape[0]],
    'songs': [songs.shape[0]],
    'artists': [artists.shape[0]],
    'time': [time.shape[0]],
})

Unnamed: 0,songplays,users,songs,artists,time
0,319,104,209,195,319
