In [118]:
# system libs
import glob
import os

# etl/eda libs
import pandas as pd

#### 1. Get a list of JSON files in the folder and subfolders

In [119]:
def get_files(folder):
    all_files = []
    for root, dirs, files in os.walk(folder):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

#### 2. Get a concatenated DataFrame of all JSON files in the list

In [120]:
def get_data(files):
    dfs = []
    for f in files:
        df = pd.read_json(f, lines=True)
        dfs.append(df)
    
    return pd.concat(dfs, ignore_index=True)

#### 3. Get the DataFrame for all JSON files in the folder ```'data/song_data'```

In [121]:
song_files = get_files('data/song_data')
song_df = get_data(song_files)

print(f'Number of songs: {song_df.shape[0]}')
song_df.head(2)

Number of songs: 71


Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,ARNF6401187FB57032,40.79086,-73.96644,"New York, NY [Manhattan]",Sophie B. Hawkins,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,305.162,1994
1,1,ARVBRGZ1187FB4675A,,,,Gwen Stefani,SORRZGD12A6310DBC3,Harajuku Girls,290.55955,2004


#### 4. Trim all song strings to avoid query mismatch

In [122]:
strings = list(song_df.select_dtypes(include=['object']).columns)
song_df[strings] = song_df[strings].apply(lambda x: x.str.strip())

#### 5. Look for duplicates on the sensitive columns

In [123]:
check = ['song_id', 'title', 'artist_id', 'artist_name', 'duration']
for column in check:
    duplicates = song_df.duplicated(subset=[column], keep="first").sum()
    print(f'Duplicate {column}: {duplicates}')

Duplicate song_id: 0
Duplicate title: 0
Duplicate artist_id: 2
Duplicate artist_name: 2
Duplicate duration: 0


#### 6. Get the DataFrame for all JSON files in the folder ```'data/log_data'```

In [124]:
log_files = get_files('data/log_data')
log_df = get_data(log_files)

log_df = log_df[log_df['page'] == 'NextSong']
log_df['ts'] = pd.to_datetime(log_df['ts'], unit='ms')
log_df['time_data'] = log_df['ts'].apply(lambda x: [x, x.hour, x.day, x.week, x.month, x.year, x.day_name()])

print(f'Number of songplays: {log_df.shape[0]}')
log_df.head(2)

Number of songplays: 6820


Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,time_data
0,The Killers,Logged In,Jayden,M,32,Graves,246.80444,paid,"Marinette, WI-MI",PUT,NextSong,1540664000000.0,594,Read My Mind,200,2018-11-20 00:00:42.796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",25,"[2018-11-20 00:00:42.796000, 0, 20, 47, 11, 20..."
1,Tamia,Logged In,Jayden,M,33,Graves,243.09506,paid,"Marinette, WI-MI",PUT,NextSong,1540664000000.0,594,Officially Missing You (Radio Version),200,2018-11-20 00:04:48.796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",25,"[2018-11-20 00:04:48.796000, 0, 20, 47, 11, 20..."


#### 7. Trim all songplay strings to avoid query mismatch

In [125]:
strings = list(log_df.select_dtypes(include=['object']).columns)
log_df[strings] = log_df[strings].apply(lambda x: x.str.strip())

#### 8. Look for duplicates on the sensitive columns

In [126]:
check = ['song', 'artist', 'length']
for column in check:
    duplicates = log_df.duplicated(subset=[column], keep="first").sum()
    print(f'Duplicate {column}: {duplicates}')

print(f'Duplicate combined: {log_df.duplicated(subset=check, keep="first").sum()}')

Duplicate song: 1631
Duplicate artist: 3672
Duplicate length: 2826
Duplicate combined: 1496


#### 9. Check if all necessary data types match

In [127]:
print(log_df[['song', 'artist', 'length']].dtypes.tolist())
print(song_df[['title', 'artist_name', 'duration']].dtypes.tolist())

[dtype('O'), dtype('O'), dtype('float64')]
[dtype('O'), dtype('O'), dtype('float64')]


#### 10. Simulates the ```song_select``` query

- Iterate over all log_df DataFrame as proposed by the ETL process.
- Uses a loc filter to compare row.song, row.artist and row.length like the SQL WHERE statement.
- Counts the results that match and the empty ones.

~~~~sql
    SELECT s.song_id, a.artist_id
    FROM songs s
    JOIN artists a
    ON s.artist_id = a.artist_id
    WHERE s.title = %s AND a.name = %s AND s.duration = %s;
~~~~

In [128]:
empty = 0
matches = 0

for index, row in log_df.iterrows():
    result = song_df.loc[(song_df['title'] == row.song) & (song_df['artist_name'] == row.artist) & (song_df['duration'] == row.length)]

    if result.empty:
        empty += 1
    else:
        matches += 1

print(f'Empty: {empty} Matches: {matches}')

Empty: 6819 Matches: 1


#### 12. Results

- The solution shown in this notebook confirm the existence of a problem in the dataset provided for this project.
- There is only one match between a songplay (log_data) and the entire song table (song_data).
- The result of running the code below is a songplay table with 6819 null values ​​for the song_id and artist_id columns.

~~~~python
    # insert songplay records
    for index, row in df.iterrows():
        
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()
        
        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = [row.level, row.location, row.userAgent, artistid, row.sessionId, songid, row.ts, row.userId]
        cur.execute(songplay_table_insert, songplay_data)
~~~~
