# Exploring the Data

This notebook to explore and understand the data.


In [1]:
import os
import glob
import psycopg2
import pandas as pd
from sql_queries import *
import numpy as np
from tqdm import tqdm

In [2]:
conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
cur = conn.cursor()
conn.set_session(autocommit=True)

In [5]:
def get_files(filepath):
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

In [6]:
filepath = "data/song_data"
song_files = get_files(filepath)

In [7]:
df = pd.concat([pd.read_json(song_files[i], lines=True) for i in range(len(song_files))])
df.head()

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,AR7G5I41187FB4CE6C,,,"London, England",Adam Ant,SONHOTT12A8C13493C,Something Girls,233.40363,1982
0,1,AR8ZCNI1187B9A069B,,,,Planet P Project,SOIAZJW12AB01853F1,Pink World,269.81832,1984
0,1,ARXR32B1187FB57099,,,,Gob,SOFSOCN12A8C143F5D,Face the Ashes,209.60608,2007
0,1,AR10USD1187B99F3F1,,,"Burlington, Ontario, Canada",Tweeterfriendly Music,SOHKNRJ12A6701D1F8,Drop of Rain,189.57016,0
0,1,ARGSJW91187B9B1D6B,35.21962,-80.01955,North Carolina,JennyAnyKind,SOQHXMF12AB0182363,Young Boy Blues,218.77506,0


## Check for null values

In [8]:
df.isnull().sum(axis = 0)

num_songs            0
artist_id            0
artist_latitude     41
artist_longitude    41
artist_location      0
artist_name          0
song_id              0
title                0
duration             0
year                 0
dtype: int64

Here we see only lat and long have null values

## Find the min and max of each string / number

In [13]:
def find_min_max(column):
    data_type = type(column[0])
    
    minimum = float('inf')
    maximum = float('-inf')
    
    if data_type == str:
        for i in column:
            if i is None:
                continue
            if len(i) < minimum:
                minimum = len(i)
            elif len(i) > maximum:
                maximum = len(i)
    elif data_type == int or data_type == np.int32:
        for i in column:
            if i < minimum:
                minimum = i
            elif i > maximum:
                maximum = i
    elif data_type == float:
        precision = 0
        scale = 0
        for i in column:
            if pd.isna(i):
                continue
            p, s = str(i).split('.')
            if len(p) > precision:
                precision = len(p)
            if len(s) > scale:
                scale = len(s)
                
            if i < minimum:
                minimum = i
            elif i > maximum:
                maximum = i
        print('max scale {}\n max precision {}'.format(scale, scale+precision, ))
        
    
    print('minimum', minimum, '\nmaximum', maximum)

In [14]:
titles = {"num_songs":0,
"artist_id":1,
"artist_latitude":2,
"artist_longitude":3,
"artist_location":4,
"artist_name":5,
"song_id":6,
"title":7,
"duration":8,
"year":9}

for k,v in titles.items():
    print(k)
    find_min_max(df.to_numpy()[:,[v]].flatten())
    print()


num_songs
minimum 1 
maximum 1

artist_id
minimum 18 
maximum 18

artist_latitude
max scale 5
 max precision 8
minimum -13.442 
maximum 56.27609

artist_longitude
max scale 15
 max precision 19
minimum -122.42005 
maximum 15.967600000000001

artist_location
minimum 0 
maximum 29

artist_name
minimum 3 
maximum 94

song_id
minimum 18 
maximum 18

title
minimum 5 
maximum 52

duration
max scale 5
 max precision 8
minimum 29.54404 
maximum 599.24853

year
minimum 0 
maximum 2008



In [15]:
filepath = "data/log_data"
logfiles = get_files(filepath)

In [17]:
df = pd.concat([pd.read_json(logfiles[i], lines=True) for i in range(len(logfiles))])
df.head(2)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Frumpies,Logged In,Anabelle,F,0,Simpson,134.47791,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1541044000000.0,455,Fuck Kitty,200,1541903636796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",69
1,Kenny G with Peabo Bryson,Logged In,Anabelle,F,1,Simpson,264.75057,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1541044000000.0,455,By The Time This Night Is Over,200,1541903770796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",69


In [21]:
titles = {"artist":0,
"auth":1,
"firstName":2,
"gender":3,
"itemInSession":4,
"lastName":5,
"length":6,
"level":7,
"location":8,
"method":9,
"page":10,
"registration":11,
"sessionId":12,
"song":13,
"userAgent":16}

for k,v in titles.items():
    print(k)
    find_min_max(df.to_numpy()[:,[v]].flatten())
    print()


artist
minimum 2 
maximum 89

auth
minimum 9 
maximum 10

firstName
minimum 3 
maximum 10

gender
minimum 1 
maximum 1

itemInSession
minimum 0 
maximum 127

lastName
minimum 3 
maximum 9

length
max scale 14
 max precision 18
minimum 15.85587 
maximum 2594.87302

level
minimum 4 
maximum 4

location
minimum 10 
maximum 46

method
minimum 3 
maximum 3

page
minimum 4 
maximum 16

registration
max scale 1
 max precision 14
minimum 1539908999796.0 
maximum 1541098488796.0

sessionId
minimum 3 
maximum 1114

song
minimum 1 
maximum 151

userAgent
minimum 63 
maximum 139

