# Chapter 2 - Working with Pandas DataFrames

In [1]:
import numpy as np

In [2]:
data = np.genfromtxt(
    "../data/example_data.csv", delimiter=";", names=True, dtype=None, encoding="UTF"
)


In [3]:
data

array([('2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia', 'mww', 6.7, 'green', 1),
       ('2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww', 5.2, 'green', 0),
       ('2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww', 5.7, 'green', 0),
       ('2018-10-12 21:09:49.240', '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0),
       ('2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea', 'mww', 5.6, 'green', 1)],
      dtype=[('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i4')])

In [4]:
data.shape

(5,)

In [5]:
data.dtype

dtype([('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i4')])

In [6]:
%%timeit
max([row[3] for row in data])

The slowest run took 5.50 times longer than the fastest. This could mean that an intermediate result is being cached.
12.7 µs ± 8.36 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [7]:
array_dict = {
    col: np.array([row[i] for row in data]) 
    for i, col in enumerate(data.dtype.names)
}


In [8]:
array_dict

{'time': array(['2018-10-13 11:10:23.560', '2018-10-13 04:34:15.580',
        '2018-10-13 00:13:46.220', '2018-10-12 21:09:49.240',
        '2018-10-12 02:52:03.620'], dtype='<U23'),
 'place': array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
        '42km WNW of Sola, Vanuatu',
        '13km E of Nueva Concepcion, Guatemala',
        '128km SE of Kimbe, Papua New Guinea'], dtype='<U37'),
 'magType': array(['mww', 'mww', 'mww', 'mww', 'mww'], dtype='<U3'),
 'mag': array([6.7, 5.2, 5.7, 5.7, 5.6]),
 'alert': array(['green', 'green', 'green', 'green', 'green'], dtype='<U5'),
 'tsunami': array([1, 0, 0, 0, 1])}

In [9]:
%%timeit
array_dict['mag'].max()

The slowest run took 5.13 times longer than the fastest. This could mean that an intermediate result is being cached.
6.53 µs ± 3.42 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [10]:
np.array([
    value[array_dict['mag'].argmax()]
    for key, value in array_dict.items()
])

array(['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
       'mww', '6.7', 'green', '1'], dtype='<U32')

## Series

In [11]:
import pandas as pd

In [12]:
place = pd.Series(array_dict['place'], name='place')
place

0          262km NW of Ozernovskiy, Russia
1              25km E of Bitung, Indonesia
2                42km WNW of Sola, Vanuatu
3    13km E of Nueva Concepcion, Guatemala
4      128km SE of Kimbe, Papua New Guinea
Name: place, dtype: object

In [13]:
place.to_numpy()

array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
       '42km WNW of Sola, Vanuatu',
       '13km E of Nueva Concepcion, Guatemala',
       '128km SE of Kimbe, Papua New Guinea'], dtype=object)

In [14]:
place.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
place.index.values

array([0, 1, 2, 3, 4], dtype=int64)

In [16]:
# place.index.name
# place.index.dtype
# place.index.shape
place.index.is_unique

True

In [17]:
np.array([1,1,1]) + np.array([-1, 0, 1])

array([0, 1, 2])

In [18]:
numbers = np.linspace(0, 10, num=5)
numbers

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

In [19]:
x = pd.Series(numbers)
y = pd.Series(numbers, index=pd.Index([1,2,3,4,5]))
x + y

0     NaN
1     2.5
2     7.5
3    12.5
4    17.5
5     NaN
dtype: float64

## DataFrame

In [20]:
df = pd.DataFrame(array_dict)
df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia",mww,6.7,green,1
1,2018-10-13 04:34:15.580,"25km E of Bitung, Indonesia",mww,5.2,green,0
2,2018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu",mww,5.7,green,0
3,2018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala",mww,5.7,green,0
4,2018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea",mww,5.6,green,1


In [21]:
df.dtypes

time        object
place       object
magType     object
mag        float64
alert       object
tsunami      int32
dtype: object

In [22]:
df.values

array([['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
        'mww', 6.7, 'green', 1],
       ['2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww',
        5.2, 'green', 0],
       ['2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww',
        5.7, 'green', 0],
       ['2018-10-12 21:09:49.240',
        '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0],
       ['2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea',
        'mww', 5.6, 'green', 1]], dtype=object)

In [23]:
df.columns

Index(['time', 'place', 'magType', 'mag', 'alert', 'tsunami'], dtype='object')

In [24]:
df + df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.5602018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia262km NW of Oze...",mwwmww,13.4,greengreen,2
1,2018-10-13 04:34:15.5802018-10-13 04:34:15.580,"25km E of Bitung, Indonesia25km E of Bitung, I...",mwwmww,10.4,greengreen,0
2,2018-10-13 00:13:46.2202018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu42km WNW of Sola, Van...",mwwmww,11.4,greengreen,0
3,2018-10-12 21:09:49.2402018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala13km E of...",mwwmww,11.4,greengreen,0
4,2018-10-12 02:52:03.6202018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea128km SE of...",mwwmww,11.2,greengreen,2


In [25]:
pd.read_csv?

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mread_csv[0m[1;33m([0m[1;33m
[0m    [0mfilepath_or_buffer[0m[1;33m:[0m [1;34m'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]'[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0msep[0m[1;33m:[0m [1;34m'str | None | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mdelimiter[0m[1;33m:[0m [1;34m'str | None | lib.NoDefault'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mheader[0m[1;33m:[0m [1;34m"int | Sequence[int] | None | Literal['infer']"[0m [1;33m=[0m [1;34m'infer'[0m[1;33m,[0m[1;33m
[0m    [0mnames[0m[1;33m:[0m [1;34m'Sequence[Hashable] | None | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mindex_col[0m[1;33m:[0m [1;34m'IndexLabel | Literal[False] | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0musecols[0m[1;33m=

## Creating DataFrames

In [26]:
import datetime as dt
import numpy as np
import pandas as pd


In [27]:
# Check number of rows in data (-l flag)
!wc -l ../data/earthquakes.csv

In [28]:
# check size of file
!ls -lh ../data | grep earthquakes.csv

In [29]:
files = !ls -lh ../data

In [30]:
[file for file in files if '.csv' in file]

[]

In [31]:
!head -n 2 ../data/earthquakes.csv

In [32]:
!tail -n 2 ../data/earthquakes.csv

In [33]:
!awk -F',' '{print NF; exit}' ../data/earthquakes.csv

In [34]:
headers = !head -n 1 ../data/earthquakes.csv
len(headers[0].split(','))

IndexError: list index out of range

In [None]:
df = pd.read_csv('../data/earthquakes.csv')

In [None]:
df_url = pd.read_csv(
    "https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_02/data/earthquakes.csv"
)


In [None]:
df.to_csv('output.csv', index=False)

## From a database (Page 68)

In [35]:
import sqlite3

In [36]:
with sqlite3.connect('../data/quakes.db') as connection:
    pd.read_csv('../data/tsunamis.csv').to_sql(
        'tsunamis', connection, index=False, if_exists='replace'
    )

In [37]:
with sqlite3.connect('../data/quakes.db') as connection:
    tsunamis = pd.read_sql('SELECT * FROM tsunamis', connection)

In [38]:
tsunamis.head()

Unnamed: 0,alert,type,title,place,magType,mag,time
0,,earthquake,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...","165km NNW of Flying Fish Cove, Christmas Island",mww,5.0,1539459504090
1,green,earthquake,"M 6.7 - 262km NW of Ozernovskiy, Russia","262km NW of Ozernovskiy, Russia",mww,6.7,1539429023560
2,green,earthquake,"M 5.6 - 128km SE of Kimbe, Papua New Guinea","128km SE of Kimbe, Papua New Guinea",mww,5.6,1539312723620
3,green,earthquake,"M 6.5 - 148km S of Severo-Kuril'sk, Russia","148km S of Severo-Kuril'sk, Russia",mww,6.5,1539213362130
4,green,earthquake,"M 6.2 - 94km SW of Kokopo, Papua New Guinea","94km SW of Kokopo, Papua New Guinea",mww,6.2,1539208835130


## From an API

In [39]:
import requests

In [40]:
yesterday = dt.date.today() - dt.timedelta(days=1)

In [41]:
api = 'https://earthquake.usgs.gov/fdsnws/event/1/query'

In [42]:
payload = {
    'format': 'geojson',
    'starttime': yesterday - dt.timedelta(days=30),
    'endtime': yesterday
}

response = requests.get(api, params=payload)

In [43]:
response.status_code

200

In [44]:
earthquake_json = response.json()
earthquake_json.keys()

dict_keys(['type', 'metadata', 'features', 'bbox'])

In [45]:
earthquake_json['metadata']

{'generated': 1669588689000,
 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2022-10-27&endtime=2022-11-26',
 'title': 'USGS Earthquakes',
 'status': 200,
 'api': '1.13.6',
 'count': 10139}

In [46]:
type(earthquake_json['features'])

list

In [47]:
earthquake_json['features'][0]

{'type': 'Feature',
 'properties': {'mag': 2.35,
  'place': '2 km SSW of Indios, Puerto Rico',
  'time': 1669420597660,
  'updated': 1669421743690,
  'tz': None,
  'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/pr71384008',
  'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=pr71384008&format=geojson',
  'felt': None,
  'cdi': None,
  'mmi': None,
  'alert': None,
  'status': 'reviewed',
  'tsunami': 0,
  'sig': 85,
  'net': 'pr',
  'code': '71384008',
  'ids': ',pr71384008,',
  'sources': ',pr,',
  'types': ',origin,phase-data,',
  'nst': 6,
  'dmin': 0.04815,
  'rms': 0.07,
  'gap': 259,
  'magType': 'md',
  'type': 'earthquake',
  'title': 'M 2.4 - 2 km SSW of Indios, Puerto Rico'},
 'geometry': {'type': 'Point',
  'coordinates': [-66.8286666666667, 17.9753333333333, 12.19]},
 'id': 'pr71384008'}

In [48]:
earthquake_properties_data = [
    quake['properties'] for quake in earthquake_json['features']
]

In [49]:
df = pd.DataFrame(earthquake_properties_data)

In [50]:
df.head()

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
0,2.35,"2 km SSW of Indios, Puerto Rico",1669420597660,1669421743690,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",pr71384008,",",pr,",",origin,phase-data,",6.0,0.04815,0.07,259.0,md,earthquake,"M 2.4 - 2 km SSW of Indios, Puerto Rico"
1,5.0,"94 km S of Ust’-Kamchatsk Staryy, Russia",1669419464776,1669421827040,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",us7000isgf,",",us,",",origin,phase-data,",199.0,3.257,0.69,111.0,mb,earthquake,"M 5.0 - 94 km S of Ust’-Kamchatsk Staryy, Russia"
2,2.5,"18 km NW of Cacouna, Canada",1669419244929,1669449869040,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",us7000isip,",",us,",",origin,phase-data,",10.0,0.653,0.19,138.0,ml,earthquake,"M 2.5 - 18 km NW of Cacouna, Canada"
3,2.16,"45km ENE of Ensenada, B.C., MX",1669418926840,1669419140205,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",ci40380936,",",ci,",",nearby-cities,origin,phase-data,scitech-link,",12.0,0.1991,0.19,97.0,ml,earthquake,"M 2.2 - 45km ENE of Ensenada, B.C., MX"
4,1.98866,"3 km WSW of Oceanside, Oregon",1669418755000,1669526433700,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",uw61896181,",",uw,",",origin,phase-data,",20.0,0.1866,1.86,253.0,ml,earthquake,"M 2.0 - 3 km WSW of Oceanside, Oregon"


In [54]:
earthquake_geometry_data = [
    quake['geometry'] for quake in earthquake_json['features']
]

In [55]:
df_geom = pd.DataFrame(earthquake_geometry_data)

In [56]:
df_geom

Unnamed: 0,type,coordinates
0,Point,"[-66.8286666666667, 17.9753333333333, 12.19]"
1,Point,"[162.4761, 55.3763, 10]"
2,Point,"[-69.6728, 48.0309, 18.088]"
3,Point,"[-116.18, 32.0335, 14.61]"
4,Point,"[-124.00916666666667, 45.442, 38.97]"
...,...,...
10134,Point,"[-66.8801666666667, 17.9931666666667, 16.02]"
10135,Point,"[-120.25116666666666, 47.678666666666665, 2.02]"
10136,Point,"[-116.5933333, 33.1591667, 8.75]"
10137,Point,"[-66.5355, -27.9264, 167.055]"
