# Chapter 2 - Working with Pandas DataFrames

In [1]:
import numpy as np

In [2]:
# Load the semicolon-delimited sample file as a NumPy structured array:
# names=True takes the field names from the header row and dtype=None lets
# NumPy infer a type for each column.
data = np.genfromtxt(
    "../data/example_data.csv",
    delimiter=";",
    names=True,
    dtype=None,
    encoding="UTF",
)


In [3]:
# Display the structured array: one tuple-like record per CSV row.
data

array([('2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia', 'mww', 6.7, 'green', 1),
       ('2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww', 5.2, 'green', 0),
       ('2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww', 5.7, 'green', 0),
       ('2018-10-12 21:09:49.240', '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0),
       ('2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea', 'mww', 5.6, 'green', 1)],
      dtype=[('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i4')])

In [4]:
# A structured array is one-dimensional: five records, with the columns held as fields.
data.shape

(5,)

In [6]:
# Field names and the per-field types that were inferred while loading.
data.dtype

dtype([('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i4')])

In [7]:
%%timeit
# Baseline: find the largest magnitude by looping over the records in Python
# (field 3 of each record is 'mag').
max([row[3] for row in data])

2.26 µs ± 91.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [10]:
# Pivot the row-oriented structured array into one standalone NumPy array per
# column. Indexing a structured array by field name pulls out the whole column
# in a single vectorized step, instead of looping over every record in Python
# once per field. .copy() detaches each column from `data`, so the dict holds
# independent, contiguous arrays.
array_dict = {col: data[col].copy() for col in data.dtype.names}


In [11]:
# Inspect the column-oriented layout: field name -> 1-D array.
array_dict

{'time': array(['2018-10-13 11:10:23.560', '2018-10-13 04:34:15.580',
        '2018-10-13 00:13:46.220', '2018-10-12 21:09:49.240',
        '2018-10-12 02:52:03.620'], dtype='<U23'),
 'place': array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
        '42km WNW of Sola, Vanuatu',
        '13km E of Nueva Concepcion, Guatemala',
        '128km SE of Kimbe, Papua New Guinea'], dtype='<U37'),
 'magType': array(['mww', 'mww', 'mww', 'mww', 'mww'], dtype='<U3'),
 'mag': array([6.7, 5.2, 5.7, 5.7, 5.6]),
 'alert': array(['green', 'green', 'green', 'green', 'green'], dtype='<U5'),
 'tsunami': array([1, 0, 0, 0, 1])}

In [13]:
%%timeit
# Same maximum, this time using NumPy's max() on the 'mag' column array.
array_dict['mag'].max()

873 ns ± 8.78 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [14]:
# Pull out the record for the strongest earthquake. The position of the largest
# magnitude is computed once and reused for every column, rather than calling
# argmax() again on each iteration.
# Packing mixed columns into a single array coerces every value to a string.
strongest = array_dict['mag'].argmax()
np.array([column[strongest] for column in array_dict.values()])

array(['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
       'mww', '6.7', 'green', '1'], dtype='<U32')

## Series

In [15]:
import pandas as pd

In [16]:
# Wrap the 'place' column in a Series and give the Series a name.
place = pd.Series(array_dict['place'], name='place')
place

0          262km NW of Ozernovskiy, Russia
1              25km E of Bitung, Indonesia
2                42km WNW of Sola, Vanuatu
3    13km E of Nueva Concepcion, Guatemala
4      128km SE of Kimbe, Papua New Guinea
Name: place, dtype: object

In [19]:
# Underlying values as a NumPy array (object dtype, since these are strings).
place.to_numpy()

array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
       '42km WNW of Sola, Vanuatu',
       '13km E of Nueva Concepcion, Guatemala',
       '128km SE of Kimbe, Papua New Guinea'], dtype=object)

In [20]:
# The row labels; no index was passed when creating the Series, so this is a RangeIndex.
place.index

RangeIndex(start=0, stop=5, step=1)

In [21]:
# Index labels as a plain NumPy array. to_numpy() is the accessor pandas
# recommends over .values, and it matches the place.to_numpy() call above.
place.index.to_numpy()

array([0, 1, 2, 3, 4], dtype=int64)

In [25]:
# Other Index attributes worth inspecting the same way:
# place.index.name, place.index.dtype, place.index.shape
# is_unique is True when no label occurs more than once.
place.index.is_unique

True

In [26]:
# NumPy adds arrays element by element, pairing values purely by position.
np.array([1,1,1]) + np.array([-1, 0, 1])

array([0, 1, 2])

In [27]:
# Five evenly spaced values from 0 to 10, both endpoints included.
numbers = np.linspace(0, 10, num=5)
numbers

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

In [28]:
# x gets the default labels 0-4, while y is explicitly labelled 1-5.
x = pd.Series(numbers)
y = pd.Series(numbers, index=pd.Index([1,2,3,4,5]))
# pandas aligns on index labels before adding, so labels present in only one
# operand (0 and 5) come out as NaN.
x + y

0     NaN
1     2.5
2     7.5
3    12.5
4    17.5
5     NaN
dtype: float64

## DataFrame

In [30]:
# Build a DataFrame from the dict of arrays: each key becomes a column name.
df = pd.DataFrame(array_dict)
df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia",mww,6.7,green,1
1,2018-10-13 04:34:15.580,"25km E of Bitung, Indonesia",mww,5.2,green,0
2,2018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu",mww,5.7,green,0
3,2018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala",mww,5.7,green,0
4,2018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea",mww,5.6,green,1


In [31]:
# Per-column dtypes; the string columns are stored as object.
df.dtypes

time        object
place       object
magType     object
mag        float64
alert       object
tsunami      int32
dtype: object

In [32]:
# The frame's data as a 2-D NumPy array (object dtype because the columns have
# mixed types). to_numpy() is the accessor pandas recommends over .values, and
# it matches the place.to_numpy() call used for the Series.
df.to_numpy()

array([['2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia',
        'mww', 6.7, 'green', 1],
       ['2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww',
        5.2, 'green', 0],
       ['2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww',
        5.7, 'green', 0],
       ['2018-10-12 21:09:49.240',
        '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0],
       ['2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea',
        'mww', 5.6, 'green', 1]], dtype=object)

In [33]:
# Column labels, held in an Index object.
df.columns

Index(['time', 'place', 'magType', 'mag', 'alert', 'tsunami'], dtype='object')

In [34]:
# Arithmetic is applied column by column: numeric columns are summed, while
# string columns are concatenated.
df + df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.5602018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia262km NW of Oze...",mwwmww,13.4,greengreen,2
1,2018-10-13 04:34:15.5802018-10-13 04:34:15.580,"25km E of Bitung, Indonesia25km E of Bitung, I...",mwwmww,10.4,greengreen,0
2,2018-10-13 00:13:46.2202018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu42km WNW of Sola, Van...",mwwmww,11.4,greengreen,0
3,2018-10-12 21:09:49.2402018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala13km E of...",mwwmww,11.4,greengreen,0
4,2018-10-12 02:52:03.6202018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea128km SE of...",mwwmww,11.2,greengreen,2


In [None]:
# IPython help: a trailing ? displays the docstring for pd.read_csv.
pd.read_csv?

## Creating DataFrames

In [41]:
import datetime as dt
import numpy as np
import pandas as pd


In [42]:
# Count the lines in the file (wc -l); the total includes the header row.
!wc -l ../data/earthquakes.csv

9333 ../data/earthquakes.csv


In [44]:
# check size of file
!ls -lh ../data | grep earthquakes.csv

-rw-r--r-- 1 TMadsen Domain Users 3.4M Nov  7 18:31 earthquakes.csv


In [45]:
# Capture the shell command's output in Python as a list of output lines.
files = !ls -lh ../data

In [47]:
# Keep only the listing lines for CSV files. Matching on the suffix avoids
# false positives such as "notes.csv.bak" that a substring test would let through.
[file for file in files if file.endswith('.csv')]

['-rw-r--r-- 1 TMadsen Domain Users 3.4M Nov  7 18:31 earthquakes.csv',
 '-rw-r--r-- 1 TMadsen Domain Users  397 Nov  7 18:31 example_data.csv',
 '-rw-r--r-- 1 TMadsen Domain Users 3.5M Nov  7 18:31 parsed.csv',
 '-rw-r--r-- 1 TMadsen Domain Users 6.7K Nov  7 18:31 tsunamis.csv']

In [48]:
# Peek at the header row and the first record.
!head -n 2 ../data/earthquakes.csv

alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,mmi,net,nst,place,rms,sig,sources,status,time,title,tsunami,type,types,tz,updated,url
,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci37389218&format=geojson,0.008693,,85.0,",ci37389218,",1.35,ml,,ci,26.0,"9km NE of Aguanga, CA",0.19,28,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventpage/ci37389218


In [49]:
# Peek at the last two records.
!tail -n 2 ../data/earthquakes.csv

,,38063959,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci38063959&format=geojson,0.01865,,61.0,",ci38063959,",1.1,ml,,ci,27.0,"9km NE of Aguanga, CA",0.1,19,",ci,",reviewed,1537229545350,"M 1.1 - 9km NE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin,phase-data,scitech-link,",-480.0,1537230211640,https://earthquake.usgs.gov/earthquakes/eventpage/ci38063959
,,38063935,https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci38063935&format=geojson,0.01698,,39.0,",ci38063935,",0.66,ml,,ci,24.0,"9km NE of Aguanga, CA",0.1,7,",ci,",reviewed,1537228864470,"M 0.7 - 9km NE of Aguanga, CA",0,earthquake,",focal-mechanism,geoserve,nearby-cities,origin,phase-data,scitech-link,",-480.0,1537305830770,https://earthquake.usgs.gov/earthquakes/eventpage/ci38063935


In [50]:
# Count the comma-separated fields on the first line (the header), then stop.
# awk splits on every comma, so this is only reliable for lines without quoted
# commas, which holds for the header row.
!awk -F',' '{print NF; exit}' ../data/earthquakes.csv

26


In [51]:
# Same column count from Python: capture the header line, then split on commas
# (safe here because the header contains no quoted fields).
headers = !head -n 1 ../data/earthquakes.csv
len(headers[0].split(','))

26

In [52]:
# Load the earthquakes file from the local data directory.
# NOTE: this rebinds `df`, which previously held the five-row example frame.
df = pd.read_csv('../data/earthquakes.csv')

In [53]:
# read_csv also accepts a URL in place of a local path (network access required).
# NOTE(review): the URL tracks the `master` branch, so the file it returns may
# change over time.
df_url = pd.read_csv(
    "https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition/master/ch_02/data/earthquakes.csv"
)


In [54]:
# Write the frame to a CSV file in the current working directory;
# index=False leaves the row labels out of the file.
df.to_csv('output.csv', index=False)

## From a database