# Chapter 2

## Adding and removing data

In [57]:
import pandas as pd
import numpy as np

In [25]:
# Load only the columns this section works with from the earthquake CSV.
df = pd.read_csv(
    '../data/earthquakes.csv',
    usecols=[
        'time',
        'title',
        'place',
        'magType',
        'mag',
        'alert',
        'tsunami',
    ],
)

In [3]:
# Preview the first five rows to confirm the selected columns loaded.
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


### Creating new data

In [26]:
# Assigning a scalar to a new column name broadcasts that value to every row.
df['source'] = 'USGS API'
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API


In [27]:
# Flag each earthquake whose magnitude is below zero with a Boolean column.
df['mag_negative'] = df['mag'].lt(0)
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API,False
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API,False
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API,False


In [28]:
# Capture whatever follows the first ", " in each place string and list the
# distinct values in sorted order to see how consistently locations are named.
(
    df['place']
    .str.extract(r', (.*$)')[0]
    .sort_values()
    .unique()
)

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Australia', 'Azerbaijan', 'B.C., MX', 'Barbuda', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba ', 'British Virgin Islands',
       'Burma', 'CA', 'California', 'Canada', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Ecuador region',
       'El Salvador', 'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala',
       'Haiti', 'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Italy', 'Jamaica', 'Japan', 'Kansas',
       'Kentucky', 'Kyrgyzstan', 'Martinique', 'Mauritius', 'Mayotte',
       'Mexico', 'Missouri', 'Montana', 'NV', 'Nevada', 'New Caledonia',
       'New Hampshire', 'New Mexico', 'New Zealand', 'Nicaragua',
       'North Carolina', 'Northern Mariana Islands', 'Oklahoma', 'Oregon',
       'Pakistan', 'Papua New Guinea', 'Peru', 'Philippines',
       'Puerto Rico', 'Roman

In [29]:
# Look at the raw, distinct place strings to spot patterns worth cleaning.
df['place'].unique().tolist()

['9km NE of Aguanga, CA',
 '8km NE of Aguanga, CA',
 '10km NW of Avenal, CA',
 '55km ESE of Punta Cana, Dominican Republic',
 '105km W of Talkeetna, Alaska',
 '10km NW of Parkfield, CA',
 '6km NW of The Geysers, CA',
 '219km SSE of Saparua, Indonesia',
 '10km NE of Aguanga, CA',
 '53km SE of Punta Cana, Dominican Republic',
 '120km SSW of Banda Aceh, Indonesia',
 '14km NW of Parkfield, CA',
 '156km WNW of Haines Junction, Canada',
 '2km N of The Geysers, CA',
 '2km S of Maricao, Puerto Rico',
 '57km SSW of Kaktovik, Alaska',
 '4km ENE of Calimesa, CA',
 '61km N of Tierras Nuevas Poniente, Puerto Rico',
 '64km SSW of Salcha, Alaska',
 '4km NW of San Clemente, CA',
 '27km E of Coso Junction, CA',
 '6km SSE of Idyllwild, CA',
 '111km NNW of Arctic Village, Alaska',
 '43km S of Redoubt Volcano, Alaska',
 '86km SW of Anchor Point, Alaska',
 '15km NW of Parkfield, CA',
 '2km WNW of Manhattan, Montana',
 '79km SW of Kaktovik, Alaska',
 '13km E of Hawthorne, Nevada',
 '45km S of Kaktovik, Alas

In [37]:
# Reduce each free-text place string to a single location name.
# Every replace states `regex=` explicitly: the default for Series.str.replace
# differs between pandas versions, so literal substitutions are pinned to
# regex=False instead of relying on it.
df['parsed_place'] = (
    df.place
    # Drop everything up to and including the last " of ".
    .str.replace(r'.* of ', '', regex=True)
    # Expand abbreviations that appear at the very end of the string.
    .str.replace(r'CA$', 'California', regex=True)
    .str.replace(r'NV$', 'Nevada', regex=True)
    .str.replace(r'MX$', 'Mexico', regex=True)
    # Remove a trailing " region" qualifier.
    .str.replace(r' region$', '', regex=True)
    # Plain-text substitutions; no pattern matching is intended here.
    .str.replace('northern ', '', regex=False)
    .str.replace('Fiji Islands', 'Fiji', regex=False)
    # Keep only what follows the last ", ".
    .str.replace(r'^.*, ', '', regex=True)
    # Trim any leftover leading/trailing whitespace.
    .str.strip()
)

In [38]:
# Display the newly derived column to check the parsing result.
df['parsed_place']

0        California
1        California
2        California
3        California
4        California
           ...     
9327     California
9328     California
9329    Puerto Rico
9330     California
9331     California
Name: parsed_place, Length: 9332, dtype: object

In [39]:
# Sanity check: list any rows whose parsed_place still contains "northern".
df.query('parsed_place.str.contains("northern")')

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place


In [40]:
# Sorted list of the distinct parsed locations, for eyeballing the cleanup.
(
    df['parsed_place']
    .sort_values()
    .unique()
    .tolist()
)

['Afghanistan',
 'Alaska',
 'Argentina',
 'Arizona',
 'Arkansas',
 'Ascension Island',
 'Australia',
 'Azerbaijan',
 'Balleny Islands',
 'Barbuda',
 'Bolivia',
 'British Virgin Islands',
 'Burma',
 'California',
 'Canada',
 'Carlsberg Ridge',
 'Central East Pacific Rise',
 'Central Mid-Atlantic Ridge',
 'Chile',
 'China',
 'Christmas Island',
 'Colombia',
 'Colorado',
 'Costa Rica',
 'Dominican Republic',
 'East Timor',
 'Ecuador',
 'El Salvador',
 'Fiji',
 'Greece',
 'Greenland',
 'Guam',
 'Guatemala',
 'Haiti',
 'Hawaii',
 'Honduras',
 'Idaho',
 'Illinois',
 'India',
 'Indian Ocean Triple Junction',
 'Indonesia',
 'Iran',
 'Iraq',
 'Italy',
 'Jamaica',
 'Japan',
 'Kansas',
 'Kentucky',
 'Kuril Islands',
 'Kyrgyzstan',
 'Martinique',
 'Mauritius',
 'Mayotte',
 'Mexico',
 'Mid-Indian Ridge',
 'Missouri',
 'Montana',
 'Nevada',
 'New Caledonia',
 'New Hampshire',
 'New Mexico',
 'New Zealand',
 'Nicaragua',
 'North Carolina',
 'Northern East Pacific Rise',
 'Northern Mariana Islands',
 

In [42]:
# Add Boolean location flags to a copy of the frame and show a reproducible
# random sample of it; df itself is not modified by assign().
(
    df
    .assign(
        in_ca=df['parsed_place'].str.endswith('California'),
        in_alaska=df['parsed_place'].str.endswith('Alaska'),
        # The lambda receives the frame with in_ca / in_alaska already added.
        neither=lambda frame: ~(frame.in_ca | frame.in_alaska),
    )
    .sample(5, random_state=0)
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,in_ca,in_alaska,neither
7207,,4.8,mwr,"73km SSW of Masachapa, Nicaragua",1537749595210,"M 4.8 - 73km SSW of Masachapa, Nicaragua",0,USGS API,False,Nicaragua,False,False,True
4755,,1.09,ml,"28km NNW of Packwood, Washington",1538227540460,"M 1.1 - 28km NNW of Packwood, Washington",0,USGS API,False,Washington,False,False,True
4595,,1.8,ml,"77km SSW of Kaktovik, Alaska",1538259609862,"M 1.8 - 77km SSW of Kaktovik, Alaska",0,USGS API,False,Alaska,False,True,False
3566,,1.5,ml,"102km NW of Arctic Village, Alaska",1538464751822,"M 1.5 - 102km NW of Arctic Village, Alaska",0,USGS API,False,Alaska,False,True,False
2182,,0.9,ml,"26km ENE of Pine Valley, CA",1538801713880,"M 0.9 - 26km ENE of Pine Valley, CA",0,USGS API,False,California,True,False,False


In [43]:
# Split the earthquakes into two frames based on the tsunami indicator column.
tsunami = df[df['tsunami'].eq(1)]
no_tsunami = df[df['tsunami'].eq(0)]

In [44]:
# (rows, columns) of each subset, shown side by side.
tsunami.shape, no_tsunami.shape

((61, 10), (9271, 10))

In [45]:
# Stack the two subsets row-wise and check the shape of the combined frame.
pd.concat([tsunami, no_tsunami]).shape

(9332, 10)

In [46]:
tsunami.append(no_tsunami).shape
# Note: the frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

  tsunami.append(no_tsunami).shape


(9332, 10)

In [53]:
# Read three further columns from the same CSV into a separate frame.
additional_columns = pd.read_csv(
    '../data/earthquakes.csv',
    usecols=[
        'tz',
        'felt',
        'ids',
    ],
)

In [58]:
# Fraction of positions where the two indexes agree; 1.0 means they match everywhere.
(additional_columns.index == df.index).mean()

1.0

In [48]:
# Place the extra columns next to the existing ones; rows are paired up by
# index label.
pd.concat(
    [df.head(2), additional_columns.head(2)],
    axis='columns',
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,felt,ids,tz
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California,,",ci37389218,",-480.0
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California,,",ci37389202,",-480.0


In [50]:
# Re-read the extra columns, this time using the time column as the index.
additional_columns = pd.read_csv(
    '../data/earthquakes.csv',
    usecols=[
        'tz',
        'felt',
        'ids',
        'time',
    ],
    index_col='time',
)

In [51]:
# Same column-wise concat, but additional_columns is now indexed by time, so
# its labels do not match df's index; each row appears separately and the
# cells without a counterpart are left empty.
pd.concat(
    [df.head(2), additional_columns.head(2)],
    axis='columns',
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place,felt,ids,tz
0,,1.35,ml,"9km NE of Aguanga, CA",1539475000000.0,"M 1.4 - 9km NE of Aguanga, CA",0.0,USGS API,False,California,,,
1,,1.29,ml,"9km NE of Aguanga, CA",1539475000000.0,"M 1.3 - 9km NE of Aguanga, CA",0.0,USGS API,False,California,,,
1539475168010,,,,,,,,,,,,",ci37389218,",-480.0
1539475129610,,,,,,,,,,,,",ci37389202,",-480.0


In [52]:
# Row-wise concat where only the second frame carries an extra `type` column.
# join='inner' keeps just the columns common to both frames, and
# ignore_index=True renumbers the combined rows from zero.
pd.concat(
    [tsunami.head(2), no_tsunami.head(2).assign(type='earthquake')],
    join='inner',
    ignore_index=True,
)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place
0,,5.0,mww,"165km NNW of Flying Fish Cove, Christmas Island",1539459504090,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...",1,USGS API,False,Christmas Island
1,green,6.7,mww,"262km NW of Ozernovskiy, Russia",1539429023560,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,USGS API,False,Russia
2,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California
3,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California


In [59]:
# Check the current state of df before removing anything from it.
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source,mag_negative,parsed_place
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API,False,California
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API,False,California
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API,False,California
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API,False,California
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API,False,California


### Deleting unwanted data

In [60]:
# `del` removes the column from df in place; list the columns that remain.
del df['source']
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'mag_negative', 'parsed_place'],
      dtype='object')

In [66]:
# Deleting a column that is no longer present raises KeyError, so guard the
# statement and report the situation instead of letting the cell fail.
try:
    del df['source']
except KeyError:
    print("column does not exist")

column does not exist


In [67]:
# pop() removes the column from df and hands it back, so the values can be
# kept in their own variable for later use.
mag_negative = df.pop('mag_negative')
df.columns

Index(['alert', 'mag', 'magType', 'place', 'time', 'title', 'tsunami',
       'parsed_place'],
      dtype='object')

In [68]:
# Tally how many earthquakes fall on each side of the negative-magnitude flag.
mag_negative.value_counts()

False    8841
True      491
Name: mag_negative, dtype: int64

In [69]:
# Fraction of positions where the popped column's index agrees with df's
# index; 1.0 means the two are still aligned everywhere.
(mag_negative.index == df.index).mean()

1.0

In [70]:
# Use the popped Boolean column as a row mask on the frame it came from.
df.loc[mag_negative].head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parsed_place
39,,-0.1,ml,"6km NW of Lemmon Valley, Nevada",1539458844506,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,Nevada
49,,-0.1,ml,"6km NW of Lemmon Valley, Nevada",1539455017464,"M -0.1 - 6km NW of Lemmon Valley, Nevada",0,Nevada
135,,-0.4,ml,"10km SSE of Beatty, Nevada",1539422175717,"M -0.4 - 10km SSE of Beatty, Nevada",0,Nevada
161,,-0.02,md,"20km SSE of Ronan, Montana",1539412475360,"M -0.0 - 20km SSE of Ronan, Montana",0,Montana
198,,-0.2,ml,"60km N of Pahrump, Nevada",1539398340822,"M -0.2 - 60km N of Pahrump, Nevada",0,Nevada


In [71]:
# Drop the rows labelled 0 and 1; the result is a new frame, so its first
# rows now start at label 2.
df.drop(index=[0, 1]).head(2)

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parsed_place
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,California
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,California


In [72]:
# Every column that is not in the keep-list gets dropped; build that list
# from a Boolean mask over the column index (original column order is kept).
cols_to_drop = df.columns[
    ~df.columns.isin(['alert', 'mag', 'title', 'time', 'tsunami'])
].tolist()
df.drop(columns=cols_to_drop).head()

Unnamed: 0,alert,mag,time,title,tsunami
0,,1.35,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


In [74]:
# Two spellings of the same column drop: the `columns=` keyword versus
# positional labels with axis=1. Confirm they produce equal frames.
dropped_by_keyword = df.drop(columns=cols_to_drop)
dropped_by_axis = df.drop(cols_to_drop, axis=1)
dropped_by_keyword.equals(dropped_by_axis)

True

## Summary