In [6]:
import pandas as pd
import datetime

# Data Exploration

In [32]:
def csv_to_df(file):
    """Load a CSV file into a DataFrame, parsing the ``time`` column as datetimes.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Every column of the CSV, with ``time`` converted to a datetime dtype.
    """
    datetime_columns = ["time"]
    return pd.read_csv(file, parse_dates=datetime_columns)

In [3]:
# One frame per monthly CSV, loaded with every column for exploration.
jan, feb, mar = (
    csv_to_df(path)
    for path in (
        "datasets/january.csv",
        "datasets/february.csv",
        "datasets/march.csv",
    )
)

In [4]:
# Total number of rows across the three monthly frames.
sum(len(frame) for frame in (jan, feb, mar))

35919

In [6]:
# Stack the three monthly frames so the exploration cells below have a single
# frame to inspect. Without this, `earthquakes` is undefined at this point on a
# fresh kernel (Restart & Run All would raise NameError here).
earthquakes = pd.concat([jan, feb, mar])
earthquakes.shape

(35919, 22)

In [15]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35919 entries, 0 to 12312
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   time             35919 non-null  datetime64[ns, UTC]
 1   latitude         35919 non-null  float64            
 2   longitude        35919 non-null  float64            
 3   depth            35919 non-null  float64            
 4   mag              35916 non-null  float64            
 5   magType          35916 non-null  object             
 6   nst              21283 non-null  float64            
 7   gap              26711 non-null  float64            
 8   dmin             22113 non-null  float64            
 9   rms              35919 non-null  float64            
 10  net              35919 non-null  object             
 11  id               35919 non-null  object             
 12  updated          35919 non-null  object             
 13  place           

In [35]:
# Preview the frame without the columns that are not needed.
# The result is only displayed, not assigned, so `earthquakes` keeps all columns.
earthquakes.drop(
    columns=[
        "nst",
        "gap",
        "dmin",
        "rms",
        "updated",
        "horizontalError",
        "depthError",
        "magError",
        "magNst",
    ]
).head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,net,id,place,type,status,locationSource,magSource
0,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,14.67,0.99,ml,ok,ok2022ccvb,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,ok
1,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,107.6,1.4,ml,ak,ak0221e05fgq,Central Alaska,earthquake,reviewed,ak,ak
2,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,8.8,1.1,ml,ak,ak0221e051u3,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,ak
3,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,39.6,1.8,ml,ak,ak0221e050xt,"35 km S of Adak, Alaska",earthquake,reviewed,ak,ak
4,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,9.9,1.1,ml,ak,ak0221e04enn,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,ak


In [28]:
earthquakes["net"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [31]:
earthquakes["locationSource"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [25]:
earthquakes["depth"].mean()

26.288167243528935

In [23]:
earthquakes["status"].unique()

array(['reviewed', 'automatic'], dtype=object)

In [24]:
earthquakes["type"].unique()

array(['earthquake', 'ice quake', 'explosion', 'mining explosion',
       'quarry blast', 'volcanic eruption', 'other event'], dtype=object)

# Data Loading

In [7]:
# Only the columns needed for visualization are loaded.
def csv_to_df(file):
    """Load the visualization columns of a CSV file, indexed by ``id``.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``id`` that holds the remaining selected columns,
        with ``time`` parsed as datetimes.
    """
    wanted_columns = [
        "id",
        "time",
        "latitude",
        "longitude",
        "mag",
        "magType",
        "place",
        "type",
        "status",
        "locationSource",
    ]
    return pd.read_csv(
        file,
        usecols=wanted_columns,
        index_col="id",
        parse_dates=["time"],
    )

In [8]:
# One frame per monthly CSV, in calendar order.
jan, feb, mar = map(
    csv_to_df,
    (
        "datasets/january.csv",
        "datasets/february.csv",
        "datasets/march.csv",
    ),
)

In [9]:
# Total number of rows across the three monthly frames.
len(jan) + len(feb) + len(mar)

35919

In [10]:
# Stack the three monthly frames (January, February, March) into a single frame.
earthquakes = pd.concat([jan, feb, mar])

In [11]:
earthquakes.shape

(35919, 9)

In [12]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  object             
 5   place           35784 non-null  object             
 6   type            35919 non-null  object             
 7   status          35919 non-null  object             
 8   locationSource  35919 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(5)
memory usage: 2.7+ MB


# Data Wrangling

In [13]:
earthquakes.nunique()

time              35914
latitude          28270
longitude         29491
mag                 635
magType              10
place             16335
type                  7
status                2
locationSource       15
dtype: int64

In [14]:
earthquakes["locationSource"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [15]:
earthquakes["status"].unique()

array(['reviewed', 'automatic'], dtype=object)

In [16]:
earthquakes["type"].unique()

array(['earthquake', 'ice quake', 'explosion', 'mining explosion',
       'quarry blast', 'volcanic eruption', 'other event'], dtype=object)

In [17]:
earthquakes["magType"].unique()

array(['ml', 'md', 'mb', 'mw', 'mww', 'mb_lg', 'mwr', 'mh', nan, 'ms_20',
       'mwb'], dtype=object)

## Converting values to appropriate data types

In [18]:
# Each of these columns has only a handful of distinct values, so store them
# as pandas categoricals.
for column in ("status", "locationSource", "type", "magType"):
    earthquakes[column] = earthquakes[column].astype("category")

## Checking for missing values

In [19]:
earthquakes.shape

(35919, 9)

In [20]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  category           
 5   place           35784 non-null  object             
 6   type            35919 non-null  category           
 7   status          35919 non-null  category           
 8   locationSource  35919 non-null  category           
dtypes: category(4), datetime64[ns, UTC](1), float64(3), object(1)
memory usage: 1.8+ MB


In [150]:
# Show the first record whose `place` is missing.
earthquakes.loc[earthquakes["place"].isna()].iloc[0]

time              2022-01-30 11:14:16.340000+00:00
latitude                                   61.3634
longitude                                 -149.574
mag                                            1.2
magType                                         ml
place                                          NaN
type                                    earthquake
status                                    reviewed
locationSource                                  ak
Name: ak0221dsr7ph, dtype: object

In [102]:
# Show every record whose `mag` is missing.
earthquakes.loc[earthquakes["mag"].isna()]

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
nc71127084,2022-01-18 14:50:55.480000+00:00,35.975333,-120.371,,,"10km NNE of Parkfield, CA",earthquake,reviewed,nc
nc73674341,2022-01-06 06:11:48.790000+00:00,38.7985,-122.7015,,,"3km NNW of Anderson Springs, CA",earthquake,reviewed,nc
pr71336423,2022-02-21 21:01:10.020000+00:00,17.9565,-66.928167,,,"2 km SW of Guánica, Puerto Rico",earthquake,reviewed,pr


## Dropping earthquakes with missing magnitudes (no replacement values could be found)

In [23]:
# Keep only the rows that have a magnitude; rows where `mag` is null are removed.
# `earthquakes` is rebound to the filtered frame.
earthquakes = earthquakes.dropna(subset=["mag"])

In [106]:
earthquakes.shape

(35916, 9)

## Extracting month from the `time` attribute

In [21]:
# Derive the month name (e.g. "January") from each event's timestamp.
# `.assign` builds a new frame rather than writing a column into the frame
# returned by the earlier `dropna`, which avoids pandas' SettingWithCopyWarning.
earthquakes = earthquakes.assign(month=earthquakes["time"].dt.month_name())

In [22]:
earthquakes["month"].value_counts()

January     12423
March       12313
February    11183
Name: month, dtype: int64

## Extracting state from the `place` attribute

In [134]:
# Take the text after the LAST comma of `place` as the state/region, e.g.
# "7 km WSW of Alex, Oklahoma" -> "Oklahoma".
# - `rsplit` (instead of `split`) keeps only the final component when a place
#   contains more than one comma.
# - `strip` removes the space that follows the comma, so values compare and
#   group cleanly ("Oklahoma", not " Oklahoma").
# - A place without a comma (e.g. "Central Alaska") is returned unchanged, and
#   a missing place stays NaN.
states = (
    earthquakes["place"]
    .str.rsplit(pat=",", n=1)
    .str.get(-1)
    .str.strip()
)

In [135]:
states

id
ok2022ccvb            Oklahoma
ak0221e05fgq    Central Alaska
ak0221e051u3            Alaska
ak0221e050xt            Alaska
ak0221e04enn            Alaska
                     ...      
ak0222r8d3ze            Alaska
nc73699171                  CA
ak0222r8acmt            Alaska
uu60483552                Utah
uw61819426              Canada
Name: place, Length: 35916, dtype: object

In [138]:
states.info()

<class 'pandas.core.series.Series'>
Index: 35916 entries, ok2022ccvb to uw61819426
Series name: place
Non-Null Count  Dtype 
--------------  ----- 
35781 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB
