In [1]:
import pandas as pd
import datetime

# Data Exploration

In [2]:
def csv_to_df(file):
    """Read an earthquake CSV export into a DataFrame.

    The ``time`` column is parsed as datetimes; every other column keeps
    whatever dtype pandas infers.
    """
    read_options = {"parse_dates": ["time"]}
    return pd.read_csv(file, **read_options)

In [3]:
jan = csv_to_df("datasets/january.csv")
feb = csv_to_df("datasets/february.csv")
mar = csv_to_df("datasets/march.csv")

In [4]:
sum(size for (size, _) in (jan.shape, feb.shape, mar.shape))

35919

In [5]:
earthquakes = pd.concat([jan, feb, mar])

In [6]:
earthquakes.shape

(35919, 22)

In [7]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35919 entries, 0 to 12312
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   time             35919 non-null  datetime64[ns, UTC]
 1   latitude         35919 non-null  float64            
 2   longitude        35919 non-null  float64            
 3   depth            35919 non-null  float64            
 4   mag              35916 non-null  float64            
 5   magType          35916 non-null  object             
 6   nst              21283 non-null  float64            
 7   gap              26711 non-null  float64            
 8   dmin             22113 non-null  float64            
 9   rms              35919 non-null  float64            
 10  net              35919 non-null  object             
 11  id               35919 non-null  object             
 12  updated          35919 non-null  object             
 13  place           

In [8]:
# Preview the frame without the columns that are not needed later.
# The result is not assigned, so `earthquakes` itself keeps every column.
unneeded_columns = [
    "nst", "gap", "dmin", "rms", "updated",
    "horizontalError", "depthError", "magError", "magNst",
]
earthquakes.drop(columns=unneeded_columns).head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,net,id,place,type,status,locationSource,magSource
0,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,14.67,0.99,ml,ok,ok2022ccvb,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,ok
1,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,107.6,1.4,ml,ak,ak0221e05fgq,Central Alaska,earthquake,reviewed,ak,ak
2,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,8.8,1.1,ml,ak,ak0221e051u3,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,ak
3,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,39.6,1.8,ml,ak,ak0221e050xt,"35 km S of Adak, Alaska",earthquake,reviewed,ak,ak
4,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,9.9,1.1,ml,ak,ak0221e04enn,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,ak


In [9]:
earthquakes["net"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [10]:
earthquakes["locationSource"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [11]:
earthquakes["depth"].mean()

26.288167243528935

In [12]:
earthquakes["status"].unique()

array(['reviewed', 'automatic'], dtype=object)

In [13]:
earthquakes["type"].unique()

array(['earthquake', 'ice quake', 'explosion', 'mining explosion',
       'quarry blast', 'volcanic eruption', 'other event'], dtype=object)

# Data Loading

In [14]:
# Loading only columns necessary for visualization.
def csv_to_df(file):
    """Load one monthly earthquake CSV, keeping only the plotting columns.

    ``time`` is parsed as datetimes and ``id`` becomes the index.
    """
    wanted_columns = [
        "id", "time", "latitude", "longitude", "mag",
        "magType", "place", "type", "status", "locationSource",
    ]
    frame = pd.read_csv(
        file,
        usecols=wanted_columns,
        parse_dates=["time"],
        index_col="id",
    )
    return frame

In [15]:
jan = csv_to_df("datasets/january.csv")
feb = csv_to_df("datasets/february.csv")
mar = csv_to_df("datasets/march.csv")

In [16]:
sum(size for (size, _) in (jan.shape, feb.shape, mar.shape))

35919

In [17]:
earthquakes = pd.concat([jan, feb, mar])

In [18]:
earthquakes.shape

(35919, 9)

In [19]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  object             
 5   place           35784 non-null  object             
 6   type            35919 non-null  object             
 7   status          35919 non-null  object             
 8   locationSource  35919 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(5)
memory usage: 2.7+ MB


# Data Wrangling

In [20]:
earthquakes.nunique()

time              35914
latitude          28270
longitude         29491
mag                 635
magType              10
place             16335
type                  7
status                2
locationSource       15
dtype: int64

In [21]:
earthquakes["locationSource"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [22]:
earthquakes["status"].unique()

array(['reviewed', 'automatic'], dtype=object)

In [23]:
earthquakes["type"].unique()

array(['earthquake', 'ice quake', 'explosion', 'mining explosion',
       'quarry blast', 'volcanic eruption', 'other event'], dtype=object)

In [24]:
earthquakes["magType"].unique()

array(['ml', 'md', 'mb', 'mw', 'mww', 'mb_lg', 'mwr', 'mh', nan, 'ms_20',
       'mwb'], dtype=object)

## Converting values to appropriate data types

In [25]:
# These string columns have only a handful of distinct values each,
# so store them as pandas categoricals.
for column in ("status", "locationSource", "type", "magType"):
    earthquakes[column] = earthquakes[column].astype("category")

## Checking for missing values

In [26]:
earthquakes.shape

(35919, 9)

In [27]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  category           
 5   place           35784 non-null  object             
 6   type            35919 non-null  category           
 7   status          35919 non-null  category           
 8   locationSource  35919 non-null  category           
dtypes: category(4), datetime64[ns, UTC](1), float64(3), object(1)
memory usage: 1.8+ MB


In [28]:
earthquakes[earthquakes["place"].isnull()].iloc[0]

time              2022-01-30 11:14:16.340000+00:00
latitude                                   61.3634
longitude                                 -149.574
mag                                            1.2
magType                                         ml
place                                          NaN
type                                    earthquake
status                                    reviewed
locationSource                                  ak
Name: ak0221dsr7ph, dtype: object

In [29]:
earthquakes[earthquakes["mag"].isnull()]

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
nc71127084,2022-01-18 14:50:55.480000+00:00,35.975333,-120.371,,,"10km NNE of Parkfield, CA",earthquake,reviewed,nc
nc73674341,2022-01-06 06:11:48.790000+00:00,38.7985,-122.7015,,,"3km NNW of Anderson Springs, CA",earthquake,reviewed,nc
pr71336423,2022-02-21 21:01:10.020000+00:00,17.9565,-66.928167,,,"2 km SW of Guánica, Puerto Rico",earthquake,reviewed,pr


## Dropping earthquakes with missing magnitudes (no replacement values could be found)

In [30]:
earthquakes = earthquakes.dropna(subset=["mag"])

In [31]:
earthquakes.shape

(35916, 9)

## Extracting month from the `time` attribute

In [32]:
# Derive the month name from the event timestamp.
# `assign` returns a new frame instead of setting a column on the result of the
# preceding `dropna`, a pattern that can trigger pandas' SettingWithCopyWarning.
earthquakes = earthquakes.assign(month=earthquakes["time"].dt.month_name())

In [33]:
earthquakes["month"].value_counts()

January     12421
March       12313
February    11182
Name: month, dtype: int64

## Extracting state from the `place` attribute (falling back to `latitude`/`longitude` reverse geocoding)

In [34]:
# Take the text after the LAST comma of `place` as the state/region.
# Splitting from the right stops places with more than one comma
# (e.g. "Rat Islands, Aleutian Islands, Alaska") from yielding
# "Aleutian Islands, Alaska", and `strip` removes the space after the comma.
# Places without a comma (e.g. "Central Alaska") are returned unchanged.
states = (
    earthquakes["place"]
    .str.rsplit(pat=",", n=1)
    .str.get(-1)
    .str.strip()
)

In [35]:
states

id
ok2022ccvb            Oklahoma
ak0221e05fgq    Central Alaska
ak0221e051u3            Alaska
ak0221e050xt            Alaska
ak0221e04enn            Alaska
                     ...      
ak0222r8d3ze            Alaska
nc73699171                  CA
ak0222r8acmt            Alaska
uu60483552                Utah
uw61819426              Canada
Name: place, Length: 35916, dtype: object

In [36]:
states.info()

<class 'pandas.core.series.Series'>
Index: 35916 entries, ok2022ccvb to uw61819426
Series name: place
Non-Null Count  Dtype 
--------------  ----- 
35781 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


In [44]:
temp = earthquakes[earthquakes["place"].isnull()]

In [45]:
temp

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ak0221dsr7ph,2022-01-30 11:14:16.340000+00:00,61.363400,-149.574000,1.20,ml,,earthquake,reviewed,ak,January
tx2022byde,2022-01-28 10:23:37.233000+00:00,31.707534,-103.995351,2.50,ml,,earthquake,reviewed,tx,January
ak0221afcyv4,2022-01-28 07:24:33.861000+00:00,53.216700,-166.734500,1.70,ml,,earthquake,reviewed,ak,January
ak022191gw7r,2022-01-27 23:44:58.569000+00:00,60.281600,-152.497100,1.90,ml,,earthquake,reviewed,ak,January
ak02218o8kpq,2022-01-27 01:22:26.695000+00:00,61.327600,-150.872700,1.60,ml,,earthquake,reviewed,ak,January
...,...,...,...,...,...,...,...,...,...,...
pr71338438,2022-03-02 03:49:23.630000+00:00,18.042667,-66.894167,1.51,md,,earthquake,reviewed,pr,March
us6000h23r,2022-03-01 04:52:18.615000+00:00,-7.162000,156.382800,4.30,mb,,earthquake,reviewed,us,March
ak0222rabwus,2022-03-01 03:50:34.211000+00:00,60.655000,-152.423100,1.10,ml,,earthquake,reviewed,ak,March
us6000h23y,2022-03-01 03:37:44.437000+00:00,-2.067800,127.825500,4.40,mb,,earthquake,reviewed,us,March


In [46]:
import geocoder

In [47]:
def get_state(df):
    """Reverse-geocode one event and return its ``ISO3166-2-lvl4`` address value.

    Parameters
    ----------
    df : mapping-like row
        A single row (despite the name) exposing ``"latitude"`` and
        ``"longitude"``, such as the Series handed over by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str or None
        The ``ISO3166-2-lvl4`` entry of the OSM address, or ``None`` when the
        lookup fails, returns no feature, or the address has no such entry.
    """
    g = geocoder.osm([df["latitude"], df["longitude"]], method="reverse")
    if not g.ok:
        # Return None rather than "" so that a failed lookup is reported as
        # missing by `isnull()`, exactly like an address without the key.
        return None
    try:
        address = g.geojson["features"][0]["properties"]["raw"]["address"]
    except (KeyError, IndexError):
        # No feature / no address in the response: treat as "not found".
        return None
    return address.get("ISO3166-2-lvl4")

In [52]:
# Reverse-geocode every row of `temp` (the rows with a missing `place`).
# `get_state` performs one `geocoder.osm` lookup per row, so the cost of this
# cell grows with len(temp).
res = temp[["latitude", "longitude"]].apply(get_state, axis=1)

In [59]:
res.isnull().value_counts()

False    132
True       3
dtype: int64

In [1]:
# Reverse geocoding recovers most of the missing values, so it can be applied to the original dataframe.

In [2]:
import pandas as pd

In [85]:
def csv_to_df(file):
    """Load one enriched monthly earthquake CSV.

    Only the plotting columns plus ``state`` are read; ``time`` is parsed as
    datetimes and ``id`` becomes the index.
    """
    wanted_columns = [
        "id", "time", "latitude", "longitude", "mag", "magType",
        "place", "type", "status", "locationSource", "state",
    ]
    frame = pd.read_csv(
        file,
        usecols=wanted_columns,
        parse_dates=["time"],
        index_col="id",
    )
    return frame

In [86]:
jan = csv_to_df("datasets/jan-enriched.csv")
feb = csv_to_df("datasets/feb-enriched.csv")
mar = csv_to_df("datasets/mar-enriched.csv")

In [87]:
sum(size for (size, _) in (jan.shape, feb.shape, mar.shape))

35919

In [88]:
earthquakes = pd.concat([jan, feb, mar])

In [89]:
earthquakes["state"].str.startswith("CA").value_counts()

False    31088
True       287
Name: state, dtype: int64

In [90]:
# Dropping Canadian provinces: keep only the rows whose `state` does not start
# with "CA" (values such as "US-CA" are unaffected because only the prefix is
# matched). `na=False` makes rows with a missing `state` count as non-matching,
# so they are kept by this filter.
earthquakes = earthquakes[~earthquakes["state"].str.startswith("CA", na=False)]

In [91]:
earthquakes.shape

(35632, 10)

In [92]:
earthquakes[earthquakes["state"].isnull()].shape

(4544, 10)

In [93]:
earthquakes.dropna(subset=["state"]).shape

(31088, 10)

In [94]:
35632 - 4544

31088

In [95]:
# Drop every row whose `state` is missing.
# NOTE(review): the original note called these "the islands stuff" - presumably
# events with no state-level match (e.g. offshore or island locations); confirm.
earthquakes = earthquakes.dropna(subset=["state"])

In [98]:
earthquakes.head()

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ok2022ccvb,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,0.99,ml,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,US-OK
ak0221e05fgq,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,1.4,ml,Central Alaska,earthquake,reviewed,ak,US-AK
ak0221e051u3,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK
ak0221e050xt,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,1.8,ml,"35 km S of Adak, Alaska",earthquake,reviewed,ak,US-AK
ak0221e04enn,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK


In [99]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31088 entries, ok2022ccvb to uu60483552
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            31088 non-null  datetime64[ns, UTC]
 1   latitude        31088 non-null  float64            
 2   longitude       31088 non-null  float64            
 3   mag             31085 non-null  float64            
 4   magType         31085 non-null  object             
 5   place           31003 non-null  object             
 6   type            31088 non-null  object             
 7   status          31088 non-null  object             
 8   locationSource  31088 non-null  object             
 9   state           31088 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(6)
memory usage: 2.6+ MB


In [100]:
earthquakes[earthquakes["mag"].isnull()]

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
nc71127084,2022-01-18 14:50:55.480000+00:00,35.975333,-120.371,,,"10km NNE of Parkfield, CA",earthquake,reviewed,nc,US-CA
nc73674341,2022-01-06 06:11:48.790000+00:00,38.7985,-122.7015,,,"3km NNW of Anderson Springs, CA",earthquake,reviewed,nc,US-CA
pr71336423,2022-02-21 21:01:10.020000+00:00,17.9565,-66.928167,,,"2 km SW of Guánica, Puerto Rico",earthquake,reviewed,pr,US-PR


In [101]:
earthquakes = earthquakes.dropna(subset=["mag"])

In [102]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31085 entries, ok2022ccvb to uu60483552
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            31085 non-null  datetime64[ns, UTC]
 1   latitude        31085 non-null  float64            
 2   longitude       31085 non-null  float64            
 3   mag             31085 non-null  float64            
 4   magType         31085 non-null  object             
 5   place           31000 non-null  object             
 6   type            31085 non-null  object             
 7   status          31085 non-null  object             
 8   locationSource  31085 non-null  object             
 9   state           31085 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(6)
memory usage: 2.6+ MB


In [105]:
# Add two derived columns:
#   state_code - the part of `state` after the last "-", stripped of whitespace
#   month      - the month name of the event timestamp
earthquakes = earthquakes.assign(
    state_code=lambda frame: frame["state"].str.split("-").str.get(-1).str.strip(),
    month=lambda frame: frame["time"].dt.month_name(),
)

In [106]:
earthquakes.head()

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,state,state_code,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ok2022ccvb,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,0.99,ml,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,US-OK,OK,January
ak0221e05fgq,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,1.4,ml,Central Alaska,earthquake,reviewed,ak,US-AK,AK,January
ak0221e051u3,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK,AK,January
ak0221e050xt,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,1.8,ml,"35 km S of Adak, Alaska",earthquake,reviewed,ak,US-AK,AK,January
ak0221e04enn,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK,AK,January


In [109]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31085 entries, ok2022ccvb to uu60483552
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            31085 non-null  datetime64[ns, UTC]
 1   latitude        31085 non-null  float64            
 2   longitude       31085 non-null  float64            
 3   mag             31085 non-null  float64            
 4   magType         31085 non-null  object             
 5   place           31000 non-null  object             
 6   type            31085 non-null  object             
 7   status          31085 non-null  object             
 8   locationSource  31085 non-null  object             
 9   state           31085 non-null  object             
 10  state_code      31085 non-null  object             
 11  month           31085 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(8)
memory usage: 3.1+ MB


In [111]:
earthquakes.nunique()

time              31080
latitude          23616
longitude         24759
mag                 600
magType               9
place             13593
type                  7
status                2
locationSource       15
state               341
state_code          272
month                 3
dtype: int64

In [114]:
earthquakes["state"].unique()

array(['US-OK', 'US-AK', 'US-CA', 'IR-18', 'ID-MA', 'US-HI', 'US-PR',
       'US-WA', 'US-MO', 'US-NV', 'ID-SN', 'US-UT', 'ID-NT', 'PG-WPD',
       'US-ID', 'US-TX', 'AR-A', 'US-WY', 'FJ-E', 'CL-AT', 'PG-ESW',
       'ID-SA', 'AF-BDS', 'VU-PAM', 'ID-AC', 'JP-26', 'DO-10', 'PG-EBR',
       'ID-MU', 'US-MT', 'US-TN', 'US-KS', 'JP-02', 'PE-UCA', 'TO-02',
       'PE-HUC', 'ID-PA', 'BO-P', 'HT-NI', 'MX-CHP', 'AR-K', 'PK-GB',
       'US-OH', 'CN-SC', 'CL-RM', 'ID-BE', 'US-OR', 'SB-TE', 'US-MN',
       'AF-TAK', 'CL-AP', 'RU-SAK', 'JP-23', 'MX-BCN', 'CL-AN', 'MX-JAL',
       'JP-04', 'CL-TA', 'US-SC', 'PG-MPL', 'PG-WBK', 'CN-XJ', 'US-CT',
       'JP-30', 'CR-A', 'US-CO', 'GL-KU', 'PG-NIK', 'AR-M', 'JP-08',
       'PE-ICA', 'CN-QH', 'PH-BTG', 'DO-14', 'CO-SAN', 'CD-TA', 'AR-J',
       'US-NM', 'HT-OU', 'TW-ILA', 'PG-MRL', 'PH-AKL', 'TO-03', 'CN-XZ',
       'TJ-RA', 'AU-WA', 'ID-NB', 'IN-UT', 'PE-ARE', 'US-MP', 'ID-JI',
       'MA-03', 'DO-11', 'IR-03', 'AF-BGL', 'PE-PUN', 'NZ-STL', 'MM-11',
  

In [112]:
# Converting values to appropriate data types.
# `state` is cast as well: the original cell ended with a bare
# `earthquakes["state"]` expression, which looks like an unfinished conversion
# (the plotting section casts `state` to category in the same way).
categorical_columns = ["magType", "type", "status", "locationSource", "state"]
earthquakes = earthquakes.astype(
    {column: "category" for column in categorical_columns}
)


# Selecting Data for Plotting

In [5]:
import pandas as pd

In [19]:
earthquakes = pd.read_csv(
    "datasets/us_earthquakes.csv",
    parse_dates=["time"],
)

In [20]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29357 entries, 0 to 29356
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              29357 non-null  object             
 1   time            29357 non-null  datetime64[ns, UTC]
 2   latitude        29357 non-null  float64            
 3   longitude       29357 non-null  float64            
 4   mag             29357 non-null  float64            
 5   magType         29357 non-null  object             
 6   place           29281 non-null  object             
 7   type            29357 non-null  object             
 8   status          29357 non-null  object             
 9   locationSource  29357 non-null  object             
 10  state           29357 non-null  object             
 11  state_code      29357 non-null  object             
 12  month           29357 non-null  object             
dtypes: datetime64[ns, UTC](1), floa

In [21]:
# Cast the string-valued descriptor columns to pandas categoricals in one call.
categorical_columns = [
    "magType", "type", "status", "locationSource",
    "state", "state_code", "month",
]
earthquakes = earthquakes.astype(dict.fromkeys(categorical_columns, "category"))

In [22]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29357 entries, 0 to 29356
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              29357 non-null  object             
 1   time            29357 non-null  datetime64[ns, UTC]
 2   latitude        29357 non-null  float64            
 3   longitude       29357 non-null  float64            
 4   mag             29357 non-null  float64            
 5   magType         29357 non-null  category           
 6   place           29281 non-null  object             
 7   type            29357 non-null  category           
 8   status          29357 non-null  category           
 9   locationSource  29357 non-null  category           
 10  state           29357 non-null  category           
 11  state_code      29357 non-null  category           
 12  month           29357 non-null  category           
dtypes: category(7), datetime64[ns, 

In [38]:
df = earthquakes.copy()

In [31]:
df.groupby(["month", "state_code"]).groups.keys()

dict_keys([('February', 'AK'), ('February', 'AR'), ('February', 'AZ'), ('February', 'CA'), ('February', 'CO'), ('February', 'HI'), ('February', 'ID'), ('February', 'KS'), ('February', 'KY'), ('February', 'ME'), ('February', 'MN'), ('February', 'MO'), ('February', 'MT'), ('February', 'NC'), ('February', 'NH'), ('February', 'NM'), ('February', 'NV'), ('February', 'NY'), ('February', 'OH'), ('February', 'OK'), ('February', 'OR'), ('February', 'PR'), ('February', 'SC'), ('February', 'TN'), ('February', 'TX'), ('February', 'UT'), ('February', 'VA'), ('February', 'WA'), ('February', 'WY'), ('January', 'AK'), ('January', 'AR'), ('January', 'AZ'), ('January', 'CA'), ('January', 'CO'), ('January', 'CT'), ('January', 'GA'), ('January', 'HI'), ('January', 'ID'), ('January', 'KS'), ('January', 'MA'), ('January', 'ME'), ('January', 'MN'), ('January', 'MO'), ('January', 'MP'), ('January', 'MT'), ('January', 'NH'), ('January', 'NM'), ('January', 'NV'), ('January', 'NY'), ('January', 'OH'), ('January'

In [39]:
# Count events per (month, state_code).
# * The grouping starts from `earthquakes` instead of re-binding `df` from
#   itself, so the cell can be re-run without failing on the missing `id` column.
# * Both keys are categoricals; `observed=False` is spelled out so that
#   month/state pairs with no events keep appearing with a count of 0 even if
#   the pandas default for `observed` changes.
df = (
    earthquakes
    .groupby(["month", "state_code"], observed=False)
    .agg(earthquakes=("id", "count"))
    .reset_index()
)

In [42]:
df.sample(10)

Unnamed: 0,month,state_code,earthquakes
2,February,AZ,8
12,February,MA,0
79,March,HI,635
6,February,GA,0
89,March,MT,146
81,March,IL,1
87,March,MO,14
61,January,OR,25
14,February,MN,3
30,February,TX,114


In [45]:
df[df["month"]=="January"]

Unnamed: 0,month,state_code,earthquakes
36,January,AK,3082
37,January,AR,7
38,January,AZ,3
39,January,CA,3757
40,January,CO,4
41,January,CT,1
42,January,GA,1
43,January,HI,1017
44,January,ID,73
45,January,IL,0


In [55]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29357 entries, 0 to 29356
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   id              29357 non-null  object             
 1   time            29357 non-null  datetime64[ns, UTC]
 2   latitude        29357 non-null  float64            
 3   longitude       29357 non-null  float64            
 4   mag             29357 non-null  float64            
 5   magType         29357 non-null  category           
 6   place           29281 non-null  object             
 7   type            29357 non-null  category           
 8   status          29357 non-null  category           
 9   locationSource  29357 non-null  category           
 10  state           29357 non-null  category           
 11  state_code      29357 non-null  category           
 12  month           29357 non-null  category           
dtypes: category(7), datetime64[ns, 

In [54]:
earthquakes["type"].value_counts(normalize=True)

earthquake          0.975440
quarry blast        0.011139
explosion           0.006029
ice quake           0.004701
mining explosion    0.002487
other event         0.000204
Name: type, dtype: float64

In [57]:
earthquakes["status"].value_counts()

reviewed     24972
automatic     4385
Name: status, dtype: int64

In [61]:
earthquakes.head()

Unnamed: 0,id,time,latitude,longitude,mag,magType,place,type,status,locationSource,state,state_code,month
0,ok2022ccvb,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,0.99,ml,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,US-OK,OK,January
1,ak0221e05fgq,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,1.4,ml,Central Alaska,earthquake,reviewed,ak,US-AK,AK,January
2,ak0221e051u3,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK,AK,January
3,ak0221e050xt,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,1.8,ml,"35 km S of Adak, Alaska",earthquake,reviewed,ak,US-AK,AK,January
4,ak0221e04enn,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK,AK,January


In [70]:
df_scatter = earthquakes[["id", "time", "latitude", "longitude", "mag", "place", "type", "state_code"]]

In [73]:
df_scatter = df_scatter.assign(
    date=df_scatter["time"].dt.date
)

In [75]:
df_scatter.sample(5)

Unnamed: 0,id,time,latitude,longitude,mag,place,type,state_code,date
9420,nc73673041,2022-01-03 08:35:36.490000+00:00,38.121333,-122.127833,1.06,"8km NNE of Benicia, CA",earthquake,CA,2022-01-03
10137,nn00834569,2022-02-27 19:17:28.135000+00:00,38.1673,-117.8879,1.1,"31 km SE of Mina, Nevada",earthquake,NV,2022-02-27
1545,ak0221779m9t,2022-01-26 12:25:16.495000+00:00,63.1063,-151.5243,0.9,"49 km SSE of Denali National Park, Alaska",earthquake,AK,2022-01-26
19194,ak0221h0bq2t,2022-02-01 05:25:55.137000+00:00,60.8269,-151.8195,1.1,"32 km WNW of Nikiski, Alaska",earthquake,AK,2022-02-01
25597,nc73704466,2022-03-12 00:59:16.700000+00:00,38.815166,-122.820503,0.5,"7km NW of The Geysers, CA",earthquake,CA,2022-03-12


In [79]:
df_scatter.sort_values(by=["mag"], ascending=False).head(10)

Unnamed: 0,id,time,latitude,longitude,mag,place,type,state_code,date
17355,ak0221pb6nv5,2022-02-06 07:22:33.179000+00:00,62.7471,-148.7049,5.2,"72 km S of Cantwell, Alaska",earthquake,AK,2022-02-06
6470,us7000gb6l,2022-01-12 01:48:03.831000+00:00,52.0751,178.0555,5.2,"Rat Islands, Aleutian Islands, Alaska",earthquake,AK,2022-01-12
28931,us6000h18d,2022-03-01 22:31:17.846000+00:00,52.346,174.0425,5.2,"80 km SE of Attu Station, Alaska",earthquake,AK,2022-03-01
3139,ak022ytdd55,2022-01-21 05:18:26.444000+00:00,60.3163,-152.3603,5.1,"48 km NW of Ninilchik, Alaska",earthquake,AK,2022-01-21
5931,us7000gbj9,2022-01-13 05:47:12.246000+00:00,51.2863,-179.1427,5.1,"Andreanof Islands, Aleutian Islands, Alaska",earthquake,AK,2022-01-13
1802,us7000gf5f,2022-01-26 01:02:07.228000+00:00,51.9957,178.3305,4.9,"Rat Islands, Aleutian Islands, Alaska",earthquake,AK,2022-01-26
25372,ak02239qtmiy,2022-03-12 19:59:48.600000+00:00,60.79,-152.0819,4.8,"44 km WNW of Nikiski, Alaska",earthquake,AK,2022-03-12
21833,ak0223rx1x1y,2022-03-23 18:45:12.911000+00:00,60.209,-153.0955,4.8,"67 km E of Port Alsworth, Alaska",earthquake,AK,2022-03-23
1717,us7000gf6q,2022-01-26 04:31:23.823000+00:00,51.9565,178.3387,4.8,"Rat Islands, Aleutian Islands, Alaska",earthquake,AK,2022-01-26
10676,us6000h0bg,2022-02-26 07:16:43.732000+00:00,51.8955,178.4066,4.8,"Rat Islands, Aleutian Islands, Alaska",earthquake,AK,2022-02-26


In [96]:
# Build a text label for every event.
# `place` is missing for some rows, and concatenating a string with NaN yields
# NaN for the whole label, so missing places are filled with a placeholder first.
df_scatter = df_scatter.assign(
    label=(
        "id: " + df_scatter["id"]
        + ", mag: " + df_scatter["mag"].astype(str)
        + ", " + df_scatter["place"].fillna("unknown location")
        + ", " + df_scatter["date"].astype(str)
    )
)

In [97]:
df_scatter

Unnamed: 0,id,time,latitude,longitude,mag,place,type,state_code,date,label
0,ok2022ccvb,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852000,0.99,"7 km WSW of Alex, Oklahoma",earthquake,OK,2022-01-30,"id: ok2022ccvb, mag: 0.99, 7 km WSW of Alex, O..."
1,ak0221e05fgq,2022-01-30 23:56:19.237000+00:00,62.910000,-151.177400,1.40,Central Alaska,earthquake,AK,2022-01-30,"id: ak0221e05fgq, mag: 1.4, Central Alaska, 20..."
2,ak0221e051u3,2022-01-30 23:54:44.477000+00:00,63.304800,-151.272100,1.10,"34 km SE of Denali National Park, Alaska",earthquake,AK,2022-01-30,"id: ak0221e051u3, mag: 1.1, 34 km SE of Denali..."
3,ak0221e050xt,2022-01-30 23:54:32.973000+00:00,51.557800,-176.673800,1.80,"35 km S of Adak, Alaska",earthquake,AK,2022-01-30,"id: ak0221e050xt, mag: 1.8, 35 km S of Adak, A..."
4,ak0221e04enn,2022-01-30 23:51:44.207000+00:00,63.291700,-151.321700,1.10,"34 km SE of Denali National Park, Alaska",earthquake,AK,2022-01-30,"id: ak0221e04enn, mag: 1.1, 34 km SE of Denali..."
...,...,...,...,...,...,...,...,...,...,...
29352,nc73699161,2022-03-01 00:20:28.230000+00:00,40.877500,-122.027667,1.57,"10km WNW of Montgomery Creek, CA",earthquake,CA,2022-03-01,"id: nc73699161, mag: 1.57, 10km WNW of Montgom..."
29353,ak0222r8d3ze,2022-03-01 00:20:03.430000+00:00,58.936900,-152.970700,3.00,"76 km SW of Nanwalek, Alaska",earthquake,AK,2022-03-01,"id: ak0222r8d3ze, mag: 3.0, 76 km SW of Nanwal..."
29354,nc73699171,2022-03-01 00:19:44.800000+00:00,37.470667,-121.616167,0.95,"22km ENE of Alum Rock, CA",earthquake,CA,2022-03-01,"id: nc73699171, mag: 0.95, 22km ENE of Alum Ro..."
29355,ak0222r8acmt,2022-03-01 00:07:11.888000+00:00,62.494100,-152.198500,1.10,"69 km NW of Skwentna, Alaska",earthquake,AK,2022-03-01,"id: ak0222r8acmt, mag: 1.1, 69 km NW of Skwent..."


In [98]:
df_chart = earthquakes.copy()

In [109]:
# Number of events per `type`, as a two-column frame (Type, Count).
# * `rename_axis` + `reset_index(name=...)` names the columns explicitly; the
#   previous rename of "index"/"type" depended on how `value_counts` labels its
#   result, which changed in pandas 2.0 and produced the wrong columns there.
# * The counts are taken from `earthquakes` rather than from `df_chart` itself,
#   so the cell can be re-run.
df_chart = (
    earthquakes["type"]
    .value_counts()
    .rename_axis("Type")
    .reset_index(name="Count")
)

In [110]:
df_chart

Unnamed: 0,Type,Count
0,earthquake,28636
1,quarry blast,327
2,explosion,177
3,ice quake,138
4,mining explosion,73
5,other event,6


In [105]:
import plotly.express as px

In [116]:
# Bar chart of event counts per type. The previous title ("Wide-Form Input")
# did not describe what the figure shows.
fig = px.bar(df_chart, x="Type", y="Count", title="Number of seismic events by type")

In [117]:
fig.show()

In [118]:
# Donut chart (hole=0.3) of each event type's share; a title is set so the
# figure can be understood on its own.
fig = px.pie(
    df_chart,
    values="Count",
    names="Type",
    hole=0.3,
    title="Share of seismic events by type",
)

In [119]:
fig.show()