In [1]:
import pandas as pd
import datetime

# Data Exploration

In [2]:
def csv_to_df(file):
    """Read one earthquake CSV into a DataFrame.

    Every column of the file is kept; the ``time`` column is parsed as
    datetimes.

    Parameters
    ----------
    file : str, path object or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv(file, parse_dates=["time"])
    return frame

In [3]:
jan = csv_to_df("datasets/january.csv")
feb = csv_to_df("datasets/february.csv")
mar = csv_to_df("datasets/march.csv")

In [4]:
sum(size for (size, _) in (jan.shape, feb.shape, mar.shape))

35919

In [5]:
earthquakes = pd.concat([jan, feb, mar])

In [6]:
earthquakes.shape

(35919, 22)

In [7]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35919 entries, 0 to 12312
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   time             35919 non-null  datetime64[ns, UTC]
 1   latitude         35919 non-null  float64            
 2   longitude        35919 non-null  float64            
 3   depth            35919 non-null  float64            
 4   mag              35916 non-null  float64            
 5   magType          35916 non-null  object             
 6   nst              21283 non-null  float64            
 7   gap              26711 non-null  float64            
 8   dmin             22113 non-null  float64            
 9   rms              35919 non-null  float64            
 10  net              35919 non-null  object             
 11  id               35919 non-null  object             
 12  updated          35919 non-null  object             
 13  place           

In [8]:
# Preview only: show the first rows with the unneeded columns removed.
# The result is not assigned, so `earthquakes` itself is left untouched.
earthquakes.drop(
    columns=[
        "nst",
        "gap",
        "dmin",
        "rms",
        "updated",
        "horizontalError",
        "depthError",
        "magError",
        "magNst",
    ]
).head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,net,id,place,type,status,locationSource,magSource
0,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,14.67,0.99,ml,ok,ok2022ccvb,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,ok
1,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,107.6,1.4,ml,ak,ak0221e05fgq,Central Alaska,earthquake,reviewed,ak,ak
2,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,8.8,1.1,ml,ak,ak0221e051u3,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,ak
3,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,39.6,1.8,ml,ak,ak0221e050xt,"35 km S of Adak, Alaska",earthquake,reviewed,ak,ak
4,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,9.9,1.1,ml,ak,ak0221e04enn,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,ak


In [9]:
earthquakes["net"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [10]:
earthquakes["locationSource"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [11]:
earthquakes["depth"].mean()

26.288167243528935

In [12]:
earthquakes["status"].unique()

array(['reviewed', 'automatic'], dtype=object)

In [13]:
earthquakes["type"].unique()

array(['earthquake', 'ice quake', 'explosion', 'mining explosion',
       'quarry blast', 'volcanic eruption', 'other event'], dtype=object)

# Data Loading

In [14]:
# Loading only columns necessary for visualization.
def csv_to_df(file):
    """Read one earthquake CSV, keeping only the columns listed below.

    The ``time`` column is parsed as datetimes and ``id`` becomes the index.

    Parameters
    ----------
    file : str, path object or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id``.
    """
    wanted_columns = [
        "id",
        "time",
        "latitude",
        "longitude",
        "mag",
        "magType",
        "place",
        "type",
        "status",
        "locationSource",
    ]
    return pd.read_csv(
        file,
        usecols=wanted_columns,
        index_col="id",
        parse_dates=["time"],
    )

In [15]:
jan = csv_to_df("datasets/january.csv")
feb = csv_to_df("datasets/february.csv")
mar = csv_to_df("datasets/march.csv")

In [16]:
sum(size for (size, _) in (jan.shape, feb.shape, mar.shape))

35919

In [17]:
earthquakes = pd.concat([jan, feb, mar])

In [18]:
earthquakes.shape

(35919, 9)

In [19]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  object             
 5   place           35784 non-null  object             
 6   type            35919 non-null  object             
 7   status          35919 non-null  object             
 8   locationSource  35919 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(5)
memory usage: 2.7+ MB


# Data Wrangling

In [20]:
earthquakes.nunique()

time              35914
latitude          28270
longitude         29491
mag                 635
magType              10
place             16335
type                  7
status                2
locationSource       15
dtype: int64

In [21]:
earthquakes["locationSource"].unique()

array(['ok', 'ak', 'nc', 'us', 'av', 'hv', 'pr', 'ci', 'uw', 'nm', 'nn',
       'uu', 'tx', 'mb', 'se'], dtype=object)

In [22]:
earthquakes["status"].unique()

array(['reviewed', 'automatic'], dtype=object)

In [23]:
earthquakes["type"].unique()

array(['earthquake', 'ice quake', 'explosion', 'mining explosion',
       'quarry blast', 'volcanic eruption', 'other event'], dtype=object)

In [24]:
earthquakes["magType"].unique()

array(['ml', 'md', 'mb', 'mw', 'mww', 'mb_lg', 'mwr', 'mh', nan, 'ms_20',
       'mwb'], dtype=object)

## Converting values to appropriate data types

In [25]:
# These columns only take a few distinct values, so store them as categoricals.
for column in ("status", "locationSource", "type", "magType"):
    earthquakes[column] = earthquakes[column].astype("category")

## Checking for missing values

In [26]:
earthquakes.shape

(35919, 9)

In [27]:
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  category           
 5   place           35784 non-null  object             
 6   type            35919 non-null  category           
 7   status          35919 non-null  category           
 8   locationSource  35919 non-null  category           
dtypes: category(4), datetime64[ns, UTC](1), float64(3), object(1)
memory usage: 1.8+ MB


In [28]:
earthquakes[earthquakes["place"].isnull()].iloc[0]

time              2022-01-30 11:14:16.340000+00:00
latitude                                   61.3634
longitude                                 -149.574
mag                                            1.2
magType                                         ml
place                                          NaN
type                                    earthquake
status                                    reviewed
locationSource                                  ak
Name: ak0221dsr7ph, dtype: object

In [29]:
earthquakes[earthquakes["mag"].isnull()]

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
nc71127084,2022-01-18 14:50:55.480000+00:00,35.975333,-120.371,,,"10km NNE of Parkfield, CA",earthquake,reviewed,nc
nc73674341,2022-01-06 06:11:48.790000+00:00,38.7985,-122.7015,,,"3km NNW of Anderson Springs, CA",earthquake,reviewed,nc
pr71336423,2022-02-21 21:01:10.020000+00:00,17.9565,-66.928167,,,"2 km SW of Guánica, Puerto Rico",earthquake,reviewed,pr


## Dropping earthquakes whose magnitude is missing and could not be recovered

In [30]:
earthquakes = earthquakes.dropna(subset=["mag"])

In [31]:
earthquakes.shape

(35916, 9)

## Extracting month from the `time` attribute

In [32]:
earthquakes["month"] = earthquakes["time"].dt.month_name()

In [33]:
earthquakes["month"].value_counts()

January     12421
March       12313
February    11182
Name: month, dtype: int64

## Extracting state from the `place` attribute, with `latitude`/`longitude` reverse geocoding for missing places

In [34]:
# Take the text after the LAST comma of `place` as the state/region and trim
# the space that follows the comma. Places without a comma are kept whole,
# and missing places stay missing.
states = (
    earthquakes["place"]
    .str.rsplit(pat=",", n=1)
    .str.get(-1)
    .str.strip()
)

In [35]:
states

id
ok2022ccvb            Oklahoma
ak0221e05fgq    Central Alaska
ak0221e051u3            Alaska
ak0221e050xt            Alaska
ak0221e04enn            Alaska
                     ...      
ak0222r8d3ze            Alaska
nc73699171                  CA
ak0222r8acmt            Alaska
uu60483552                Utah
uw61819426              Canada
Name: place, Length: 35916, dtype: object

In [36]:
states.info()

<class 'pandas.core.series.Series'>
Index: 35916 entries, ok2022ccvb to uw61819426
Series name: place
Non-Null Count  Dtype 
--------------  ----- 
35781 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


In [37]:
temp = earthquakes[earthquakes["place"].isnull()]

In [38]:
temp

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ak0221dsr7ph,2022-01-30 11:14:16.340000+00:00,61.363400,-149.574000,1.20,ml,,earthquake,reviewed,ak,January
tx2022byde,2022-01-28 10:23:37.233000+00:00,31.707534,-103.995351,2.50,ml,,earthquake,reviewed,tx,January
ak0221afcyv4,2022-01-28 07:24:33.861000+00:00,53.216700,-166.734500,1.70,ml,,earthquake,reviewed,ak,January
ak022191gw7r,2022-01-27 23:44:58.569000+00:00,60.281600,-152.497100,1.90,ml,,earthquake,reviewed,ak,January
ak02218o8kpq,2022-01-27 01:22:26.695000+00:00,61.327600,-150.872700,1.60,ml,,earthquake,reviewed,ak,January
...,...,...,...,...,...,...,...,...,...,...
pr71338438,2022-03-02 03:49:23.630000+00:00,18.042667,-66.894167,1.51,md,,earthquake,reviewed,pr,March
us6000h23r,2022-03-01 04:52:18.615000+00:00,-7.162000,156.382800,4.30,mb,,earthquake,reviewed,us,March
ak0222rabwus,2022-03-01 03:50:34.211000+00:00,60.655000,-152.423100,1.10,ml,,earthquake,reviewed,ak,March
us6000h23y,2022-03-01 03:37:44.437000+00:00,-2.067800,127.825500,4.40,mb,,earthquake,reviewed,us,March


In [39]:
import geocoder

In [40]:
def get_state(df):
    """Reverse-geocode one row and return its ``ISO3166-2-lvl4`` address value.

    Despite the parameter name, this is called once per row (it is used with
    ``DataFrame.apply(..., axis=1)``), so ``df`` is a single row.

    Parameters
    ----------
    df : mapping or pandas.Series
        Must provide ``"latitude"`` and ``"longitude"``.

    Returns
    -------
    str or None
        The ``ISO3166-2-lvl4`` entry of the address returned by the OSM
        reverse lookup. ``None`` when the lookup fails or the address has no
        such entry, so that every kind of miss is counted by ``isnull()``.
    """
    g = geocoder.osm([df["latitude"], df["longitude"]], method="reverse")
    if not g.ok:
        # Same sentinel as a missing key below (previously "" here).
        return None
    properties = g.geojson["features"][0]["properties"]
    # Tolerate a response without an address section instead of raising.
    address = properties["raw"].get("address", {})
    return address.get("ISO3166-2-lvl4")

In [41]:
!pip install geocoder


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [None]:
res = temp[["latitude", "longitude"]].apply(get_state, axis=1)

In [None]:
res.isnull().value_counts()

In [None]:
# Reverse geocoding recovers many of the missing values, so it can be applied to the original dataframe.

In [None]:
import pandas as pd

In [None]:
def csv_to_df(file):
    """Read one enriched earthquake CSV, keeping only the columns listed below.

    Same selection as the plain loader plus the ``state`` column. The
    ``time`` column is parsed as datetimes and ``id`` becomes the index.

    Parameters
    ----------
    file : str, path object or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id``.
    """
    wanted_columns = [
        "id",
        "time",
        "latitude",
        "longitude",
        "mag",
        "magType",
        "place",
        "type",
        "status",
        "locationSource",
        "state",
    ]
    return pd.read_csv(
        file,
        usecols=wanted_columns,
        index_col="id",
        parse_dates=["time"],
    )

In [None]:
jan = csv_to_df("datasets/jan-enriched.csv")
feb = csv_to_df("datasets/feb-enriched.csv")
mar = csv_to_df("datasets/mar-enriched.csv")

In [None]:
sum(size for (size, _) in (jan.shape, feb.shape, mar.shape))

In [None]:
earthquakes = pd.concat([jan, feb, mar])

In [None]:
earthquakes["state"].str.startswith("CA").value_counts()

In [None]:
# Remove the Canadian entries (state values starting with "CA").
# Rows without a state value are kept at this point (na=False).
earthquakes = earthquakes.loc[
    ~earthquakes["state"].str.startswith("CA", na=False)
]

In [None]:
earthquakes.shape

In [None]:
earthquakes[earthquakes["state"].isnull()].shape

In [None]:
earthquakes.dropna(subset=["state"]).shape

In [None]:
# Number of rows that remain once rows without a state are dropped, computed
# from the frame instead of hand-copied counts.
# NOTE(review): assumes the original literals were the row count and the
# null-state count shown by the two cells above — confirm.
len(earthquakes) - earthquakes["state"].isnull().sum()

In [None]:
# Drop the events for which no state value is available
# (presumably island/offshore locations, going by the original note — TODO confirm).
earthquakes = earthquakes.dropna(subset=["state"])

In [None]:
earthquakes.head()

In [None]:
earthquakes.info()

In [None]:
earthquakes[earthquakes["mag"].isnull()]

In [None]:
earthquakes = earthquakes.dropna(subset=["mag"])

In [None]:
earthquakes.info()

In [None]:
# Derive two helper columns:
#   state_code - the part of `state` after the last "-", trimmed
#   month      - the month name of `time`
earthquakes = earthquakes.assign(
    state_code=lambda frame: frame["state"].str.split("-").str.get(-1).str.strip(),
    month=lambda frame: frame["time"].dt.month_name(),
)

In [None]:
earthquakes.head()

In [None]:
earthquakes.info()

In [None]:
earthquakes.nunique()

In [None]:
earthquakes["state"].unique()

In [None]:
# Converting values to appropriate data types
earthquakes["magType"] = earthquakes["magType"].astype("category")
earthquakes["type"] = earthquakes["type"].astype("category")
earthquakes["status"] = earthquakes["status"].astype("category")
earthquakes["locationSource"] = earthquakes["locationSource"].astype("category")
earthquakes["state"]


# Selecting Data for Plotting

In [None]:
import pandas as pd

In [None]:
# Load the prepared dataset, parsing `time` as datetimes.
# NOTE(review): no cell visible here writes this file — confirm how it is produced.
earthquakes = pd.read_csv("datasets/us_earthquakes.csv", parse_dates=["time"])

In [None]:
earthquakes.info()

In [None]:
# Store the label-like columns as categoricals.
for column in (
    "magType",
    "type",
    "status",
    "locationSource",
    "state",
    "state_code",
    "month",
):
    earthquakes[column] = earthquakes[column].astype("category")

In [None]:
earthquakes.info()

In [None]:
df = earthquakes.copy()

In [None]:
df.groupby(["month", "state_code"]).groups.keys()

In [None]:
# Count events per (month, state_code). Both keys are categorical, so
# observed=False is spelled out: it keeps a row (with count 0) for every
# month/state combination and does not depend on the pandas default, which
# differs between versions.
df = (
    df.groupby(["month", "state_code"], observed=False)
    .agg(earthquakes=("id", "count"))
    .reset_index()
)

In [None]:
df.sample(10)

In [None]:
df[df["month"]=="January"]

In [None]:
earthquakes.info()

In [None]:
earthquakes["type"].value_counts(normalize=True)

In [None]:
earthquakes["status"].value_counts()

In [None]:
earthquakes.head()

In [None]:
df_scatter = earthquakes[["id", "time", "latitude", "longitude", "mag", "place", "type", "state_code"]]

In [None]:
df_scatter = df_scatter.assign(
    date=df_scatter["time"].dt.date
)

In [None]:
df_scatter.sample(5)

In [None]:
df_scatter.sort_values(by=["mag"], ascending=False).head(10)

In [None]:
# Text label for each event. A missing `place` would turn the whole
# concatenation into NaN, so it is replaced by a placeholder first.
df_scatter["label"] = (
    "id: " + df_scatter["id"]
    + ", mag: " + df_scatter["mag"].astype(str)
    + ", " + df_scatter["place"].fillna("unknown location")
    + ", " + df_scatter["date"].astype(str)
)

In [None]:
df_scatter

In [None]:
df_chart = earthquakes.copy()

In [None]:
# Event counts per type as a two-column frame ("Type", "Count").
# The index and the values are named explicitly rather than renaming the
# default column names produced by value_counts().reset_index(), because
# those defaults differ between pandas 1.x and 2.x.
df_chart = (
    df_chart["type"]
    .value_counts()
    .rename_axis("Type")
    .reset_index(name="Count")
)

In [None]:
df_chart

In [None]:
import plotly.express as px

In [None]:
# Bar chart of the number of events per type. The previous title
# ("Wide-Form Input") was a placeholder unrelated to the data shown.
fig = px.bar(
    df_chart,
    x="Type",
    y="Count",
    title="Number of seismic events by type",
)

In [None]:
fig.show()

In [None]:
fig = px.pie(df_chart, values="Count", names="Type", hole=.3)

In [None]:
fig.show()