In [1]:
import pandas as pd

In [2]:
def csv_to_df(file):
    """Read one earthquake CSV export into a DataFrame keyed by event id.

    Only the columns named in ``wanted_columns`` are loaded; any other
    column present in the file is ignored.

    Parameters
    ----------
    file
        Whatever ``pandas.read_csv`` accepts as its first argument
        (this notebook passes relative path strings).

    Returns
    -------
    pandas.DataFrame
        One row per CSV record, indexed by the ``id`` column, with the
        ``time`` column parsed as dates.
    """
    wanted_columns = [
        "id",
        "time",
        "latitude",
        "longitude",
        "mag",
        "magType",
        "place",
        "type",
        "status",
        "locationSource",
        "state",
    ]
    frame = pd.read_csv(
        file,
        usecols=wanted_columns,
        index_col="id",
        parse_dates=["time"],
    )
    return frame

In [3]:
# Load the three monthly exports in calendar order; one frame per month.
jan, feb, mar = (
    csv_to_df(csv_path)
    for csv_path in (
        "datasets/jan-enriched.csv",
        "datasets/feb-enriched.csv",
        "datasets/mar-enriched.csv",
    )
)

In [4]:
# Total number of rows across the three monthly frames (to compare with the
# row count of the concatenated frame later on).
sum(len(month_df) for month_df in (jan, feb, mar))

35919

In [5]:
# Stack the monthly frames row-wise into one DataFrame covering all three months.
monthly_frames = (jan, feb, mar)
earthquakes = pd.concat(monthly_frames, axis="index")

In [6]:
# Summarize column dtypes, non-null counts and memory usage of the combined frame.
earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35919 entries, ok2022ccvb to uw61819426
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            35919 non-null  datetime64[ns, UTC]
 1   latitude        35919 non-null  float64            
 2   longitude       35919 non-null  float64            
 3   mag             35916 non-null  float64            
 4   magType         35916 non-null  object             
 5   place           35784 non-null  object             
 6   type            35919 non-null  object             
 7   status          35919 non-null  object             
 8   locationSource  35919 non-null  object             
 9   state           31375 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(6)
memory usage: 3.0+ MB


In [7]:
# Count the distinct values in each column to gauge its cardinality.
earthquakes.nunique()

time              35914
latitude          28270
longitude         29491
mag                 635
magType              10
place             16335
type                  7
status                2
locationSource       15
state               346
dtype: int64

In [8]:
# List the distinct values of the state column (NaN included) to see which
# country/region codes are present.
earthquakes["state"].unique()

array(['US-OK', 'US-AK', 'US-CA', 'IR-18', 'ID-MA', 'US-HI', 'US-PR', nan,
       'US-WA', 'CA-YT', 'CA-BC', 'US-MO', 'US-NV', 'ID-SN', 'US-UT',
       'ID-NT', 'PG-WPD', 'US-ID', 'US-TX', 'AR-A', 'US-WY', 'FJ-E',
       'CL-AT', 'PG-ESW', 'ID-SA', 'AF-BDS', 'VU-PAM', 'ID-AC', 'JP-26',
       'DO-10', 'PG-EBR', 'ID-MU', 'US-MT', 'US-TN', 'US-KS', 'JP-02',
       'PE-UCA', 'TO-02', 'PE-HUC', 'ID-PA', 'BO-P', 'HT-NI', 'MX-CHP',
       'AR-K', 'PK-GB', 'US-OH', 'CN-SC', 'CL-RM', 'ID-BE', 'US-OR',
       'SB-TE', 'US-MN', 'AF-TAK', 'CL-AP', 'RU-SAK', 'JP-23', 'MX-BCN',
       'CL-AN', 'MX-JAL', 'JP-04', 'CL-TA', 'US-SC', 'PG-MPL', 'PG-WBK',
       'CN-XJ', 'US-CT', 'JP-30', 'CR-A', 'US-CO', 'GL-KU', 'CA-NB',
       'PG-NIK', 'AR-M', 'JP-08', 'PE-ICA', 'CN-QH', 'PH-BTG', 'DO-14',
       'CO-SAN', 'CD-TA', 'AR-J', 'US-NM', 'HT-OU', 'TW-ILA', 'PG-MRL',
       'CA-QC', 'PH-AKL', 'TO-03', 'CN-XZ', 'TJ-RA', 'AU-WA', 'ID-NB',
       'IN-UT', 'PE-ARE', 'US-MP', 'ID-JI', 'MA-03', 'DO-11', 'IR-03',


In [9]:
# The state column also holds region codes for countries other than the USA,
# and it is null for some events (islands and similar places, per the data
# inspection above). Keep only rows whose state code starts with "US";
# na=False makes rows with a missing state evaluate to False, so they are
# dropped as well.
is_us_state = earthquakes["state"].str.startswith("US", na=False)
us_earthquakes = earthquakes.loc[is_us_state]

In [10]:
# Re-check per-column distinct counts after restricting the data to US rows.
us_earthquakes.nunique()

time              29355
latitude          21909
longitude         23051
mag                 578
magType               8
place             12013
type                  6
status                2
locationSource       15
state                36
dtype: int64

In [11]:
# Show the US rows that have no magnitude recorded.
missing_mag = us_earthquakes["mag"].isna()
us_earthquakes.loc[missing_mag]

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
nc71127084,2022-01-18 14:50:55.480000+00:00,35.975333,-120.371,,,"10km NNE of Parkfield, CA",earthquake,reviewed,nc,US-CA
nc73674341,2022-01-06 06:11:48.790000+00:00,38.7985,-122.7015,,,"3km NNW of Anderson Springs, CA",earthquake,reviewed,nc,US-CA
pr71336423,2022-02-21 21:01:10.020000+00:00,17.9565,-66.928167,,,"2 km SW of Guánica, Puerto Rico",earthquake,reviewed,pr,US-PR


In [12]:
# Keep only the earthquakes that have a magnitude value.
has_mag = us_earthquakes["mag"].notna()
us_earthquakes = us_earthquakes.loc[has_mag]

In [13]:
# Derive two helper columns:
#   state_code - the part of `state` after the last "-" (e.g. "US-OK" -> "OK"),
#                with surrounding whitespace stripped
#   month      - the month name of the event time
state_codes = us_earthquakes["state"].str.split("-").str[-1].str.strip()
month_names = us_earthquakes["time"].dt.month_name()
us_earthquakes = us_earthquakes.assign(state_code=state_codes, month=month_names)

In [14]:
# Preview the first rows to check the newly added state_code and month columns.
us_earthquakes.head()

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,state,state_code,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ok2022ccvb,2022-01-30 23:58:39.500000+00:00,34.879167,-97.852,0.99,ml,"7 km WSW of Alex, Oklahoma",earthquake,reviewed,ok,US-OK,OK,January
ak0221e05fgq,2022-01-30 23:56:19.237000+00:00,62.91,-151.1774,1.4,ml,Central Alaska,earthquake,reviewed,ak,US-AK,AK,January
ak0221e051u3,2022-01-30 23:54:44.477000+00:00,63.3048,-151.2721,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK,AK,January
ak0221e050xt,2022-01-30 23:54:32.973000+00:00,51.5578,-176.6738,1.8,ml,"35 km S of Adak, Alaska",earthquake,reviewed,ak,US-AK,AK,January
ak0221e04enn,2022-01-30 23:51:44.207000+00:00,63.2917,-151.3217,1.1,ml,"34 km SE of Denali National Park, Alaska",earthquake,reviewed,ak,US-AK,AK,January


In [15]:
# Check dtypes, non-null counts and memory usage after filtering and adding columns.
us_earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29357 entries, ok2022ccvb to uu60483552
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            29357 non-null  datetime64[ns, UTC]
 1   latitude        29357 non-null  float64            
 2   longitude       29357 non-null  float64            
 3   mag             29357 non-null  float64            
 4   magType         29357 non-null  object             
 5   place           29281 non-null  object             
 6   type            29357 non-null  object             
 7   status          29357 non-null  object             
 8   locationSource  29357 non-null  object             
 9   state           29357 non-null  object             
 10  state_code      29357 non-null  object             
 11  month           29357 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(3), object(8)
memory usage: 2.9+ MB


In [16]:
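# Converting columns to more appropriate dtypes: check each column's cardinality first, then cast the low-cardinality text columns to category.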

In [17]:
# Distinct values per column; columns with only a few distinct values are
# candidates for the category dtype.
us_earthquakes.nunique()

time              29352
latitude          21907
longitude         23048
mag                 578
magType               8
place             12011
type                  6
status                2
locationSource       15
state                36
state_code           36
month                 3
dtype: int64

In [18]:
# Cast the low-cardinality text columns to the category dtype in a single
# astype call instead of one in-place column assignment per column.
# astype returns a new frame, so the name is rebound rather than the existing
# object being mutated; re-running this cell yields the same result.
categorical_columns = [
    "magType",
    "type",
    "status",
    "locationSource",
    "state",
    "state_code",
    "month",
]
us_earthquakes = us_earthquakes.astype(
    {column: "category" for column in categorical_columns}
)

In [19]:
# Confirm the category dtypes took effect and see the resulting memory usage.
us_earthquakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29357 entries, ok2022ccvb to uu60483552
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   time            29357 non-null  datetime64[ns, UTC]
 1   latitude        29357 non-null  float64            
 2   longitude       29357 non-null  float64            
 3   mag             29357 non-null  float64            
 4   magType         29357 non-null  category           
 5   place           29281 non-null  object             
 6   type            29357 non-null  category           
 7   status          29357 non-null  category           
 8   locationSource  29357 non-null  category           
 9   state           29357 non-null  category           
 10  state_code      29357 non-null  category           
 11  month           29357 non-null  category           
dtypes: category(7), datetime64[ns, UTC](1), float64(3), object(1)
memory usage: 1.5

In [23]:
# Spot-check five random rows. The fixed random_state makes the same rows
# appear on every Restart & Run All; change or remove it to draw another sample.
us_earthquakes.sample(5, random_state=42)

Unnamed: 0_level_0,time,latitude,longitude,mag,magType,place,type,status,locationSource,state,state_code,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ci39924879,2022-01-27 10:52:41.220000+00:00,35.039833,-118.993167,2.84,ml,"12km NNW of Grapevine, CA",earthquake,reviewed,ci,US-CA,CA,January
ak0223b59f53,2022-03-13 04:31:26.356000+00:00,55.8918,-159.4324,1.9,ml,"3 km ESE of Ivanof Bay, Alaska",earthquake,reviewed,ak,US-AK,AK,March
nc73681586,2022-01-21 19:21:58.610000+00:00,37.9285,-122.1155,1.58,md,"5km N of Lafayette, CA",earthquake,reviewed,nc,US-CA,CA,January
hv72903867,2022-02-05 02:41:09.280000+00:00,19.479667,-155.644667,0.88,md,"23 km E of Honaunau-Napoopoo, Hawaii",earthquake,reviewed,hv,US-HI,HI,February
tx2022gcnp,2022-03-28 18:23:44.444000+00:00,31.644255,-104.229956,1.8,ml,"55 km NW of Toyah, Texas",earthquake,reviewed,tx,US-TX,TX,March
