In [49]:
# resources:
# https://earthquake.usgs.gov/fdsnws/event/1/   API


In [50]:
import requests
import json
import pandas as pd
import random

In [51]:
# open session and explore the api
session = requests.Session()
URL = "https://earthquake.usgs.gov/fdsnws/event/1/application.json"

response = session.get(url=URL)
data = response.json()

print(data.keys())
for key in data.keys():
    print(random.sample(data[key], 10))

dict_keys(['catalogs', 'contributors', 'producttypes', 'eventtypes', 'magnitudetypes'])
['iscgemsup', 'us7000kg6v', 'ev', 'dr', 'sc', 'ott', 'gcmt', 'iscgem', 'eqh', 'nm']
['nn', 'cgs', 'ew', 'prod01-pdl01.cr.usgs.gov', 'hv', 'atlas', 'official', 'ak', 'ok', 'admin']
['trump-moment-tensor', 'trump-impact-text', 'finite-fault', 'deleted-text', 'unassociated-amplitude', 'significance', 'origin', 'nearby-cities', 'losspager-admin', 'trump-general-link']
['sonic boom', 'nuclear explosion', 'snow_avalanche', 'collapse', 'rockslide', 'ice quake', 'quarry_blast', 'chemical explosion', 'volcanic explosion', 'train crash']
['ml', 'H', 'mwc', 'MbLg', 'mwp', 'no', 'mlr', 'Unknown', 'mdl', 'lg']


In [52]:
# let's get some actual earthquake data
URL = "https://earthquake.usgs.gov/fdsnws/event/1/query"
PARAMS = {
    "eventtype":"earthquake",
    "format":"geojson"
}
response = session.get(url=URL, params=PARAMS)
data = response.json()

print(data.keys())
print(data["metadata"])
print(random.sample(data["features"], 3))
print(len(data["features"]))

dict_keys(['type', 'metadata', 'features', 'bbox'])
{'generated': 1700124032000, 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventtype=earthquake&format=geojson', 'title': 'USGS Earthquakes', 'status': 200, 'api': '1.14.0', 'count': 9613}
[{'type': 'Feature', 'properties': {'mag': 1.54, 'place': '15 km NE of Little Lake, CA', 'time': 1698339690310, 'updated': 1698359700977, 'tz': None, 'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/ci40588376', 'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ci40588376&format=geojson', 'felt': None, 'cdi': None, 'mmi': None, 'alert': None, 'status': 'reviewed', 'tsunami': 0, 'sig': 36, 'net': 'ci', 'code': '40588376', 'ids': ',ci40588376,', 'sources': ',ci,', 'types': ',nearby-cities,origin,phase-data,scitech-link,', 'nst': 20, 'dmin': 0.04568, 'rms': 0.12, 'gap': 77, 'magType': 'ml', 'type': 'earthquake', 'title': 'M 1.5 - 15 km NE of Little Lake, CA'}, 'geometry': {'type': 'Point', 'coordinates': [-117.7743

In [53]:
# writing it to a json file to look at it
json_string = json.dumps(data["features"], indent=4)
with open("test_request.json", "w") as outfile:
    outfile.write(json_string)

In [54]:
# load json string into a python object
data = json.loads(json_string)

# extract properties and coordinates
properties_list = [item['properties'] for item in data]
coordinates_list = [item['geometry']['coordinates'] for item in data]

# create dfs
properties_df = pd.DataFrame(properties_list)
coordinates_df = pd.DataFrame(coordinates_list, columns=['longitude', 'latitude', 'altitude'])

# concatenate dataframes along columns
result_df = pd.concat([properties_df, coordinates_df], axis=1)

# display resulting df
print(result_df.describe())

               mag          time       updated          felt         cdi  \
count  9613.000000  9.613000e+03  9.613000e+03    593.000000  593.000000   
mean      1.640613  1.698730e+12  1.699077e+12     50.865093    2.727656   
std       1.259901  7.158193e+08  6.912498e+08    591.255917    1.388738   
min      -1.320000  1.697532e+12  1.697535e+12      0.000000    0.000000   
25%       0.840000  1.698118e+12  1.698541e+12      1.000000    2.000000   
50%       1.420000  1.698657e+12  1.699096e+12      1.000000    2.700000   
75%       2.080000  1.699336e+12  1.699639e+12      5.000000    3.600000   
max       7.100000  1.700124e+12  1.700124e+12  11940.000000    8.200000   

              mmi      tsunami          sig          nst         dmin  \
count  132.000000  9613.000000  9613.000000  7591.000000  5967.000000   
mean     3.576803     0.000728    66.299490    23.516006     0.639600   
std      1.763647     0.026976    97.912658    22.619339     2.221627   
min      0.000000     0

In [55]:
# i'm comparing with the file to see if we created the dfs correctly
print(result_df.head())

    mag                          place           time        updated    tz  \
0  1.06           10 km NW of Anza, CA  1700123751600  1700123875719  None   
1  0.71     8 km NW of The Geysers, CA  1700122800570  1700123591517  None   
2  0.86    0 km NNE of The Geysers, CA  1700122682220  1700123290482  None   
3  0.92      1 km WSW of Parkfield, CA  1700122583870  1700123290629  None   
4  1.60  18 km N of Two Rivers, Alaska  1700122178985  1700122296472  None   

                                                 url  \
0  https://earthquake.usgs.gov/earthquakes/eventp...   
1  https://earthquake.usgs.gov/earthquakes/eventp...   
2  https://earthquake.usgs.gov/earthquakes/eventp...   
3  https://earthquake.usgs.gov/earthquakes/eventp...   
4  https://earthquake.usgs.gov/earthquakes/eventp...   

                                              detail  felt  cdi  mmi  ...  \
0  https://earthquake.usgs.gov/fdsnws/event/1/que...   NaN  NaN  NaN  ...   
1  https://earthquake.usgs.gov/fdsnws/ev

In [56]:
# delete some unused variables
del properties_df, properties_list, coordinates_df, coordinates_list
df = result_df
del result_df

In [57]:
# now let's do some more clean up
# i think we can drop place since we have the coordinates and those are universally comparable, which those place strings are not
# drop updated
# tz is none everywhere, i can't find in the docu what it even is
# url and detail are quasi redundant, we're dropping url
# idk what net is, i can't find it in the docu. should we drop it? but it's the same as sources just without the ticks. i think we should drop source as well as types, type, title
df = df.drop(columns = ["place", "updated", "tz", "url", "sources", "types", "title"])

In [58]:
df["status"].unique()
# can be reduced to boolean

array(['automatic', 'reviewed'], dtype=object)

In [59]:
df["alert"].unique()
# alert needs to be differently encoded

array([None, 'green', 'yellow'], dtype=object)

In [60]:
df["magType"].unique()
# omg are there different magnitude types? let's one_hot encode these and see how it works. maybe we'll need to convert them somehow to have a common unit but idk yet
# https://www.usgs.gov/programs/earthquake-hazards/magnitude-types

array(['ml', 'md', 'ml(texnet)', 'mb', 'mww', 'mb_lg', 'mlv', 'mwr', 'mw',
       'mwb'], dtype=object)

In [61]:
print(df[df["tsunami"] == 1]["ids"].count())
print(df[df["tsunami"] == 0]["ids"].count())
# very serious class imbalance. BUT it's probably just more likely that no tsunami occurs, luckily

7
9606


In [62]:
# TODO ststus boolean
# TODO onehot encode alert. note that there are more options than we see in our sample (green, yellow, orange, red, none)
# TODO reserach and decide what to do with magtype

In [63]:
# also, those are just like 10000 rows. this is little data. let's get some more.
# but i think before we do that it's time to make a remote mongo db instance and store our stuff because i cannot handle tons of data on this machine
# we'll have to make json strings from our dfs, i think, and then store them in mongo