# Preprocess NOAA earthquake data


In [117]:
import pandas as pd
import requests

# Number of result pages to download from the NOAA hazard service.
# NOTE(review): hard-coded; confirm it still matches the number of pages the
# service returns for minYear=2000.
pages = 8

all_data = []
for page in range(1, pages + 1):
    # The first page is requested without an explicit `page` parameter, so the
    # request URLs stay exactly what they were before.
    params = {"minYear": 2000}
    if page > 1:
        params["page"] = page
    r = requests.get(
        "https://www.ngdc.noaa.gov/hazel/hazard-service/api/v1/earthquakes",
        params=params,
        timeout=30,  # do not let a stalled connection hang the kernel
    )
    # Fail here on an HTTP error rather than later with a confusing
    # JSON-decoding error or a KeyError on "items".
    r.raise_for_status()
    data = r.json()["items"]
    all_data.extend(data)

# Flatten the list of JSON records into a single table.
df = pd.json_normalize(all_data)

In [118]:
# Persist the downloaded table as CSV, without the row index.
# The path is relative to the working directory.
df.to_csv("data/noaa_earthquakes_2000_2025.csv", index=False)

In [119]:
# Lift the column-display limit so wide frames render every column.
pd.options.display.max_columns = None
print(df.columns)
# First rows of the frame, shown as the cell result.
df.head()

Index(['id', 'year', 'month', 'day', 'hour', 'minute', 'second',
       'locationName', 'latitude', 'longitude', 'eqDepth', 'eqMagnitude',
       'damageAmountOrder', 'eqMagMb', 'publish', 'damageAmountOrderTotal',
       'housesDamagedTotal', 'housesDamagedAmountOrderTotal', 'country',
       'regionCode', 'injuries', 'injuriesAmountOrder', 'housesDestroyed',
       'housesDestroyedAmountOrder', 'housesDamaged',
       'housesDamagedAmountOrder', 'eqMagMw', 'eqMagMs', 'injuriesTotal',
       'injuriesAmountOrderTotal', 'housesDestroyedTotal',
       'housesDestroyedAmountOrderTotal', 'deaths', 'deathsAmountOrder',
       'damageMillionsDollars', 'eqMagMl', 'deathsTotal',
       'deathsAmountOrderTotal', 'damageMillionsDollarsTotal',
       'tsunamiEventId', 'intensity', 'volcanoEventId', 'area', 'missing',
       'missingAmountOrder', 'missingTotal', 'missingAmountOrderTotal',
       'eqMagUnk'],
      dtype='object')


Unnamed: 0,id,year,month,day,hour,minute,second,locationName,latitude,longitude,eqDepth,eqMagnitude,damageAmountOrder,eqMagMb,publish,damageAmountOrderTotal,housesDamagedTotal,housesDamagedAmountOrderTotal,country,regionCode,injuries,injuriesAmountOrder,housesDestroyed,housesDestroyedAmountOrder,housesDamaged,housesDamagedAmountOrder,eqMagMw,eqMagMs,injuriesTotal,injuriesAmountOrderTotal,housesDestroyedTotal,housesDestroyedAmountOrderTotal,deaths,deathsAmountOrder,damageMillionsDollars,eqMagMl,deathsTotal,deathsAmountOrderTotal,damageMillionsDollarsTotal,tsunamiEventId,intensity,volcanoEventId,area,missing,missingAmountOrder,missingTotal,missingAmountOrderTotal,eqMagUnk
0,5551,2000,1,3,22.0,34.0,12.6,INDIA-BANGLADESH BORDER: MAHESHKHALI,22.132,92.771,33.0,4.6,1.0,4.6,True,1.0,100.0,2.0,INDIA,60,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,5552,2000,1,11,23.0,43.0,56.4,CHINA: LIAONING PROVINCE,40.498,122.994,10.0,5.1,3.0,4.9,True,3.0,8800.0,4.0,CHINA,30,30.0,1.0,3600.0,4.0,8800.0,4.0,5.1,4.7,30.0,1.0,3600.0,4.0,,,,,,,,,,,,,,,,
2,5553,2000,1,14,23.0,37.0,7.8,CHINA: YUNNAN PROVINCE: YAOAN COUNTY,25.607,101.063,33.0,5.9,4.0,5.4,True,4.0,,,CHINA,30,2528.0,4.0,41000.0,4.0,,,5.9,5.9,2528.0,4.0,41000.0,4.0,5.0,1.0,73.5,5.5,7.0,1.0,73.5,,,,,,,,,
3,5554,2000,2,2,22.0,58.0,1.5,"IRAN: BARDASKAN, KASHMAR",35.288,58.218,33.0,5.3,2.0,5.1,True,2.0,300.0,3.0,IRAN,140,15.0,1.0,100.0,2.0,100.0,2.0,5.3,5.3,15.0,1.0,100.0,2.0,1.0,1.0,,,1.0,1.0,,,,,,,,,,
4,5555,2000,2,7,19.0,34.0,57.0,SOUTH AFRICA; SWAZILAND: MBABANE-MANZINI,-26.288,30.888,5.0,4.5,1.0,4.5,True,1.0,,,SOUTH AFRICA,10,1.0,1.0,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,


In [120]:
# Reconcile `housesDamaged` with `housesDamagedTotal`.
houses_total_col = "housesDamagedTotal"
houses_col = "housesDamaged"
print(df[houses_total_col])
print(df[houses_col])
# Step 1: fill the gaps of each column from the other one.
df[houses_total_col] = df[houses_total_col].fillna(df[houses_col])
df[houses_col] = df[houses_col].fillna(df[houses_total_col])
# Step 2: rows where the two columns still disagree get their mean.
houses_agree = df[houses_total_col] == df[houses_col]
houses_mean = (df[houses_total_col] + df[houses_col]) / 2
df[houses_total_col] = df[houses_total_col].mask(~houses_agree, houses_mean)
print(df[houses_total_col])

0        100.0
1       8800.0
2          NaN
3        300.0
4          NaN
         ...  
1420       NaN
1421       NaN
1422       NaN
1423    4328.0
1424       NaN
Name: housesDamagedTotal, Length: 1425, dtype: float64
0          NaN
1       8800.0
2          NaN
3        100.0
4          NaN
         ...  
1420       NaN
1421       NaN
1422       NaN
1423    4328.0
1424       NaN
Name: housesDamaged, Length: 1425, dtype: float64
0        100.0
1       8800.0
2          NaN
3        200.0
4          NaN
         ...  
1420       NaN
1421       NaN
1422       NaN
1423    4328.0
1424       NaN
Name: housesDamagedTotal, Length: 1425, dtype: float64


In [121]:
# Any value still missing in the merged column becomes 0, then the
# now-redundant source column is removed.
houses_total_filled = df["housesDamagedTotal"].fillna(df["housesDamaged"])
df["housesDamagedTotal"] = houses_total_filled.fillna(0)
df = df.drop("housesDamaged", axis="columns")
df.head()

Unnamed: 0,id,year,month,day,hour,minute,second,locationName,latitude,longitude,eqDepth,eqMagnitude,damageAmountOrder,eqMagMb,publish,damageAmountOrderTotal,housesDamagedTotal,housesDamagedAmountOrderTotal,country,regionCode,injuries,injuriesAmountOrder,housesDestroyed,housesDestroyedAmountOrder,housesDamagedAmountOrder,eqMagMw,eqMagMs,injuriesTotal,injuriesAmountOrderTotal,housesDestroyedTotal,housesDestroyedAmountOrderTotal,deaths,deathsAmountOrder,damageMillionsDollars,eqMagMl,deathsTotal,deathsAmountOrderTotal,damageMillionsDollarsTotal,tsunamiEventId,intensity,volcanoEventId,area,missing,missingAmountOrder,missingTotal,missingAmountOrderTotal,eqMagUnk
0,5551,2000,1,3,22.0,34.0,12.6,INDIA-BANGLADESH BORDER: MAHESHKHALI,22.132,92.771,33.0,4.6,1.0,4.6,True,1.0,100.0,2.0,INDIA,60,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,5552,2000,1,11,23.0,43.0,56.4,CHINA: LIAONING PROVINCE,40.498,122.994,10.0,5.1,3.0,4.9,True,3.0,8800.0,4.0,CHINA,30,30.0,1.0,3600.0,4.0,4.0,5.1,4.7,30.0,1.0,3600.0,4.0,,,,,,,,,,,,,,,,
2,5553,2000,1,14,23.0,37.0,7.8,CHINA: YUNNAN PROVINCE: YAOAN COUNTY,25.607,101.063,33.0,5.9,4.0,5.4,True,4.0,0.0,,CHINA,30,2528.0,4.0,41000.0,4.0,,5.9,5.9,2528.0,4.0,41000.0,4.0,5.0,1.0,73.5,5.5,7.0,1.0,73.5,,,,,,,,,
3,5554,2000,2,2,22.0,58.0,1.5,"IRAN: BARDASKAN, KASHMAR",35.288,58.218,33.0,5.3,2.0,5.1,True,2.0,200.0,3.0,IRAN,140,15.0,1.0,100.0,2.0,2.0,5.3,5.3,15.0,1.0,100.0,2.0,1.0,1.0,,,1.0,1.0,,,,,,,,,,
4,5555,2000,2,7,19.0,34.0,57.0,SOUTH AFRICA; SWAZILAND: MBABANE-MANZINI,-26.288,30.888,5.0,4.5,1.0,4.5,True,1.0,0.0,,SOUTH AFRICA,10,1.0,1.0,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,


In [122]:
# Inspect the current column labels of the frame.
df.columns

Index(['id', 'year', 'month', 'day', 'hour', 'minute', 'second',
       'locationName', 'latitude', 'longitude', 'eqDepth', 'eqMagnitude',
       'damageAmountOrder', 'eqMagMb', 'publish', 'damageAmountOrderTotal',
       'housesDamagedTotal', 'housesDamagedAmountOrderTotal', 'country',
       'regionCode', 'injuries', 'injuriesAmountOrder', 'housesDestroyed',
       'housesDestroyedAmountOrder', 'housesDamagedAmountOrder', 'eqMagMw',
       'eqMagMs', 'injuriesTotal', 'injuriesAmountOrderTotal',
       'housesDestroyedTotal', 'housesDestroyedAmountOrderTotal', 'deaths',
       'deathsAmountOrder', 'damageMillionsDollars', 'eqMagMl', 'deathsTotal',
       'deathsAmountOrderTotal', 'damageMillionsDollarsTotal',
       'tsunamiEventId', 'intensity', 'volcanoEventId', 'area', 'missing',
       'missingAmountOrder', 'missingTotal', 'missingAmountOrderTotal',
       'eqMagUnk'],
      dtype='object')

In [123]:
# Reconcile `damageAmountOrder` with `damageAmountOrderTotal`.
damage_order_col = "damageAmountOrder"
damage_order_total_col = "damageAmountOrderTotal"
print(df[damage_order_col])
print(df[damage_order_total_col])
# Step 1: fill the gaps of each column from the other one.
df[damage_order_total_col] = df[damage_order_total_col].fillna(df[damage_order_col])
df[damage_order_col] = df[damage_order_col].fillna(df[damage_order_total_col])
# Step 2: rows where the two columns still disagree get their mean.
damage_order_agree = df[damage_order_total_col] == df[damage_order_col]
damage_order_mean = (df[damage_order_total_col] + df[damage_order_col]) / 2
df[damage_order_total_col] = df[damage_order_total_col].mask(~damage_order_agree, damage_order_mean)
print(df[damage_order_total_col])

0       1.0
1       3.0
2       4.0
3       2.0
4       1.0
       ... 
1420    NaN
1421    NaN
1422    1.0
1423    3.0
1424    3.0
Name: damageAmountOrder, Length: 1425, dtype: float64
0       1.0
1       3.0
2       4.0
3       2.0
4       1.0
       ... 
1420    NaN
1421    NaN
1422    1.0
1423    3.0
1424    3.0
Name: damageAmountOrderTotal, Length: 1425, dtype: float64
0       1.0
1       3.0
2       4.0
3       2.0
4       1.0
       ... 
1420    NaN
1421    NaN
1422    1.0
1423    3.0
1424    3.0
Name: damageAmountOrderTotal, Length: 1425, dtype: float64


In [124]:
# Any value still missing in the merged column becomes 0, then the
# now-redundant source column is removed.
damage_order_filled = df["damageAmountOrderTotal"].fillna(df["damageAmountOrder"])
df["damageAmountOrderTotal"] = damage_order_filled.fillna(0)
df = df.drop("damageAmountOrder", axis="columns")
df.head()

Unnamed: 0,id,year,month,day,hour,minute,second,locationName,latitude,longitude,eqDepth,eqMagnitude,eqMagMb,publish,damageAmountOrderTotal,housesDamagedTotal,housesDamagedAmountOrderTotal,country,regionCode,injuries,injuriesAmountOrder,housesDestroyed,housesDestroyedAmountOrder,housesDamagedAmountOrder,eqMagMw,eqMagMs,injuriesTotal,injuriesAmountOrderTotal,housesDestroyedTotal,housesDestroyedAmountOrderTotal,deaths,deathsAmountOrder,damageMillionsDollars,eqMagMl,deathsTotal,deathsAmountOrderTotal,damageMillionsDollarsTotal,tsunamiEventId,intensity,volcanoEventId,area,missing,missingAmountOrder,missingTotal,missingAmountOrderTotal,eqMagUnk
0,5551,2000,1,3,22.0,34.0,12.6,INDIA-BANGLADESH BORDER: MAHESHKHALI,22.132,92.771,33.0,4.6,4.6,True,1.0,100.0,2.0,INDIA,60,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,5552,2000,1,11,23.0,43.0,56.4,CHINA: LIAONING PROVINCE,40.498,122.994,10.0,5.1,4.9,True,3.0,8800.0,4.0,CHINA,30,30.0,1.0,3600.0,4.0,4.0,5.1,4.7,30.0,1.0,3600.0,4.0,,,,,,,,,,,,,,,,
2,5553,2000,1,14,23.0,37.0,7.8,CHINA: YUNNAN PROVINCE: YAOAN COUNTY,25.607,101.063,33.0,5.9,5.4,True,4.0,0.0,,CHINA,30,2528.0,4.0,41000.0,4.0,,5.9,5.9,2528.0,4.0,41000.0,4.0,5.0,1.0,73.5,5.5,7.0,1.0,73.5,,,,,,,,,
3,5554,2000,2,2,22.0,58.0,1.5,"IRAN: BARDASKAN, KASHMAR",35.288,58.218,33.0,5.3,5.1,True,2.0,200.0,3.0,IRAN,140,15.0,1.0,100.0,2.0,2.0,5.3,5.3,15.0,1.0,100.0,2.0,1.0,1.0,,,1.0,1.0,,,,,,,,,,
4,5555,2000,2,7,19.0,34.0,57.0,SOUTH AFRICA; SWAZILAND: MBABANE-MANZINI,-26.288,30.888,5.0,4.5,4.5,True,1.0,0.0,,SOUTH AFRICA,10,1.0,1.0,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,


In [125]:
# Report every column whose name is a proper substring of another column's
# name; these are candidates for being merged.
column_labels = list(df.columns)
for shorter in column_labels:
    for longer in column_labels:
        if longer == shorter or shorter not in longer:
            continue
        print(f"{shorter} is contained in {longer}")

injuries is contained in injuriesAmountOrder
injuries is contained in injuriesTotal
injuries is contained in injuriesAmountOrderTotal
injuriesAmountOrder is contained in injuriesAmountOrderTotal
housesDestroyed is contained in housesDestroyedAmountOrder
housesDestroyed is contained in housesDestroyedTotal
housesDestroyed is contained in housesDestroyedAmountOrderTotal
housesDestroyedAmountOrder is contained in housesDestroyedAmountOrderTotal
housesDamagedAmountOrder is contained in housesDamagedAmountOrderTotal
deaths is contained in deathsAmountOrder
deaths is contained in deathsTotal
deaths is contained in deathsAmountOrderTotal
deathsAmountOrder is contained in deathsAmountOrderTotal
damageMillionsDollars is contained in damageMillionsDollarsTotal
missing is contained in missingAmountOrder
missing is contained in missingTotal
missing is contained in missingAmountOrderTotal
missingAmountOrder is contained in missingAmountOrderTotal


In [126]:
# Number of columns before the remaining pairs are merged.
print(df.shape[1])

46


In [127]:
# Merge a column into its counterpart; rows where the two disagree are averaged.
def combine_columns(compare_column_name, column_name, df=None):
    """Fold ``compare_column_name`` into ``column_name`` and drop the former.

    Per row:

    1. Each column's missing values are filled from the other column.
    2. If the two values still differ, ``column_name`` receives their mean.
    3. Anything still missing (both columns empty) becomes 0.

    NOTE(review): when the two columns hold small ordinal codes, averaging two
    different codes produces fractional values (for example 2.5) - confirm that
    this is acceptable for such columns.

    Parameters
    ----------
    compare_column_name : str
        Column merged into ``column_name``; it is absent from the result.
    column_name : str
        Column that receives the merged values.
    df : pandas.DataFrame, optional
        Frame to operate on. It is never modified. When omitted, the
        module-level ``df`` is looked up at call time (a ``df=df`` default
        would be evaluated once, at definition time, and go stale as soon as
        the notebook rebinds ``df``).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column_name`` merged and ``compare_column_name``
        removed.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    ValueError
        If ``df`` is omitted and no module-level ``df`` exists.
    """
    if df is None:
        df = globals().get("df")
        if df is None:
            raise ValueError("combine_columns() needs a DataFrame: pass df explicitly")

    # Mutual gap-filling, computed on local Series so the input stays untouched.
    kept = df[column_name].fillna(df[compare_column_name])
    other = df[compare_column_name].fillna(kept)

    # Keep agreeing rows as they are; disagreeing rows become the mean.
    merged = kept.where(kept == other, (kept + other) / 2)
    merged = merged.fillna(other).fillna(0)

    result = df.copy()
    result[column_name] = merged
    return result.drop(columns=[compare_column_name])

# (column to fold in, column that is kept) pairs, merged in this order.
# NOTE(review): the injuries / housesDestroyed / deaths "AmountOrder" columns
# are not paired with their "...AmountOrderTotal" counterparts here, unlike
# housesDamagedAmountOrder and missingAmountOrder - confirm this is intended.
column_pairs = [
    ("injuries", "injuriesTotal"),
    ("housesDestroyed", "housesDestroyedTotal"),
    ("housesDamagedAmountOrder", "housesDamagedAmountOrderTotal"),
    ("deaths", "deathsTotal"),
    ("damageMillionsDollars", "damageMillionsDollarsTotal"),
    ("missing", "missingTotal"),
    ("missingAmountOrder", "missingAmountOrderTotal"),
]
for folded_name, kept_name in column_pairs:
    df = combine_columns(folded_name, kept_name, df)

print(len(df.columns))

39


In [129]:
# Replace every remaining missing value in the frame with 0.
# NOTE(review): this applies to all columns, not only counts - e.g. the
# eqMag* magnitudes, the event-id columns and `area` (rendered as integer 0
# rather than 0.0 in the preview, so presumably non-numeric). Confirm that 0
# is an acceptable stand-in for "unknown" in those columns.
df = df.fillna(0)
# First rows of the cleaned frame, shown as the cell result.
df.head()

Unnamed: 0,id,year,month,day,hour,minute,second,locationName,latitude,longitude,eqDepth,eqMagnitude,eqMagMb,publish,damageAmountOrderTotal,housesDamagedTotal,housesDamagedAmountOrderTotal,country,regionCode,injuriesAmountOrder,housesDestroyedAmountOrder,eqMagMw,eqMagMs,injuriesTotal,injuriesAmountOrderTotal,housesDestroyedTotal,housesDestroyedAmountOrderTotal,deathsAmountOrder,eqMagMl,deathsTotal,deathsAmountOrderTotal,damageMillionsDollarsTotal,tsunamiEventId,intensity,volcanoEventId,area,missingTotal,missingAmountOrderTotal,eqMagUnk
0,5551,2000,1,3,22.0,34.0,12.6,INDIA-BANGLADESH BORDER: MAHESHKHALI,22.132,92.771,33.0,4.6,4.6,True,1.0,100.0,2.0,INDIA,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
1,5552,2000,1,11,23.0,43.0,56.4,CHINA: LIAONING PROVINCE,40.498,122.994,10.0,5.1,4.9,True,3.0,8800.0,4.0,CHINA,30,1.0,4.0,5.1,4.7,30.0,1.0,3600.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
2,5553,2000,1,14,23.0,37.0,7.8,CHINA: YUNNAN PROVINCE: YAOAN COUNTY,25.607,101.063,33.0,5.9,5.4,True,4.0,0.0,0.0,CHINA,30,4.0,4.0,5.9,5.9,2528.0,4.0,41000.0,4.0,1.0,5.5,6.0,1.0,73.5,0.0,0.0,0.0,0,0.0,0.0,0.0
3,5554,2000,2,2,22.0,58.0,1.5,"IRAN: BARDASKAN, KASHMAR",35.288,58.218,33.0,5.3,5.1,True,2.0,200.0,2.5,IRAN,140,1.0,2.0,5.3,5.3,15.0,1.0,100.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
4,5555,2000,2,7,19.0,34.0,57.0,SOUTH AFRICA; SWAZILAND: MBABANE-MANZINI,-26.288,30.888,5.0,4.5,4.5,True,1.0,0.0,0.0,SOUTH AFRICA,10,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
