In [None]:
from bs4 import BeautifulSoup as BS
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns

import geopandas as gpd

import datetime

In [None]:
# Download the Wikipedia article that holds the earthquake table and parse it.
request = urllib.request.Request(
    "https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900",
    # Identify the client explicitly instead of relying on urllib's default
    # User-Agent, which some servers refuse.
    headers={"User-Agent": "earthquake-notebook/1.0 (urllib; data analysis)"},
)
# The context manager closes the HTTP response even if read() raises, and the
# timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(request, timeout=30) as result:
    resulttext = result.read()
soup = BS(resulttext, 'html.parser')
type(soup)

In [None]:
table = soup.find_all('table', class_="sortable wikitable")
#table
type(table)
#len(table)

In [None]:
table_2 = table[0]
type(table_2)

In [None]:
table_headers = table_2.find_all("th")

In [None]:
type(table_headers)

In [None]:
columns = [th.text for th in table_headers]

In [None]:
type(columns)

In [None]:
rows = []
for tr in table_2.find_all("tr")[1:]:
    row_data = [td.text for td in tr.find_all("td")]
    rows.append(row_data)

In [None]:
rows

In [None]:
earthquakes = pd.DataFrame(rows, columns = columns)

In [None]:
earthquakes.info()

In [None]:
earthquakes = earthquakes.replace("", np.nan)

In [None]:
earthquakes.info()

In [None]:
earthquakes["Other Source Deaths"].value_counts()

In [None]:
earthquakes[earthquakes["Other Source Deaths"].str.contains("231000", na = False)]

In [None]:
earthquakes[earthquakes["Other Source Deaths"].str.contains("26271", na = False)]

In [None]:
earthquakes[earthquakes["Present-day country and link to Wikipedia article"].str.contains("Iran", na = False)]

In [None]:
earthquakes.info()

In [None]:
test = "(\[\d+\]|\(.*\)|\*|\+)"

In [None]:
earthquakes["Other Source Deaths"] = earthquakes["Other Source Deaths"].str.replace(test, "")
#footnotes = earthquakes["Other Source Deaths"].str.extract("(?P<footnotes>\[\d+\]|\(.*\)|\*|\+)", expand = False)
#earthquakes["footnotes"] = footnotes.
#footnotes.unique()

In [None]:
earthquakes

In [None]:
earthquakes["Other Source Deaths"].value_counts()

In [None]:
#This can probably be combined with previous regex to consolidate code.
earthquakes["Other Source Deaths"] = earthquakes["Other Source Deaths"].str.replace(",", "")

In [None]:
earthquakes["Other Source Deaths"].value_counts()

In [None]:
earthquakes[earthquakes["Present-day country and link to Wikipedia article"].str.contains("Iran", na = False)]

In [None]:
means = []
for row in earthquakes["Other Source Deaths"]:
    print("\noriginal: {}".format(type(row)))
    #if isinstance(row, float):
    #    int_list = [str(row)]
    if isinstance(row, str):
        #print(len(row))
        int_list = row.strip().split(" ")
        print("transformed: {}".format(int_list))
        #row.split(" ")
        #make_int = list(map(int, int_list))
        #print("new type:", make_int)
        total = sum(list(map(int, int_list)))
        number = len(int_list)
        mean = total/number
        means.append(mean)
        print(mean)
    else:
        means.append(row)
        
earthquakes["Other Source Deaths"] = means
    
    #int_list = row.str.split(" ")
    #print(int_list)

In [None]:
means

In [None]:
earthquakes["Other Source Deaths"].value_counts()

In [None]:
"""There are some bad characters in this column, so this regex just grabs the int"""
for col in ['EM-DAT Total Deaths', 'Magnitude']:
    earthquakes[col] = earthquakes[col].str.extract('(\d+\.\d+)', expand=True)

In [None]:
def nan_to_int(df, columns, value=0, inplace=True):
    """Convert the given columns of ``df`` to numeric floats, filling missing values.

    Each column is parsed with ``pd.to_numeric``, its missing entries are replaced
    by ``value``, and the result is downcast with ``downcast='float'``. Despite
    the function name, the columns end up as floats, not integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the converted columns are written back into it.
    columns : iterable of str
        Labels of the columns to convert.
    value : scalar, default 0
        Replacement for missing values.
    inplace : bool, default True
        Kept for backward compatibility. The converted column is always
        assigned back to ``df`` and the fill is always applied, whatever
        this flag is set to.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a column holds text that ``pd.to_numeric`` cannot parse.
    """
    for col in columns:
        # Parse first, fill second: filling a text column with a number is not
        # accepted by every pandas string dtype.
        parsed = pd.to_numeric(df[col])
        # Assign the result instead of calling fillna(inplace=True) on df[col]:
        # that chained call acts on a temporary object and is not guaranteed to
        # reach df (and its result was discarded when inplace was False).
        df[col] = pd.to_numeric(parsed.fillna(value), downcast='float')

In [None]:
columns_to_convert = ['PDE Total Deaths', 'PDE Shaking Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', 'Magnitude']
nan_to_int(df=earthquakes, columns=columns_to_convert)

In [None]:
earthquakes.info()

In [None]:
subdf = earthquakes[["PDE Total Deaths", "Utsu Total Deaths", "EM-DAT Total Deaths", "Other Source Deaths"]]
subdf.head(1)

In [None]:
earthquakes["deaths"] = subdf.max(axis = 1)

In [None]:
earthquakes.head()

In [None]:
earthquakes.columns = ["UTC", "Country", "Lat", "Long", "Depth_km", "Mag", "Secondary_Effects", "PDE_Shaking_Deaths", "PDE_Deaths", "Utsu_Deaths", "EM-DAT_Deahts", "Other_Deaths", "Max_Deaths"]

In [None]:
test2="(\s\(.*)"
earthquakes['Country']= earthquakes['Country'].str.replace(test2, "")


In [None]:
earthquakes[earthquakes["Country"].str.contains("Alaska", case = False)]

In [None]:
earthquakes.info()

In [None]:
earthquakes["UTC"] = pd.to_datetime(earthquakes["UTC"])

In [None]:
earthquakes["UTC"]

In [None]:
earthquakes["year"] = pd.DatetimeIndex(earthquakes["UTC"]).year
#earthquakes["month"] = pd.DatetimeIndex(earthquakes["UTC"]).month
#earthquakes["month"] = earthquakes.month.map("{:02}".format)
#earthquakes["day"] = pd.DatetimeIndex(earthquakes["UTC"]).day
#earthquakes["date"] = earthquakes["year"].astype(str)+earthquakes["month"].astype(str)+earthquakes["day"].astype(str)
#earthquakes["date"] = earthquakes.date.astype(int)

In [None]:
earthquakes.head(1)

In [None]:
earthquakes.info()

earthquakes['date'] = pd.DatetimeIndex(earthquakes.UTC).normalize()

earthquakes["date"]

In [None]:
quakes_country = earthquakes.groupby("Country").agg(len)
quakes_country

In [None]:
quakes_country = quakes_country.reset_index()
#quakes_country
quakes_country_25 = quakes_country.nlargest(25, columns = "UTC")
#quakes_country_25

f, ax = plt.subplots(figsize=(15, 10))
#plt.ylabel('Number of Quakes')
quake_count = sns.barplot(quakes_country_25.Country, quakes_country_25.UTC)
quake_count.set_xticklabels(quake_count.get_xticklabels(), rotation=30, ha="right")

In [None]:
country_list = quakes_country_25["Country"].tolist()
# Exact membership test. Joining the names into one regex for str.contains also
# selects any country whose name merely contains one of them, and misreads
# regex metacharacters that occur in a name.
# .copy() makes earthquakes_25 an independent frame, so the column assignment
# below does not write into a slice of `earthquakes`.
earthquakes_25 = earthquakes[earthquakes["Country"].isin(country_list)].copy()
earthquakes_25.info()

earthquakes_25['UTC'] = earthquakes_25['UTC'].astype('datetime64[ns]')
earthquakes_25.info()

# Distribution of all events (not only the top-25 subset) over time.
plt.hist(earthquakes["UTC"])

In [None]:
sns.set(font_scale = 1.5)
sns.set_style("white")
plt.subplots(figsize=(15,10))
sns.stripplot(x = "year", y = "Country", data = earthquakes_25, jitter=True)

sns.stripplot(x="Country", y="UTC", data=earthquakes_25)

In [None]:
type(earthquakes_25["UTC"])

In [None]:
# Histogram of the per-event maximum death toll with a logarithmic count axis.
# log=True replaces plt.yscale('log', nonposy='clip'): the `nonposy` keyword was
# renamed to `nonpositive` and later removed from Matplotlib, while `log=True`
# is accepted regardless of version.
plt.hist(earthquakes_25["Max_Deaths"], bins = 50, log = True)

In [None]:
plt.figure(figsize=(20,10))
plt.hist(earthquakes["Depth_km"], bins = 100)

In [None]:
earthquakes["Depth_km"].value_counts()

In [None]:
earthquakes_cont = earthquakes[["Depth_km", "Mag", "Max_Deaths", "year"]]

In [None]:
earthquakes_cont.iloc[413]

In [None]:
earthquakes_cont = earthquakes_cont.replace("?", np.nan)

In [None]:
earthquakes_cont["Depth_km"] = pd.to_numeric(earthquakes_cont["Depth_km"])

In [None]:
earthquakes_cont.info()

In [None]:
earthquakes_cont = earthquakes_cont.fillna(0)

In [None]:
sns.pairplot(earthquakes_cont, dropna = True)