In [1]:
from bs4 import BeautifulSoup as BS
import urllib as url
import pandas as pd
import numpy as np

In [2]:
# Initializing Request object by passing in the earthquake's url, and assigning the reults to resulttext
request = url.request.Request("https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900")
result = url.request.urlopen(request)
resulttext = result.read()

In [3]:
# Initialize a BeautifulSoup instance of the sring-ed version of the website's html
soup = BS(resulttext, 'html.parser')

In [4]:
# There are six different tables on the website, so I am targetting the table with specific classes. 
# This returns a list, so I assert that the list length is equal to one (so I know I am only working with one table),
# then assign the first index to my variable to reference.
earthquake_table_list = soup.find_all('table', class_=['sortable', 'wikitable', 'jquery-tablesorter'])
assert len(earthquake_table_list) == 1

earthquake_table = earthquake_table_list[0]

In [5]:
# Grab all of the 'th' elements, grab the text from each one, and assign the results to a list. 
# This will be used when creating the pandas df for column name assignment
table_header = earthquake_table.find_all('th')

columns = [th.text for th in table_header]

In [6]:
print(columns)

['Origin (UTC)', 'Present-day country and link to Wikipedia article', 'Lat', 'Long', 'Depth (km)', 'Magnitude', 'Secondary Effects', 'PDE Shaking Deaths', 'PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', 'Other Source Deaths']


In [7]:
# Find all table rows
table_rows = earthquake_table.find_all('tr')[1:]

# Create an empty list that will ultimately hold.. more lists. The nested lists will hold each individual value per 
# column, per row
df_rows = list()
for row in table_rows:
    # Grab the text from each row, and assign it to a list. Append this list to df_rows
    row_data= [data.text for data in row.find_all('td')]
    
    df_rows.append(row_data)

In [8]:
for row in df_rows[:5]:
    print(row)

['1900-05-11 17:23', 'Japan', '38.700', '141.100', '5', '7.0 MJMA', '', '', '', '', '', '']
['1900-07-12 06:25', 'Turkey', '40.300', '43.100', '', '5.9 Muk', '', '', '', '140', '', '']
['1900-10-29 09:11', 'Venezuela', '11.000', '-66.000', '0', '7.7 Mw', '', '', '', '', '', '']
['1901-02-15 00:00', 'China', '26.000', '100.100', '0', '6.5 Ms', '', '', '', '', '', '']
['1901-03-31 07:11', 'Bulgaria', '43.400', '28.700', '', '6.4 Muk', '', '', '', '4', '', '']


In [90]:
# Create a pandas df using the columns and rows we scraped.
earthquake_df = pd.DataFrame(data=df_rows, columns=columns)

# Drop 'Other Source Deaths' column, per request from README.md
earthquake_df.drop(columns=['Other Source Deaths'], inplace=True)

earthquake_df['Magnitude'] = earthquake_df['Magnitude'].str.extract('(?P<Magnitude>\d)', expand=True)
earthquake_df['Magnitude'] = pd.to_numeric(earthquake_df['Magnitude'])

# Since the 'Origin (UTC)' column is a date string, I am transforming it to a datetime object and assigning it 
# to the index. I can imagine using datetime range queries later on, and converting these now seems appropriate
# and will be much easier later on. 
earthquake_df['Origin (UTC)'] = pd.to_datetime(earthquake_df['Origin (UTC)'])
earthquake_df.set_index('Origin (UTC)', inplace=True)

# Replace the remaining empty strings to np.nan values
# earthquake_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
earthquake_df.replace("", np.nan, inplace=True)

earthquake_df['PDE Total Deaths'] = pd.to_numeric(earthquake_df['PDE Total Deaths'])
earthquake_df['PDE Total Deaths'].fillna(value=-1, inplace=True)

earthquake_df['PDE Shaking Deaths'] = pd.to_numeric(earthquake_df['PDE Shaking Deaths'])
earthquake_df['PDE Shaking Deaths'].fillna(value=-1, inplace=True)

earthquake_df['Utsu Total Deaths'] = pd.to_numeric(earthquake_df['Utsu Total Deaths'])
earthquake_df['Utsu Total Deaths'].fillna(value=-1, inplace=True)

earthquake_df['EM-DAT Total Deaths'] = earthquake_df['EM-DAT Total Deaths'].str.find('[\d+]')
earthquake_df['EM-DAT Total Deaths'] = pd.to_numeric(earthquake_df['EM-DAT Total Deaths'])
earthquake_df['EM-DAT Total Deaths'].fillna(value=-1, inplace=True)

earthquake_df.columns

Index(['Present-day country and link to Wikipedia article', 'Lat', 'Long',
       'Depth (km)', 'Magnitude', 'Secondary Effects', 'PDE Shaking Deaths',
       'PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths'],
      dtype='object')

In [88]:
earthquake_df.head()

Unnamed: 0_level_0,Present-day country and link to Wikipedia article,Lat,Long,Depth (km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths
Origin (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0,,-1.0,-1.0,-1.0,-1.0
1900-07-12 06:25:00,Turkey,40.3,43.1,,5.0,,-1.0,-1.0,140.0,-1.0
1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.0,,-1.0,-1.0,-1.0,-1.0
1901-02-15 00:00:00,China,26.0,100.1,0.0,6.0,,-1.0,-1.0,-1.0,-1.0
1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.0,,-1.0,-1.0,4.0,-1.0


In [79]:
earthquake_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1340 entries, 1900-05-11 17:23:00 to 2011-09-23 10:41:00
Data columns (total 10 columns):
Present-day country and link to Wikipedia article    1340 non-null object
Lat                                                  1326 non-null object
Long                                                 1326 non-null object
Depth (km)                                           1250 non-null object
Magnitude                                            1339 non-null float64
Secondary Effects                                    373 non-null object
PDE Shaking Deaths                                   1340 non-null float64
PDE Total Deaths                                     1340 non-null float64
Utsu Total Deaths                                    1027 non-null object
EM-DAT Total Deaths                                  560 non-null object
dtypes: float64(3), object(7)
memory usage: 115.2+ KB


In [83]:
earthquake_df['EM-DAT Total Deaths'].unique()

array([nan, '2000', '20000', '2500', '400', '12000', '75000', '923', '20',
       '29980', '1800', '10000', '100', '116', '180000', '1000', '5000',
       '143000', '2925', '200000', '107', '3300', '500', '0', '36', '256',
       '6', '3008', '6000', '3410', '60000', '9', '149', '30000', '32962',
       '249', '980', '200', '3000', '213', '1083', '2824', '3959', '998',
       '1961', '4000', '165', '73', '1400', '233', '27', '5131', '110000',
       '8', '437', '1500', '30', '14', '1200', '455', '7', '13', '1250',
       '39', '53', '160', '28', '191', '57', '131', '4', '25', '23', '38',
       '128', '10[7]|', '80', '2394', '120', '61', '54', '183', '240',
       '177', '19', '47', '11', '271', '29', '15', '41', '24', '150',
       '1086', '66794', '176', '65', '878', '85', '5057', '17', '78',
       '2385', '1', '23000', '50', '2', '10', '922', '420', '573',
       '242000', '16', '3840', '1641', '167', '352', '3', '185', '589',
       '21', '25000', '45', '5', '121', '34', '26', '35