In [73]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import numpy as np

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900')
result_text = response.text

In [14]:
soup = BS(result_text, 'html.parser')

In [21]:
type(soup)

bs4.BeautifulSoup

In [101]:
soup.findAll('th')

[<th width="22%">Origin (<a class="mw-redirect" href="/wiki/UTC" title="UTC">UTC</a>)</th>,
 <th width="33%">Present-day country and link to Wikipedia article</th>,
 <th width="10%"><a href="/wiki/Latitude" title="Latitude">Lat</a></th>,
 <th width="10%"><a href="/wiki/Longitude" title="Longitude">Long</a></th>,
 <th width="8%">Depth (<a class="mw-redirect" href="/wiki/Km" title="Km">km</a>)</th>,
 <th width="6%">Magnitude</th>,
 <th width="9%">Secondary Effects</th>,
 <th width="10%">PDE Shaking Deaths</th>,
 <th width="10%">PDE Total Deaths</th>,
 <th width="10%">Utsu Total Deaths</th>,
 <th width="10%">EM-DAT Total Deaths</th>,
 <th width="12%">Other Source Deaths
 </th>]

In [41]:
column_headers = [th.getText() for th in 
                  soup.findAll('th')]

In [42]:
column_headers

['Origin (UTC)',
 'Present-day country and link to Wikipedia article',
 'Lat',
 'Long',
 'Depth (km)',
 'Magnitude',
 'Secondary Effects',
 'PDE Shaking Deaths',
 'PDE Total Deaths',
 'Utsu Total Deaths',
 'EM-DAT Total Deaths',
 'Other Source Deaths\n']

In [108]:
soup.findAll('tr')[6:] 

[<tr>
 <td>1900-05-11 17:23</td>
 <td>Japan</td>
 <td>38.700</td>
 <td>141.100</td>
 <td>5</td>
 <td>7.0 <a href="/wiki/Seismic_magnitude_scales#Mjma" title="Seismic magnitude scales"><span title="JMA mag.">M<sub>JMA</sub></span></a></td>
 <td></td>
 <td></td>
 <td></td>
 <td></td>
 <td></td>
 <td>
 </td></tr>, <tr>
 <td>1900-07-12 06:25</td>
 <td>Turkey</td>
 <td>40.300</td>
 <td>43.100</td>
 <td></td>
 <td>5.9 <a href="/wiki/Seismic_magnitude_scales#Muk" title="Seismic magnitude scales"><span title="Unknown mag. scale">M<sub>uk</sub></span></a></td>
 <td></td>
 <td></td>
 <td></td>
 <td>140</td>
 <td></td>
 <td>
 </td></tr>, <tr>
 <td>1900-10-29 09:11</td>
 <td>Venezuela</td>
 <td>11.000</td>
 <td>-66.000</td>
 <td>0</td>
 <td>7.7 <a href="/wiki/Seismic_magnitude_scales#Mw" title="Seismic magnitude scales"><span title="Moment mag. scale">M<sub>w</sub></span></a></td>
 <td></td>
 <td></td>
 <td></td>
 <td></td>
 <td></td>
 <td>
 </td></tr>, <tr>
 <td>1901-02-15 00:00</td>
 <td>China</

In [48]:
data_rows = soup.findAll('tr')[6:] 

In [49]:
type(data_rows)

list

In [50]:
earthquake_data = [[td.getText() for td in data_rows[i].findAll('td')]
            for i in range(len(data_rows))]

In [80]:
df = pd.DataFrame(earthquake_data, columns=column_headers)

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 12 columns):
Origin (UTC)                                         1340 non-null object
Present-day country and link to Wikipedia article    1340 non-null object
Lat                                                  1340 non-null object
Long                                                 1340 non-null object
Depth (km)                                           1340 non-null object
Magnitude                                            1340 non-null object
Secondary Effects                                    1340 non-null object
PDE Shaking Deaths                                   1340 non-null object
PDE Total Deaths                                     1340 non-null object
Utsu Total Deaths                                    1340 non-null object
EM-DAT Total Deaths                                  1340 non-null object
Other Source Deaths
                                 1338 non-null object
dtype

In [82]:
df.columns = ['origin','country','lat','lng','depth','magnitude','secondary_effects','pde_shaking_deaths','pde_total_deaths','utsu_total_deaths','emdat_total_deaths','other_deaths']

In [99]:
df.head()

Unnamed: 0,origin,country,lat,lng,depth,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,emdat_total_deaths,other_deaths
0,1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0 MJMA,,,,,,\n
1,1900-07-12 06:25:00,Turkey,40.3,43.1,,5.9 Muk,,,,140.0,,\n
2,1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.7 Mw,,,,,,\n
3,1901-02-15 00:00:00,China,26.0,100.1,0.0,6.5 Ms,,,,,,\n
4,1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.4 Muk,,,,4.0,,\n


In [84]:
df['origin'] = pd.to_datetime(df.origin)

In [85]:
df.head()

Unnamed: 0,origin,country,lat,lng,depth,magnitude,secondary_effects,pde_shaking_deaths,pde_total_deaths,utsu_total_deaths,emdat_total_deaths,other_deaths
0,1900-05-11 17:23:00,Japan,38.7,141.1,5.0,7.0 MJMA,,,,,,\n
1,1900-07-12 06:25:00,Turkey,40.3,43.1,,5.9 Muk,,,,140.0,,\n
2,1900-10-29 09:11:00,Venezuela,11.0,-66.0,0.0,7.7 Mw,,,,,,\n
3,1901-02-15 00:00:00,China,26.0,100.1,0.0,6.5 Ms,,,,,,\n
4,1901-03-31 07:11:00,Bulgaria,43.4,28.7,,6.4 Muk,,,,4.0,,\n


In [87]:
df = df.replace('', np.NaN)

In [98]:
df.other_deaths.str[:-1]


0                                   
1                                   
2                                   
3                                   
4                                   
5                                   
6                                   
7                                   
8                                   
9                                   
10                                  
11                                  
12                                  
13                                  
14                                  
15                                  
16                              3500
17                                  
18                                  
19                                  
20                                  
21                                  
22                                  
23                                  
24                                  
25                                  
26                                  
2