In [1]:
from bs4 import BeautifulSoup as BS
import urllib as url
import pandas as pd
import numpy as np  
import re

In [2]:
request = url.request.Request("https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900")
result = url.request.urlopen(request)
resulttext = result.read()

In [3]:
soup = BS(resulttext, 'html.parser')


In [4]:
# the html has a number of tables and we want to  have a table that we are looking for , so that we explictly 
#pass the characterstics of the tables  through class_ as an argument . We find the characterstics from the html 

earthquake_table_list = soup.find_all('table', class_=['sortable', 'wikitable', 'jquery-tablesorter'])

# we will assert if we have got the table that we want for , we wnat only one table

assert len(earthquake_table_list) == 1

#  since we have the list , we  want to convert the list in to table (have to check this again)

earthquake_table = earthquake_table_list[0]

In [5]:
# once we have the table, we want to extract the table header represented as 'th' by using the find_all, find_all 

table_head = earthquake_table.find_all('th')

# here we are assigning the table for  header by looping through table head . 
#The result of the .text() method is a string containing the combined text of all matched elements.

columns = [th.text for th in table_head]

In [6]:
print(columns)

['Origin (UTC)', 'Present-day country and link to Wikipedia article', 'Lat', 'Long', 'Depth (km)', 'Magnitude', 'Secondary Effects', 'PDE Shaking Deaths', 'PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', 'Other Source Deaths']


In [7]:
# find_all the rows of the table and skip the first column as the table header is one child it self.

table_rows = earthquake_table.find_all('tr')[1:]

#do a loop , create an empty set for  and store it as df_rows

df_rows = list()
for row in table_rows:
    row_data= [data.text for data in row.find_all('td')]
    
    df_rows.append(row_data)

In [8]:
for row in df_rows[:5]:
    print(row)

['1900-05-11 17:23', 'Japan', '38.700', '141.100', '5', '7.0 MJMA', '', '', '', '', '', '']
['1900-07-12 06:25', 'Turkey', '40.300', '43.100', '', '5.9 Muk', '', '', '', '140', '', '']
['1900-10-29 09:11', 'Venezuela', '11.000', '-66.000', '0', '7.7 Mw', '', '', '', '', '', '']
['1901-02-15 00:00', 'China', '26.000', '100.100', '0', '6.5 Ms', '', '', '', '', '', '']
['1901-03-31 07:11', 'Bulgaria', '43.400', '28.700', '', '6.4 Muk', '', '', '', '4', '', '']


In [9]:
#   the next step is creating a data frame , where we combine the two tables  row and column tables.# the data and the header 
earthquake_df = pd.DataFrame(data=df_rows, columns=columns)

earthquake_df.head()

Unnamed: 0,Origin (UTC),Present-day country and link to Wikipedia article,Lat,Long,Depth (km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths
0,1900-05-11 17:23,Japan,38.7,141.1,5.0,7.0 MJMA,,,,,,
1,1900-07-12 06:25,Turkey,40.3,43.1,,5.9 Muk,,,,140.0,,
2,1900-10-29 09:11,Venezuela,11.0,-66.0,0.0,7.7 Mw,,,,,,
3,1901-02-15 00:00,China,26.0,100.1,0.0,6.5 Ms,,,,,,
4,1901-03-31 07:11,Bulgaria,43.4,28.7,,6.4 Muk,,,,4.0,,


In [10]:
#earthquake_df.replace(r'^\s*$', np.nan, inplace = True)

In [11]:
earthquake_df.replace('', np.NaN)

Unnamed: 0,Origin (UTC),Present-day country and link to Wikipedia article,Lat,Long,Depth (km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths
0,1900-05-11 17:23,Japan,38.700,141.100,5,7.0 MJMA,,,,,,
1,1900-07-12 06:25,Turkey,40.300,43.100,,5.9 Muk,,,,140,,
2,1900-10-29 09:11,Venezuela,11.000,-66.000,0,7.7 Mw,,,,,,
3,1901-02-15 00:00,China,26.000,100.100,0,6.5 Ms,,,,,,
4,1901-03-31 07:11,Bulgaria,43.400,28.700,,6.4 Muk,,,,4,,
5,1901-08-09 09:23,Japan,40.500,142.500,35,7.2 Mw,T,,,,,
6,1901-11-15 20:15,New Zealand (see 1901 Cheviot earthquake),-43.000,173.000,0,6.8 Ms,,,,1,,
7,1902-01-30 14:01,Japan,40.500,141.300,35,6.9 Ms,,,,1,,
8,1902-02-13 09:39,Azerbaijan,40.700,48.600,15,6.9 Muk,,,,86,,
9,1902-03-09 07:46,Turkey,40.700,33.600,,5.5 Muk,,,,4,,


In [12]:
earthquake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 12 columns):
Origin (UTC)                                         1340 non-null object
Present-day country and link to Wikipedia article    1340 non-null object
Lat                                                  1340 non-null object
Long                                                 1340 non-null object
Depth (km)                                           1340 non-null object
Magnitude                                            1340 non-null object
Secondary Effects                                    1340 non-null object
PDE Shaking Deaths                                   1340 non-null object
PDE Total Deaths                                     1340 non-null object
Utsu Total Deaths                                    1340 non-null object
EM-DAT Total Deaths                                  1340 non-null object
Other Source Deaths                                  1338 non-null object
dtype

In [13]:
earthquake_df.columns=["Origin(UTC)","country","Lat","Long", "Depth(km)","Magnitude","Secondary Effects","PDE Shaking Deaths",
                       "PDE Total Deaths","Utsu Total Deaths","EM-DAT Total Deaths","Other Source Deaths"]

In [14]:
earthquake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 12 columns):
Origin(UTC)            1340 non-null object
country                1340 non-null object
Lat                    1340 non-null object
Long                   1340 non-null object
Depth(km)              1340 non-null object
Magnitude              1340 non-null object
Secondary Effects      1340 non-null object
PDE Shaking Deaths     1340 non-null object
PDE Total Deaths       1340 non-null object
Utsu Total Deaths      1340 non-null object
EM-DAT Total Deaths    1340 non-null object
Other Source Deaths    1338 non-null object
dtypes: object(12)
memory usage: 125.7+ KB


In [15]:
test="(\s\(.*)"
earthquake_df['country']= earthquake_df['country'].str.replace(test, "")


In [16]:
earthquake_df['country']

0                   Japan
1                  Turkey
2               Venezuela
3                   China
4                Bulgaria
5                   Japan
6             New Zealand
7                   Japan
8              Azerbaijan
9                  Turkey
10              Guatemala
11                  China
12                  China
13              Australia
14             Uzbekistan
15      Judea and Samaria
16                 Turkey
17                 Turkey
18                 Greece
19                   Iran
20                 Taiwan
21                 Greece
22                  China
23                 Taiwan
24                  India
25                Albania
26                  Japan
27                  Italy
28                 Greece
29                Ecuador
              ...        
1310                China
1311              Algeria
1312               Greece
1313                Japan
1314                China
1315               Greece
1316                China
1317        

In [24]:
earthquake_df['scale']=earthquake_df['Magnitude']

In [25]:
earthquake_df.head()

Unnamed: 0,Origin(UTC),country,Lat,Long,Depth(km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths,scale
0,1900-05-11 17:23,Japan,38.7,141.1,5.0,7.0 MJMA,,,,,,,7.0 MJMA
1,1900-07-12 06:25,Turkey,40.3,43.1,,5.9 Muk,,,,140.0,,,5.9 Muk
2,1900-10-29 09:11,Venezuela,11.0,-66.0,0.0,7.7 Mw,,,,,,,7.7 Mw
3,1901-02-15 00:00,China,26.0,100.1,0.0,6.5 Ms,,,,,,,6.5 Ms
4,1901-03-31 07:11,Bulgaria,43.4,28.7,,6.4 Muk,,,,4.0,,,6.4 Muk
