In [275]:
import re
import urllib as url
import urllib.request  # load the submodule explicitly so that `url.request` always resolves
from datetime import datetime

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as BS


In [276]:
# Download the Wikipedia page that lists deadly earthquakes since 1900.
request = url.request.Request("https://en.wikipedia.org/wiki/List_of_deadly_earthquakes_since_1900")

# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, instead of staying open for the session.
with url.request.urlopen(request) as result:
    resulttext = result.read()

In [277]:
# Parse the downloaded page bytes with the 'html.parser' backend.
soup = BS(markup=resulttext, features='html.parser')


In [278]:
# The page holds several tables, so the search is narrowed by CSS class
# (class names taken from the page's HTML). A list passed to class_ matches
# a tag that carries ANY of the listed names.
earthquake_table_list = soup.find_all(
    name='table',
    class_=['sortable', 'wikitable', 'jquery-tablesorter'],
)

# Exactly one table is expected; fail here if the search found none or several.
assert len(earthquake_table_list) == 1

# find_all returned a one-element result list; take the table element out of it.
earthquake_table = earthquake_table_list[0]

In [279]:
# Collect every header cell (<th>) found inside the table.
table_head = earthquake_table.find_all('th')

# The text content of each header cell becomes one column label.
columns = [header_cell.text for header_cell in table_head]

In [280]:
# Display the extracted header labels.
print(columns)

['Origin (UTC)', 'Present-day country and link to Wikipedia article', 'Lat', 'Long', 'Depth (km)', 'Magnitude', 'Secondary Effects', 'PDE Shaking Deaths', 'PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', 'Other Source Deaths']


In [281]:
# Every <tr> is one table row; the first <tr> is skipped because it is the header row.
table_rows = earthquake_table.find_all('tr')[1:]

# For each remaining row, gather the text of its <td> cells into a list,
# giving one list of cell strings per table row.
df_rows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows
]

In [282]:
# Print the first five parsed rows, one list per line.
for parsed_row in df_rows[:5]:
    print(parsed_row)

['1900-05-11 17:23', 'Japan', '38.700', '141.100', '5', '7.0 MJMA', '', '', '', '', '', '']
['1900-07-12 06:25', 'Turkey', '40.300', '43.100', '', '5.9 Muk', '', '', '', '140', '', '']
['1900-10-29 09:11', 'Venezuela', '11.000', '-66.000', '0', '7.7 Mw', '', '', '', '', '', '']
['1901-02-15 00:00', 'China', '26.000', '100.100', '0', '6.5 Ms', '', '', '', '', '', '']
['1901-03-31 07:11', 'Bulgaria', '43.400', '28.700', '', '6.4 Muk', '', '', '', '4', '', '']


In [283]:
# Build the DataFrame from the scraped body rows, labelled with the scraped header text.
earthquake_df = pd.DataFrame(data=df_rows, columns=columns)

# Preview only the first rows instead of rendering the whole frame.
earthquake_df.head(10)

Unnamed: 0,Origin (UTC),Present-day country and link to Wikipedia article,Lat,Long,Depth (km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths
0,1900-05-11 17:23,Japan,38.700,141.100,5,7.0 MJMA,,,,,,
1,1900-07-12 06:25,Turkey,40.300,43.100,,5.9 Muk,,,,140,,
2,1900-10-29 09:11,Venezuela,11.000,-66.000,0,7.7 Mw,,,,,,
3,1901-02-15 00:00,China,26.000,100.100,0,6.5 Ms,,,,,,
4,1901-03-31 07:11,Bulgaria,43.400,28.700,,6.4 Muk,,,,4,,
5,1901-08-09 09:23,Japan,40.500,142.500,35,7.2 Mw,T,,,,,
6,1901-11-15 20:15,New Zealand (see 1901 Cheviot earthquake),-43.000,173.000,0,6.8 Ms,,,,1,,
7,1902-01-30 14:01,Japan,40.500,141.300,35,6.9 Ms,,,,1,,
8,1902-02-13 09:39,Azerbaijan,40.700,48.600,15,6.9 Muk,,,,86,,
9,1902-03-09 07:46,Turkey,40.700,33.600,,5.5 Muk,,,,4,,


In [284]:
#earthquake_df.replace(r'^\s*$', np.nan, inplace = True)

In [285]:
# Treat empty cells as missing values.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
earthquake_df = earthquake_df.replace('', np.nan)

In [286]:
# Summarise the columns: non-null counts and dtypes after empty strings became NaN.
earthquake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 12 columns):
Origin (UTC)                                         1340 non-null object
Present-day country and link to Wikipedia article    1340 non-null object
Lat                                                  1326 non-null object
Long                                                 1326 non-null object
Depth (km)                                           1250 non-null object
Magnitude                                            1339 non-null object
Secondary Effects                                    373 non-null object
PDE Shaking Deaths                                   739 non-null object
PDE Total Deaths                                     750 non-null object
Utsu Total Deaths                                    1027 non-null object
EM-DAT Total Deaths                                  560 non-null object
Other Source Deaths                                  37 non-null object
dtypes: obj

In [287]:
# Rename only the columns whose labels change, keyed by their current name.
# Unlike overwriting `.columns` with a full list, this does not depend on the
# column order and leaves every other label untouched. Keys that are not present
# are ignored, so re-running the cell is harmless.
earthquake_df = earthquake_df.rename(columns={
    "Origin (UTC)": "Origin(UTC)",
    "Present-day country and link to Wikipedia article": "country",
    "Depth (km)": "Depth(km)",
})

In [288]:
# Re-check the frame summary to confirm the new column labels.
earthquake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 12 columns):
Origin(UTC)            1340 non-null object
country                1340 non-null object
Lat                    1326 non-null object
Long                   1326 non-null object
Depth(km)              1250 non-null object
Magnitude              1339 non-null object
Secondary Effects      373 non-null object
PDE Shaking Deaths     739 non-null object
PDE Total Deaths       750 non-null object
Utsu Total Deaths      1027 non-null object
EM-DAT Total Deaths    560 non-null object
Other Source Deaths    37 non-null object
dtypes: object(12)
memory usage: 125.7+ KB


In [289]:
# Normalise the country column so that each country appears under a single name.
# TODO: a decision is still needed on the Saudi Arabia/Egypt observation.
#
# The patterns are raw strings (so the backslashes reach the regex engine unchanged)
# and regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the column uncleaned.
# The replacements are applied in the order listed.
earthquake_df['country'] = (
    earthquake_df['country']
    # drop a trailing parenthesised note, e.g. " (see 1901 Cheviot earthquake)"
    .str.replace(r"\s\(.*", "", regex=True)
    # drop everything from the first comma on, e.g. ", 2005 Qeshm earthquake"
    .str.replace(r"\,.*", "", regex=True)
    # fold the two United States variants into "United States"
    .str.replace(r"United States Minor.*", "United States", regex=True)
    .str.replace(r"US T.*", "United States", regex=True)
    # NOTE(review): any remaining "(" and the text after it is replaced by "Venezuela";
    # presumably this targets a single entry that consists only of a parenthesised
    # note - confirm against the data, since text before the "(" would be kept.
    .str.replace(r"\(.*", "Venezuela", regex=True)
)

In [290]:
# Count the rows per country name. This tally is how the irregular names were spotted:
# a country followed by a comma (e.g. "Iran, 2005 Qeshm earthquake", 1 row) and
# country names followed by text in parentheses.
earthquake_df['country'].value_counts()

China                                  166
Indonesia                               96
Turkey                                  95
Iran                                    92
Japan                                   82
Peru                                    54
Taiwan                                  48
Mexico                                  46
Chile                                   44
Greece                                  44
Philippines                             39
Italy                                   37
United States                           37
Afghanistan                             30
Pakistan                                26
India                                   25
Colombia                                24
Algeria                                 21
Ecuador                                 19
Papua New Guinea                        17
Guatemala                               15
South Africa                            14
Russian Federation                      13
Venezuela  

In [291]:
#splitted_country=earthquake_df['country'].str.split('(')
#splitted_country
#type(splitted_country)
#earthquake_df['country']=splitted_country
#earthquake_df

In [292]:
# Add a 'scale' column holding a copy of the Magnitude text (e.g. "7.0 MJMA").
earthquake_df = earthquake_df.assign(scale=earthquake_df['Magnitude'])

In [293]:
# Preview the first rows (including the new 'scale' column) rather than the whole frame.
earthquake_df.head(10)

Unnamed: 0,Origin(UTC),country,Lat,Long,Depth(km),Magnitude,Secondary Effects,PDE Shaking Deaths,PDE Total Deaths,Utsu Total Deaths,EM-DAT Total Deaths,Other Source Deaths,scale
0,1900-05-11 17:23,Japan,38.700,141.100,5,7.0 MJMA,,,,,,,7.0 MJMA
1,1900-07-12 06:25,Turkey,40.300,43.100,,5.9 Muk,,,,140,,,5.9 Muk
2,1900-10-29 09:11,Venezuela,11.000,-66.000,0,7.7 Mw,,,,,,,7.7 Mw
3,1901-02-15 00:00,China,26.000,100.100,0,6.5 Ms,,,,,,,6.5 Ms
4,1901-03-31 07:11,Bulgaria,43.400,28.700,,6.4 Muk,,,,4,,,6.4 Muk
5,1901-08-09 09:23,Japan,40.500,142.500,35,7.2 Mw,T,,,,,,7.2 Mw
6,1901-11-15 20:15,New Zealand,-43.000,173.000,0,6.8 Ms,,,,1,,,6.8 Ms
7,1902-01-30 14:01,Japan,40.500,141.300,35,6.9 Ms,,,,1,,,6.9 Ms
8,1902-02-13 09:39,Azerbaijan,40.700,48.600,15,6.9 Muk,,,,86,,,6.9 Muk
9,1902-03-09 07:46,Turkey,40.700,33.600,,5.5 Muk,,,,4,,,5.5 Muk


In [294]:
#for date_str in earthquake_df["Origin(UTC)]":
   # Origin(UTC) = datetime.strptime(date_str,'%m/%d/%Y')
   # print(Origin(UTC))

In [295]:
#Data cleaning tasks include:

#Replace empty strings with NaN
#Remove the footnotes from the 'Other Source Deaths' column
#Convert Magnitude to a numeric
#Create a new column ('deaths') that evaluates the four total-death columns ('PDE Total Deaths', 'Utsu Total Deaths', 'EM-DAT Total Deaths', and 'Other Source Deaths') and populates the new column with the highest value.
#Explore the data in terms of when and where earthquakes occurred and how severe they were (magnitude, deaths, secondary effects).

In [296]:
# Inspect the 'Other Source Deaths' column as scraped, before any clean-up.
earthquake_df['Other Source Deaths']

0                                NaN
1                                NaN
2                                NaN
3                                NaN
4                                NaN
5                                NaN
6                                NaN
7                                NaN
8                                NaN
9                                NaN
10                               NaN
11                               NaN
12                               NaN
13                               NaN
14                               NaN
15                               NaN
16                              3500
17                               NaN
18                               NaN
19                               NaN
20                               NaN
21                               NaN
22                               NaN
23                               NaN
24                               NaN
25                               NaN
26                               NaN
2

In [297]:
# Remove footnote markers/annotations and thousands separators from 'Other Source Deaths'.
#
# The regex is a raw string and regex=True is explicit, because pandas >= 2.0 treats the
# pattern as a literal string by default and would otherwise leave the values unchanged.
earthquake_df['Other Source Deaths'] = (
    earthquake_df['Other Source Deaths']
    # strip: "[" and everything after it, "+" signs, " (" and everything after it, "*" marks
    .str.replace(r"\[.*|\+|\s\(.*|\*", "", regex=True)
    # strip commas between digits (plain literal, no regex needed)
    .str.replace(",", "", regex=False)
)

In [298]:
# Inspect the 'Other Source Deaths' column again after the clean-up.
earthquake_df['Other Source Deaths']

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
5          NaN
6          NaN
7          NaN
8          NaN
9          NaN
10         NaN
11         NaN
12         NaN
13         NaN
14         NaN
15         NaN
16        3500
17         NaN
18         NaN
19         NaN
20         NaN
21         NaN
22         NaN
23         NaN
24         NaN
25         NaN
26         NaN
27         NaN
28         NaN
29         NaN
         ...  
1310       NaN
1311       NaN
1312       NaN
1313       NaN
1314       NaN
1315       NaN
1316       NaN
1317        41
1318        60
1319       NaN
1320       215
1321        34
1322       295
1323       NaN
1324         1
1325        79
1326       189
1327      1115
1328    222517
1329       521
1330        42
1331         1
1332      2698
1333       181
1334       NaN
1335     15894
1336       150
1337       NaN
1338       111
1339       601
Name: Other Source Deaths, Length: 1340, dtype: object

In [299]:
# First step towards converting Magnitude to a numeric type: check the column's current dtype.
earthquake_df['Magnitude'].dtype

dtype('O')

In [300]:
# Distinct Python types of the values stored in the Magnitude column.
d_types = {type(entry) for entry in earthquake_df['Magnitude']}

In [301]:
# Show the set of value types found in Magnitude (the recorded output lists float and str).
d_types

{float, str}

In [320]:
# Split each Magnitude entry on a single space and keep the element at index 1
# (the scale label in entries such as "7.0 MJMA").
y = earthquake_df['Magnitude'].str.split(' ').str[1]
y

0       MJMA
1        Muk
2         Mw
3         Ms
4        Muk
5         Mw
6         Ms
7         Ms
8        Muk
9        Muk
10        Mw
11        Ms
12        Mw
13        ML
14       Muk
15       Muk
16        Ms
17       Muk
18       Muk
19       Muk
20       Muk
21       Muk
22        Ms
23       Muk
24        Mw
25       Muk
26        mb
27        Ms
28        Ms
29       Muk
        ... 
1310      Mw
1311      Mw
1312      Mw
1313      Mw
1314      mb
1315      Mw
1316      Ms
1317      Mw
1318      Mw
1319      Mw
1320      Mw
1321      Mw
1322      Mw
1323      Mw
1324      Mw
1325      Mw
1326      Mw
1327      Mw
1328      Mw
1329      Mw
1330      Mw
1331      Mw
1332      Mw
1333      Mw
1334      Mw
1335      Mw
1336      Mw
1337      Mw
1338      Mw
1339      Mw
Name: Magnitude, Length: 1340, dtype: object

In [302]:
# Sample magnitude-style string for trying out the split by hand.
x = '7.86 mg'

In [303]:
# Scratch check: split the sample string on a single space.
# This rebinds `y`, which the cell extracting the scale labels from Magnitude also assigns.
# NOTE(review): the ValueError recorded as this cell's output is not an error that
# `str.split(' ')` raises - presumably stale output from an earlier edit; re-run to confirm.
y = x.split(' ')

ValueError: split() requires a non-empty pattern match.

In [None]:
def magnitude_to_float(x, known_scales=("MJMA", "Muk", "Mw", "Ms", "ML", "mb")):
    """Return the numeric part of one magnitude entry such as ``'7.0 MJMA'``.

    The original draft had no function name, ignored its argument, split on an
    empty separator, compared against ``'MW'`` in six identical branches and
    returned an undefined ``NaN``. This version works on the single value passed in.

    Parameters
    ----------
    x : str or float
        One value from the Magnitude column, expected as ``'<number> <scale>'``.
        Non-string values (e.g. float NaN for a missing entry) are accepted.
    known_scales : collection of str, optional
        Scale labels regarded as valid. The default holds the six labels visible
        in the notebook's recorded output; pass a larger collection if the data
        contains other labels.

    Returns
    -------
    float
        The number preceding the scale label, or NaN when ``x`` is not a string,
        does not contain both a number and a scale, carries a scale label that is
        not in ``known_scales``, or has a first token that is not a number.
    """
    nan = float("nan")

    # Missing entries are not strings; there is nothing to parse.
    if not isinstance(x, str):
        return nan

    # Split on any whitespace so that repeated or non-breaking spaces from the
    # scraped HTML do not produce empty tokens.
    parts = x.split()
    if len(parts) < 2 or parts[1] not in known_scales:
        return nan

    try:
        return float(parts[0])
    except ValueError:
        # First token is not a valid number (e.g. stray footnote text).
        return nan
# NOTE(review): this statement looks unfinished - no column named '' is created anywhere
# in the visible notebook, so the lookup presumably raises KeyError.
# TODO: replace with the intended column expression.
earthquake_df['']