In [98]:
import pandas as pd
import numpy as np

In [99]:
# Load the earthquake catalogue from a CSV file in the working directory.
df = pd.read_csv("earthquake_1995-2023.csv")

In [100]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1000 non-null   object 
 1   magnitude  1000 non-null   float64
 2   date_time  1000 non-null   object 
 3   cdi        1000 non-null   int64  
 4   mmi        1000 non-null   int64  
 5   alert      449 non-null    object 
 6   tsunami    1000 non-null   int64  
 7   sig        1000 non-null   int64  
 8   net        1000 non-null   object 
 9   nst        1000 non-null   int64  
 10  dmin       1000 non-null   float64
 11  gap        1000 non-null   float64
 12  magType    1000 non-null   object 
 13  depth      1000 non-null   float64
 14  latitude   1000 non-null   float64
 15  longitude  1000 non-null   float64
 16  location   994 non-null    object 
 17  continent  284 non-null    object 
 18  country    651 non-null    object 
dtypes: float64(6), int64(5), object(8)
memory usage: 

In [101]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,green,0,657,us,114,7.177,25.0,mww,192.955,-13.8814,167.158,"Sola, Vanuatu",,Vanuatu
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,yellow,0,775,us,92,0.679,40.0,mww,69.727,12.814,-88.1265,"Intipucá, El Salvador",,
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,17-07-2023 03:05,7,5,green,0,899,us,70,1.634,28.0,mww,171.371,-38.1911,-70.3731,"Loncopué, Argentina",South America,Argentina
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,16-07-2023 06:48,6,6,green,1,860,us,173,0.907,36.0,mww,32.571,54.3844,-160.699,"Sand Point, Alaska",,
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,,1,820,at,79,0.879451,172.8,Mi,21.0,54.49,-160.796,Alaska Peninsula,,


##  Preprocessing

In [102]:
# Round both coordinate columns to two decimal places.
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = df[coord_col].round(2)

### Checking for Nulls


#### - Handling Location Nulls


In [103]:
# Collect the rows whose location is missing so they can be inspected.
missing_location = df['location'].isna()
df_loc_na = df.loc[missing_location]
df_loc_na

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
30,M 6.8 -,6.8,20-01-2023 22:09,4,3,green,0,718,us,60,5.129,61.0,mww,610.671,-26.72,-63.04,,South America,Argentina
37,M 7.0 -,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.0,-20.05,-178.35,,Oceania,Fiji
39,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.59,178.28,,,
54,M 6.9 -,6.9,19-05-2022 10:13,2,5,green,1,733,us,127,0.371,45.0,mww,10.0,-54.13,159.03,,,
103,M 6.9 -,6.9,01-05-2021 01:27,7,6,green,1,919,us,0,2.619,35.0,mww,43.0,38.23,141.66,,Asia,Japan
281,M 6.9 -,6.9,28-05-2016 05:38,3,3,green,1,733,us,0,5.485,19.0,mww,405.69,-21.97,-178.2,,,


In [104]:
from geopy.geocoders import Nominatim
import time

In [105]:
def impute_locations(df, lat_col='latitude', lon_col='longitude', location_col='location'):
    """
    Impute missing location values by reverse-geocoding each row's coordinates.

    Only rows whose location is missing are geocoded; every other row is
    returned unchanged. The input frame is never modified.

    Parameters:
    df (pandas.DataFrame): DataFrame containing latitude, longitude, and location columns
    lat_col (str): Name of the latitude column
    lon_col (str): Name of the longitude column
    location_col (str): Name of the location column to impute

    Returns:
    pandas.DataFrame: Copy of ``df`` where missing locations are replaced by
        "<state>, <country>" (English names requested). Rows for which the
        geocoder returns no result, no state/country, or raises an error
        keep a missing value.
    """
    df_copy = df.copy()

    mask = df_copy[location_col].isna()
    if not mask.any():
        # Nothing to impute, so no geocoder is needed at all.
        return df_copy

    geolocator = Nominatim(user_agent="my_agent")

    def get_location(row):
        # Built outside the try block so the error message below can always use it.
        coords = f"{row[lat_col]}, {row[lon_col]}"
        try:
            # Request data in English
            location = geolocator.reverse(coords, language='en')
            address = location.raw.get('address') if location else None
            if not address:
                return None

            # Get English names for state/province and country
            state = (
                address.get('state_en') or
                address.get('state') or
                address.get('province_en') or
                address.get('province') or
                ''
            )
            country = address.get('country_en') or address.get('country') or ''
            label = f"{state}, {country}".strip(', ')
            # An address without state or country yields '', which would pass
            # for a real location downstream; report it as missing instead.
            return label or None
        except Exception as e:
            print(f"Error getting location for coordinates {coords}: {e}")
            return None
        finally:
            # Pause after every request, successful or not, to throttle calls.
            time.sleep(1)

    print(f"Imputing {mask.sum()} missing locations...")
    # Geocode only the rows that need it instead of walking the whole frame.
    df_copy.loc[mask, location_col] = df_copy.loc[mask].apply(get_location, axis=1)

    return df_copy


In [106]:
# Fill missing locations via impute_locations, then recount nulls per column
# to see how many locations could not be resolved.
df = impute_locations(df)
print("\nResults:")
df.isna().sum()

Imputing 6 missing locations...

Results:


title          0
magnitude      0
date_time      0
cdi            0
mmi            0
alert        551
tsunami        0
sig            0
net            0
nst            0
dmin           0
gap            0
magType        0
depth          0
latitude       0
longitude      0
location       3
continent    716
country      349
dtype: int64

In [107]:
# Spot-check the row at position 37, one of the rows whose location was missing before imputation.
print(df.iloc[37])

title                M 7.0 - 
magnitude                 7.0
date_time    12-11-2022 07:09
cdi                         3
mmi                         3
alert                   green
tsunami                     1
sig                       755
net                        us
nst                       147
dmin                    3.125
gap                      18.0
magType                   mww
depth                   579.0
latitude               -20.05
longitude             -178.35
location        Eastern, Fiji
continent             Oceania
country                  Fiji
Name: 37, dtype: object


In [108]:
# Treat placeholder strings as missing: the literal 'None' and the empty
# string (a geocoded address with neither state nor country) carry no location.
df['location'] = df['location'].replace(['None', ''], np.nan)
# Drop rows whose location is still unknown. The explicit copy makes the result
# an independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
df = df.dropna(subset=['location']).copy()

#### - Handling Country Nulls

In [109]:
# Use the text after the first comma of the location ("Sola, Vanuatu" -> "Vanuatu")
# as a fallback country. The piece is stripped because splitting on ',' leaves a
# leading space, which would make filled values differ from existing ones.
# Blank pieces stay missing; locations without a comma also stay missing.
country_from_location = (
    df['location']
    .str.split(',')
    .str[1]
    .str.strip()
    .replace('', np.nan)
)
df['country'] = df['country'].fillna(country_from_location)

In [110]:
# Recount nulls per column after filling countries from the location text.
df.isna().sum()

title          0
magnitude      0
date_time      0
cdi            0
mmi            0
alert        551
tsunami        0
sig            0
net            0
nst            0
dmin           0
gap            0
magType        0
depth          0
latitude       0
longitude      0
location       0
continent    713
country       51
dtype: int64

In [111]:
# Collect the rows that still have no country after the text-based fill.
missing_country = df['country'].isna()
df_country_na = df.loc[missing_country]
df_country_na

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,,1,820,at,79,0.879451,172.8,Mi,21.0,54.49,-160.8,Alaska Peninsula,,
6,M 6.9 - Tonga,6.9,02-07-2023 10:27,4,4,green,1,741,us,136,1.179,23.0,mww,229.0,-17.85,-174.94,Tonga,,
7,M 7.2 - south of the Fiji Islands,7.2,15-06-2023 18:06,8,6,green,1,804,us,85,2.59,24.0,mww,167.404,-22.98,-177.21,the Fiji Islands,,
9,M 7.1 - southeast of the Loyalty Islands,7.1,20-05-2023 01:51,3,4,green,1,777,us,98,2.812,56.0,mww,35.981,-23.06,170.46,the Loyalty Islands,,
10,M 7.7 - southeast of the Loyalty Islands,7.7,19-05-2023 02:57,5,4,green,1,927,us,277,3.111,15.0,mww,18.027,-23.23,170.69,the Loyalty Islands,,
12,M 6.6 - south of the Fiji Islands,6.6,28-04-2023 03:13,0,2,green,0,670,us,52,5.123,50.0,mww,598.467,-25.27,178.42,the Fiji Islands,,
15,M 6.7 - south of the Fiji Islands,6.7,18-04-2023 04:31,0,3,green,1,691,us,166,4.719,10.0,mww,595.854,-22.28,179.39,the Fiji Islands,,
22,M 7.0 - Kermadec Islands region,7.0,16-03-2023 00:56,8,5,green,1,756,us,116,1.789,22.0,mww,22.065,-30.11,-176.11,Kermadec Islands region,,
40,M 7.0 - south of the Fiji Islands,7.0,09-11-2022 09:51,4,3,green,1,755,us,142,4.578,26.0,mwb,660.0,-26.04,178.38,the Fiji Islands,,
41,M 6.8 - south of the Fiji Islands,6.8,09-11-2022 09:38,1,3,green,1,711,us,136,4.678,22.0,mww,630.379,-25.97,178.36,the Fiji Islands,,


In [112]:
from geopy.exc import GeocoderTimedOut

In [113]:
def get_country_from_location(df, location_col='location', country_col='country'):
    """
    Fill missing country values by geocoding the location name.

    Only rows whose country is missing are processed. Each distinct location
    string is geocoded once and the result is reused for every row that
    shares it. The input frame is never modified.

    Parameters:
    df (pandas.DataFrame): DataFrame containing location column
    location_col (str): Name of the location column
    country_col (str): Name of the country column to fill

    Returns:
    pandas.DataFrame: Copy of ``df`` with country values filled where the
        geocoder returned a country; rows that could not be resolved keep a
        missing country.
    """
    df_copy = df.copy()

    # Only process rows where country is missing
    mask = df_copy[country_col].isna()
    if not mask.any():
        # Nothing to fill, so no geocoder is needed at all.
        return df_copy

    geolocator = Nominatim(user_agent="my_geocoder")

    def extract_country(location):
        try:
            # Search for the location
            geocode_result = geolocator.geocode(
                location,
                language='en',  # Get results in English
                addressdetails=True,  # Get detailed address information
                exactly_one=True  # Return only the best match
            )

            if geocode_result and geocode_result.raw.get('address'):
                # Get country from address details
                return geocode_result.raw['address'].get('country')

        except GeocoderTimedOut:
            print(f"Timeout for location: {location}")
        except Exception as e:
            print(f"Error processing location '{location}': {e}")
        finally:
            # Respect API rate limits after every request, including
            # successful ones (which return before reaching the end).
            time.sleep(1)

        return None

    print(f"Processing {mask.sum()} missing country values...")

    missing_locations = df_copy.loc[mask, location_col]
    # Geocode each distinct location once; rows with a missing location are
    # skipped and stay without a country.
    country_by_location = {
        location: extract_country(location)
        for location in missing_locations.dropna().unique()
    }
    df_copy.loc[mask, country_col] = missing_locations.map(country_by_location)

    # Print results
    still_missing = df_copy[country_col].isna()
    filled = mask.sum() - still_missing.sum()
    print(f"\nFilled {filled} out of {mask.sum()} missing country values")

    if still_missing.sum() > 0:
        print("\nLocations that couldn't be mapped:")
        print(df_copy[still_missing][location_col].unique())

    return df_copy

    


In [114]:
# Geocode the remaining missing countries from their location names.
# The trailing `df` displays the resulting frame.
df = get_country_from_location(df)
df

Processing 51 missing country values...

Filled 28 out of 51 missing country values

Locations that couldn't be mapped:
['Kermadec Islands region' 'the Kermadec Islands'
 'South Sandwich Islands region' 'central Mid-Atlantic Ridge'
 'Prince Edward Islands region' 'Bouvet Island region'
 'northern Mid-Atlantic Ridge' 'off the west coast of northern Sumatra'
 'the Kuril Islands']


Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,green,0,657,us,114,7.177000,25.0,mww,192.955,-13.88,167.16,"Sola, Vanuatu",,Vanuatu
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,yellow,0,775,us,92,0.679000,40.0,mww,69.727,12.81,-88.13,"Intipucá, El Salvador",,El Salvador
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,17-07-2023 03:05,7,5,green,0,899,us,70,1.634000,28.0,mww,171.371,-38.19,-70.37,"Loncopué, Argentina",South America,Argentina
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,16-07-2023 06:48,6,6,green,1,860,us,173,0.907000,36.0,mww,32.571,54.38,-160.70,"Sand Point, Alaska",,Alaska
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,,1,820,at,79,0.879451,172.8,Mi,21.000,54.49,-160.80,Alaska Peninsula,,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,"M 7.1 - 85 km S of Tungor, Russia",7.1,27-05-1995 13:03,0,9,,0,776,us,0,0.000000,0.0,mwb,11.000,52.63,142.83,"Tungor, Russia",Asia,Russia
996,"M 7.7 - 249 km E of Vao, New Caledonia",7.7,16-05-1995 20:12,0,4,,0,912,us,0,0.000000,0.0,mw,20.200,-23.01,169.90,"Vao, New Caledonia",,New Caledonia
997,"M 6.9 - 27 km NNW of Maubara, Timor Leste",6.9,14-05-1995 11:33,0,6,,0,732,us,0,0.000000,0.0,mw,11.200,-8.38,125.13,"Maubara, Timor Leste",,Indonesia
998,"M 6.6 - 10 km W of Aianí, Greece",6.6,13-05-1995 08:47,0,9,,0,670,us,0,0.000000,0.0,mw,14.000,40.15,21.70,"Aianí, Greece",Europe,Greece


In [118]:
# Inspect the row at position 704. Its index label is 707 because rows with
# unresolved locations were dropped earlier without resetting the index.
df.iloc[704]

title        M 7.0 - 51 km SW of Nemuro, Japan
magnitude                                  7.0
date_time                     28-11-2004 18:32
cdi                                          4
mmi                                          7
alert                                      NaN
tsunami                                      0
sig                                        771
net                                         us
nst                                        929
dmin                                       0.0
gap                                       23.9
magType                                    mwb
depth                                     39.0
latitude                                 43.01
longitude                               145.12
location                         Nemuro, Japan
continent                                 Asia
country                                  Japan
Name: 707, dtype: object

In [119]:
# Save the processed frame; index=False keeps the row index out of the file.
df.to_csv('0_earthquake_1995-2023.csv', index=False)