In [26]:
import pandas as pd
import numpy as np

In [27]:
# Load the earthquake CSV into a DataFrame. The path is relative, so the file
# must be in the notebook's working directory.
df = pd.read_csv("earthquake_1995-2023.csv")

In [28]:
# Column dtypes and non-null counts; columns with fewer non-null values than
# rows contain missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1000 non-null   object 
 1   magnitude  1000 non-null   float64
 2   date_time  1000 non-null   object 
 3   cdi        1000 non-null   int64  
 4   mmi        1000 non-null   int64  
 5   alert      449 non-null    object 
 6   tsunami    1000 non-null   int64  
 7   sig        1000 non-null   int64  
 8   net        1000 non-null   object 
 9   nst        1000 non-null   int64  
 10  dmin       1000 non-null   float64
 11  gap        1000 non-null   float64
 12  magType    1000 non-null   object 
 13  depth      1000 non-null   float64
 14  latitude   1000 non-null   float64
 15  longitude  1000 non-null   float64
 16  location   994 non-null    object 
 17  continent  284 non-null    object 
 18  country    651 non-null    object 
dtypes: float64(6), int64(5), object(8)
memory usage: 

In [29]:
# Preview the first rows to see what the raw values look like.
df.head()

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,green,0,657,us,114,7.177,25.0,mww,192.955,-13.8814,167.158,"Sola, Vanuatu",,Vanuatu
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,yellow,0,775,us,92,0.679,40.0,mww,69.727,12.814,-88.1265,"Intipucá, El Salvador",,
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,17-07-2023 03:05,7,5,green,0,899,us,70,1.634,28.0,mww,171.371,-38.1911,-70.3731,"Loncopué, Argentina",South America,Argentina
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,16-07-2023 06:48,6,6,green,1,860,us,173,0.907,36.0,mww,32.571,54.3844,-160.699,"Sand Point, Alaska",,
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,,1,820,at,79,0.879451,172.8,Mi,21.0,54.49,-160.796,Alaska Peninsula,,


##  Preprocessing

### Checking for missing values


In [30]:
# Rows whose `location` is missing. Their latitude/longitude are populated,
# so the location can be recovered from the coordinates.
df_loc_na = df[df['location'].isna()]
df_loc_na

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
30,M 6.8 -,6.8,20-01-2023 22:09,4,3,green,0,718,us,60,5.129,61.0,mww,610.671,-26.7209,-63.0394,,South America,Argentina
37,M 7.0 -,7.0,12-11-2022 07:09,3,3,green,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,,Oceania,Fiji
39,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,,,
54,M 6.9 -,6.9,19-05-2022 10:13,2,5,green,1,733,us,127,0.371,45.0,mww,10.0,-54.1325,159.027,,,
103,M 6.9 -,6.9,01-05-2021 01:27,7,6,green,1,919,us,0,2.619,35.0,mww,43.0,38.2296,141.665,,Asia,Japan
281,M 6.9 -,6.9,28-05-2016 05:38,3,3,green,1,733,us,0,5.485,19.0,mww,405.69,-21.9724,-178.204,,,


In [31]:
from geopy.geocoders import Nominatim
import time

In [32]:
def impute_locations(df, lat_col='latitude', lon_col='longitude', location_col='location',
                     geolocator=None, delay=1):
    """
    Fill missing location values by reverse geocoding each row's coordinates.

    Only rows whose ``location_col`` is missing are looked up; all other rows
    are returned untouched. The imputed text is "<state/province>, <country>"
    (or just one of the two when the other is unavailable), requested in
    English. The input DataFrame is never modified.

    Parameters:
    df (pandas.DataFrame): DataFrame containing latitude, longitude, and location columns
    lat_col (str): Name of the latitude column
    lon_col (str): Name of the longitude column
    location_col (str): Name of the location column to impute
    geolocator: Optional object exposing ``reverse(query, language=...)``.
        Defaults to a ``Nominatim`` client, created only when at least one
        location is actually missing.
    delay (float): Seconds to pause after every lookup to rate-limit requests
        to the geocoding service. Use 0 to disable.

    Returns:
    pandas.DataFrame: Copy of ``df`` with missing locations imputed where the
        lookup succeeded. Rows whose lookup fails or yields no usable address
        keep a missing value.
    """
    df_copy = df.copy()

    mask = df_copy[location_col].isna()
    if not mask.any():
        # Nothing to impute: skip creating a geocoding client altogether.
        return df_copy

    if geolocator is None:
        # NOTE(review): geopy asks for an application-specific user agent;
        # "my_agent" is kept for backward compatibility.
        geolocator = Nominatim(user_agent="my_agent")

    def lookup(lat, lon):
        """Return an English "state, country" label for one coordinate pair, or None."""
        # Built outside the try block so the error message below can always use it.
        coords = f"{lat}, {lon}"
        try:
            # The response language is selected per request, not on the client.
            location = geolocator.reverse(coords, language='en')
            address = location.raw.get('address') if location else None
            if not address:
                return None

            state = (
                address.get('state_en') or
                address.get('state') or
                address.get('province_en') or
                address.get('province') or
                ''
            )
            country = address.get('country_en') or address.get('country') or ''

            # Join only the parts that exist; an empty result stays missing
            # (None) instead of becoming an empty string that isna() would miss.
            label = ", ".join(part for part in (state, country) if part)
            return label or None
        except Exception as e:
            # Deliberately best-effort: one failed lookup must not abort the rest.
            print(f"Error getting location for coordinates {coords}: {e}")
            return None
        finally:
            if delay:
                time.sleep(delay)

    print(f"Imputing {mask.sum()} missing locations...")
    missing = df_copy.loc[mask, [lat_col, lon_col]]
    imputed = [lookup(lat, lon) for lat, lon in zip(missing[lat_col], missing[lon_col])]
    # Positional assignment (a list, not a Series) avoids index alignment issues.
    df_copy.loc[mask, location_col] = imputed

    return df_copy


In [33]:

# Fill missing `location` values by reverse geocoding; this rebinds `df` to the
# returned copy. Each missing row triggers one network lookup followed by a
# one-second pause.
df = impute_locations(df)
print("\nResults:")
# Remaining nulls per column: lookups that failed or returned no address stay missing.
df.isna().sum()

Imputing 6 missing locations...

Results:


title          0
magnitude      0
date_time      0
cdi            0
mmi            0
alert        551
tsunami        0
sig            0
net            0
nst            0
dmin           0
gap            0
magType        0
depth          0
latitude       0
longitude      0
location       3
continent    716
country      349
dtype: int64

In [36]:
# Spot-check row 37, one of the rows whose `location` was missing before imputation.
print(df.iloc[37])

title                M 7.0 - 
magnitude                 7.0
date_time    12-11-2022 07:09
cdi                         3
mmi                         3
alert                   green
tsunami                     1
sig                       755
net                        us
nst                       147
dmin                    3.125
gap                      18.0
magType                   mww
depth                   579.0
latitude             -20.0508
longitude            -178.346
location        Eastern, Fiji
continent             Oceania
country                  Fiji
Name: 37, dtype: object


In [35]:
# Re-check after imputation: rows whose `location` is still missing.
# Note this rebinds `df_loc_na`, replacing the pre-imputation selection.
df_loc_na = df[df['location'].isna()]
df_loc_na

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
39,M 6.6 -,6.6,09-11-2022 10:14,0,2,green,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,,,
54,M 6.9 -,6.9,19-05-2022 10:13,2,5,green,1,733,us,127,0.371,45.0,mww,10.0,-54.1325,159.027,,,
281,M 6.9 -,6.9,28-05-2016 05:38,3,3,green,1,733,us,0,5.485,19.0,mww,405.69,-21.9724,-178.204,,,
