In [28]:
import pandas as pd
import numpy as np

# Read the earthquake records (1995-2023 file) from CSV into a DataFrame.
df = pd.read_csv("0_earthquake_1995-2023.csv")

In [29]:
# List the distinct country names present in the dataset.
df['country'].unique()

array(['Vanuatu', 'El Salvador', 'Argentina', 'United States',
       'Antigua and Barbuda', 'Tonga', 'Fiji', 'Colombia', 'France',
       'Indonesia', 'New Zealand', 'Russian Federation',
       'Papua New Guinea', 'Afghanistan', 'Ecuador', 'Tajikistan',
       'Turkey', 'Solomon Islands', 'Panama', 'Mexico', 'Taiwan', 'China',
       'Philippines', 'Brazil', 'Peru', 'Nicaragua', 'Japan', 'Cyprus',
       'Antarctica', 'Haiti', 'Mongolia', 'Chile', 'Greece', 'Jamaica',
       'United Kingdom', 'Australia', 'Norway', 'Canada', 'Venezuela',
       'Bolivia', 'Honduras', 'Costa Rica', 'Iran', 'Guatemala',
       'Botswana', 'Italy', 'Myanmar', 'India', 'Barbados', 'Nepal',
       'Guam', 'Pakistan', 'Samoa', 'Kyrgyzstan', 'Mozambique',
       'Tanzania', 'Algeria', 'Turkmenistan', 'Azerbaijan', 'Iceland',
       'Trinidad and Tobago', 'Saudi Arabia'], dtype=object)

## Preprocessing

### Handling Nulls

#### Handling Continent Nulls

In [None]:
# Filling in null values of continent column
import pycountry
import pycountry_convert

In [30]:
def map_to_continent(df, country_col='country', location_col='location', continent_col='continent'):
    """
    Fill missing continent values from the country name, falling back to
    keywords found in the location text.

    Only rows whose continent is null are touched. For each such row the
    country is resolved first (via pycountry / pycountry_convert); if that
    yields nothing, the location string is scanned for known region keywords
    (case-insensitive substring match, first match wins). Rows that still
    cannot be mapped are left null and listed on stdout.

    Parameters:
    df (pandas.DataFrame): DataFrame containing country, location and continent columns
    country_col (str): Name of the country column
    location_col (str): Name of the location column
    continent_col (str): Name of the continent column to fill

    Returns:
    pandas.DataFrame: Copy of ``df`` with continent values filled where
        possible; the input DataFrame is not modified.
    """
    df_copy = df.copy()

    # Country name -> continent (or None when the lookup failed). The fuzzy
    # search runs once per distinct country instead of once per row, and a
    # failing name is reported a single time.
    country_cache = {}

    def get_continent_from_country(country_name):
        """Resolve a country name to a continent name, or None on failure."""
        if country_name in country_cache:
            return country_cache[country_name]

        continent = None
        try:
            if country_name.lower() == "antarctica":
                # Handled explicitly instead of going through the lookup.
                continent = "Antarctica"
            else:
                # Fuzzy search tolerates spelling variants; take the best match.
                country = pycountry.countries.search_fuzzy(country_name)[0]
                continent_code = pycountry_convert.country_alpha2_to_continent_code(country.alpha_2)
                continent = pycountry_convert.convert_continent_code_to_continent_name(continent_code)
        except Exception as e:
            # Best-effort: report the failure and let the location fallback try.
            print(f"Error mapping country '{country_name}': {e}")

        country_cache[country_name] = continent
        return continent

    # Location keyword -> region. Order matters: the first keyword found in
    # the location text decides the result.
    location_patterns = {
        'Atlantic': 'Atlantic Ocean',
        'Mid-Atlantic': 'Atlantic Ocean',
        'Indian Ocean': 'Indian Ocean',
        'Pacific': 'Pacific Ocean',
        'Caribbean': 'North America',
        'Antarctic': 'Antarctica',
        'Kermadec': 'Oceania',
        'Fiji': 'Oceania',
        'Tonga': 'Oceania',
        'Vanuatu': 'Oceania',
        'Kuril': 'Asia',
        'Alaska': 'North America',
        'Philippines': 'Asia',
        'Sumatra': 'Asia',
        'Loyalty Islands': 'Oceania',
        'Macquarie': 'Oceania',
        # 'Oceania' (not 'Australia') so the label agrees with what the
        # country-based lookup produces for Australia.
        'Australia': 'Oceania'
    }
    # Lower-case the keywords once rather than for every row.
    lowered_patterns = [(pattern.lower(), continent)
                        for pattern, continent in location_patterns.items()]

    def get_continent(row):
        """Return the continent for one row with a null continent, or None."""
        country = row[country_col]
        if pd.notna(country):
            # str() guards against non-string country values.
            continent = get_continent_from_country(str(country).strip())
            if continent:
                return continent

        # Country missing or unresolved: fall back to location keywords.
        location = row[location_col]
        if pd.notna(location):
            location = str(location).lower()
            for pattern, continent in lowered_patterns:
                if pattern in location:
                    return continent

        return None

    # Only rows with a missing continent are processed.
    mask = df_copy[continent_col].isna()
    if mask.any():
        df_copy.loc[mask, continent_col] = df_copy[mask].apply(get_continent, axis=1)

    # Report anything that could not be mapped.
    unmapped = df_copy[df_copy[continent_col].isna()]
    if not unmapped.empty:
        print("\nUnmapped locations:")
        print(unmapped[[country_col, location_col]].drop_duplicates())

    return df_copy



In [31]:
# Fill missing continent values; map_to_continent returns a new DataFrame,
# which replaces `df` here.
df = map_to_continent(df)

In [32]:
# Count missing values per column after the continent fill.
df.isna().sum()

Unnamed: 0      0
title           0
magnitude       0
date_time       0
cdi             0
mmi             0
alert         548
tsunami         0
sig             0
net             0
nst             0
dmin            0
gap             0
magType         0
depth           0
latitude        0
longitude       0
location        0
continent       0
country         0
dtype: int64

In [33]:
# Preview the first three rows to spot-check the continent column.
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,green,0,657,us,114,7.177,25.0,mww,192.955,-13.88,167.16,"Sola, Vanuatu",Oceania,Vanuatu
1,1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,yellow,0,775,us,92,0.679,40.0,mww,69.727,12.81,-88.13,"Intipucá, El Salvador",North America,El Salvador
2,2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,17-07-2023 03:05,7,5,green,0,899,us,70,1.634,28.0,mww,171.371,-38.19,-70.37,"Loncopué, Argentina",South America,Argentina


In [34]:
# List the distinct continent labels now present.
df["continent"].unique()

array(['Oceania', 'North America', 'South America', 'Europe', 'Asia',
       'Antarctica', 'Africa'], dtype=object)

In [37]:
# Re-display the distinct country names.
df["country"].unique()

array(['Vanuatu', 'El Salvador', 'Argentina', 'United States',
       'Antigua and Barbuda', 'Tonga', 'Fiji', 'Colombia', 'France',
       'Indonesia', 'New Zealand', 'Russian Federation',
       'Papua New Guinea', 'Afghanistan', 'Ecuador', 'Tajikistan',
       'Turkey', 'Solomon Islands', 'Panama', 'Mexico', 'Taiwan', 'China',
       'Philippines', 'Brazil', 'Peru', 'Nicaragua', 'Japan', 'Cyprus',
       'Antarctica', 'Haiti', 'Mongolia', 'Chile', 'Greece', 'Jamaica',
       'United Kingdom', 'Australia', 'Norway', 'Canada', 'Venezuela',
       'Bolivia', 'Honduras', 'Costa Rica', 'Iran', 'Guatemala',
       'Botswana', 'Italy', 'Myanmar', 'India', 'Barbados', 'Nepal',
       'Guam', 'Pakistan', 'Samoa', 'Kyrgyzstan', 'Mozambique',
       'Tanzania', 'Algeria', 'Turkmenistan', 'Azerbaijan', 'Iceland',
       'Trinidad and Tobago', 'Saudi Arabia'], dtype=object)

In [38]:
# Force the continent to Europe for rows labelled with the long-form UK name.
# NOTE(review): the country values displayed above use 'United Kingdom', not
# this long-form string, so this mask appears to select no rows — confirm
# whether the short name was intended.
df.loc[df["country"]== "United Kingdom of Great Britain and Northern Ireland (the)" ,"continent"] = "Europe"

In [39]:
# Re-check missing values per column before saving.
df.isna().sum()

Unnamed: 0      0
title           0
magnitude       0
date_time       0
cdi             0
mmi             0
alert         548
tsunami         0
sig             0
net             0
nst             0
dmin            0
gap             0
magType         0
depth           0
latitude        0
longitude       0
location        0
continent       0
country         0
dtype: int64

In [40]:
# Save the continent-filled data. index=False keeps the default integer index
# out of the file; writing it would add one more unnamed column on every
# save/load round trip (the loaded data already carries an "Unnamed: 0" column).
df.to_csv("1_earthquake_1995-2023.csv", index=False)