This notebook cleans three raw datasets (billionaires, emissions, and indicators) and writes each cleaned table to its own CSV file.

In [None]:
import pandas as pd

In [None]:
# Raw billionaires table; the path below is an absolute, environment-specific location.
df = pd.read_csv("/content/billionaires.csv")

# Lookup from country name to three-letter ISO country code, applied to the
# `country` column further down. Names with no entry here are left unmapped.
iso_map = {
    # A - B
    "Algeria": "DZA", "Andorra": "AND", "Argentina": "ARG", "Armenia": "ARM",
    "Australia": "AUS", "Austria": "AUT",
    "Bahamas": "BHS", "Bahrain": "BHR", "Belgium": "BEL", "Bermuda": "BMU",
    "Brazil": "BRA", "British Virgin Islands": "VGB",
    # C - F
    "Cambodia": "KHM", "Canada": "CAN", "Cayman Islands": "CYM", "Chile": "CHL",
    "China": "CHN", "Colombia": "COL", "Cyprus": "CYP", "Czech Republic": "CZE",
    "Denmark": "DNK",
    "Egypt": "EGY", "Eswatini (Swaziland)": "SWZ",
    "Finland": "FIN", "France": "FRA",
    # G - L
    "Georgia": "GEO", "Germany": "DEU", "Greece": "GRC", "Guernsey": "GGY",
    "Hong Kong": "HKG", "Hungary": "HUN",
    "India": "IND", "Indonesia": "IDN", "Ireland": "IRL", "Israel": "ISR",
    "Italy": "ITA",
    "Japan": "JPN",
    "Kazakhstan": "KAZ",
    "Latvia": "LVA", "Lebanon": "LBN", "Liechtenstein": "LIE", "Luxembourg": "LUX",
    # M - R
    "Malaysia": "MYS", "Mexico": "MEX", "Monaco": "MCO", "Morocco": "MAR",
    "Nepal": "NPL", "Netherlands": "NLD", "New Zealand": "NZL", "Nigeria": "NGA",
    "Norway": "NOR",
    "Oman": "OMN",
    "Peru": "PER", "Philippines": "PHL", "Poland": "POL", "Portugal": "PRT",
    "Qatar": "QAT",
    "Romania": "ROU", "Russia": "RUS",
    # S - V
    "Singapore": "SGP", "Slovakia": "SVK", "South Africa": "ZAF",
    "South Korea": "KOR", "Spain": "ESP", "Sweden": "SWE", "Switzerland": "CHE",
    "Taiwan": "TWN", "Tanzania": "TZA", "Thailand": "THA", "Turkey": "TUR",
    "Turks and Caicos Islands": "TCA",
    "Ukraine": "UKR", "United Arab Emirates": "ARE", "United Kingdom": "GBR",
    "United States": "USA", "Uruguay": "URY", "Uzbekistan": "UZB",
    "Vietnam": "VNM",
    # Placeholders for missing country values; each resolves to None.
    None: None,
    float("nan"): None,
    "nan": None,
}

# Build the cleaned billionaires table from the raw frame `df` and the
# `iso_map` lookup defined earlier in this cell, then write it to CSV.

# Accepted spellings of the self-made flag, compared case-insensitively.
# The column is stringified first so that it maps correctly whether
# read_csv parsed it as real booleans or as text; anything else becomes NaN.
self_made_flags = {'true': True, 'false': False, 'yes': True, 'no': False}

df_clean = pd.DataFrame({
    'ranking': df['rank'],
    'name': df['personName'],
    'age': df['age'],
    'sector': df['category'],  # alternative source column: df['industries']
    'gender': df['gender'],
    'title': df['title'],
    'self_made': df['selfMade'].astype(str).str.strip().str.lower().map(self_made_flags),
    'organization': df['organization'],
    'country': df['country'].map(iso_map),
})  # the original literal was never closed, which made the whole cell a SyntaxError

# Rows without a mapped country code or without an age are discarded.
df_clean = df_clean.dropna(subset=['country', 'age'])

# NOTE(review): FRA and NOR are dropped here and in the other cleaning cells;
# the reason is not visible in this notebook -- confirm it is intentional.
df_clean = df_clean[~df_clean['country'].isin(['FRA', 'NOR'])]

# Safe to cast now that missing ages are gone; .assign returns a new frame
# instead of writing a column into a filtered one.
df_clean = df_clean.assign(age=df_clean['age'].astype(int))

df_clean.to_csv('billionaires_clean.csv', index=False)

In [None]:
# Clean the emissions table: keep country / year / value, tag every row with
# its source, drop unusable rows, and write the result to CSV.

# Codes removed from the output.
# NOTE(review): the list mixes country codes with what look like aggregate or
# non-standard codes (e.g. 'WLD', 'XIT'); the selection criteria are not
# documented here -- confirm with the data owner.
exclude_countries = ['FRA', 'GUF', 'GLP', 'KSV', 'MTQ', 'MYT', 'NOR', 'REU', 'XIT', 'WLD', 'BES', 'CXR', 'PCZ']

df = (
    pd.read_csv('emission.csv')
    .rename(columns={
        'ISO 3166-1 alpha-3': 'country',
        'Year': 'year',
        'Total': 'value'
    })
    .loc[:, ['country', 'year', 'value']]
    # .assign builds a new frame, so no column is written into a sliced frame.
    .assign(source='global_emissions_dataset')
    # A row is dropped when either the value or the country code is missing.
    .dropna(subset=['value', 'country'])
)

# Keep strictly positive values for non-excluded codes. The original applied
# the exclusion filter twice; once is enough.
df = df[(df['value'] > 0) & ~df['country'].isin(exclude_countries)]

df.to_csv('emission_clean.csv', index=False)

In [None]:
# Clean the indicator table: drop excluded codes, reshape the wide per-year
# columns into long format, and write the result to CSV.
df = pd.read_csv('indicator.csv')

# Codes removed from the output.
# NOTE(review): several entries look like regional or group aggregates
# (e.g. 'G20', 'G7', 'WLD'); the selection criteria are not documented here.
exclude_countries = ['FRA', 'GUF', 'GLP', 'KSV', 'MTQ', 'MYT',
                     'NOR', 'REU', 'XIT', 'WLD', 'AETMP', 'AFRTMP',
                     'AMETMP', 'ASIATMP', 'CHA', 'EMDETMP',
                     'EMU', 'G20', 'G7', 'ANT', 'OCETMP', 'SCG' ]

df = df[~df['ISO3'].isin(exclude_countries)]

# Year columns are named like 'F1992'. Select them with the same pattern that
# is used to parse the year below, so an unrelated column that merely starts
# with 'F' cannot slip in and break the integer conversion.
year_columns = df.columns[df.columns.str.match(r'F\d{4}')].tolist()

# Melt wide format to long: keep metadata, reshape year columns
df_long = df.melt(
    id_vars=['ISO3', 'Indicator', 'Source'],
    value_vars=year_columns,
    var_name='year',
    value_name='value'
)

# convert 'F1992' to int 1992 (expand=False yields a Series, not a 1-column frame)
df_long['year'] = df_long['year'].str.extract(r'F(\d{4})', expand=False).astype(int)

# NOTE(review): 'Unit' is not among the melt id_vars, so no such column exists
# in df_long and that rename entry has no effect. If the unit is meant to be
# kept, 'Unit' has to be added to id_vars -- confirm the intended output schema.
df_long = df_long.rename(columns={
    'ISO3': 'country',
    'Source': 'source',
    'Unit': 'unit'
})

# Year cells with no recorded value are dropped.
df_long = df_long.dropna(subset=['value'])

df_long.to_csv('indicator_clean.csv', index=False)