In [100]:
import requests
import pandas as pd

BASE_URL = "https://ghoapi.azureedge.net/api/"
INDICATOR = "MH_12"  # Suicide mortality rate

rows = []

# WHO API uses pagination via $skip
skip = 0
page_size = 1000

while True:
    params = {
        "$top": page_size,
        "$skip": skip
    }

    response = requests.get(f"{BASE_URL}{INDICATOR}", params=params)
    data = response.json()
    records = data.get("value", [])

    if not records:
        break

    for item in records:
        if item.get("NumericValue") is not None:
            rows.append({
                "country_code": item.get("SpatialDim"),
                "year": int(item.get("TimeDim")),
                "mental_health_value": item.get("NumericValue"),
                "indicator_code": item.get("IndicatorCode")
            })

    skip += page_size
# Save as JSON
who_df.to_json("Raw data//who_mental_health.json", orient="records", indent=4)

who_df = pd.DataFrame(rows)
who_df.head()


Unnamed: 0,country_code,year,mental_health_value,indicator_code
0,LTU,2012,52.254544,MH_12
1,JOR,2010,1.670238,MH_12
2,SSD,2020,5.916572,MH_12
3,SYR,2001,1.809799,MH_12
4,ZWE,2021,25.423327,MH_12


In [101]:
who_df


Unnamed: 0,country_code,year,mental_health_value,indicator_code
0,LTU,2012,52.254544,MH_12
1,JOR,2010,1.670238,MH_12
2,SSD,2020,5.916572,MH_12
3,SYR,2001,1.809799,MH_12
4,ZWE,2021,25.423327,MH_12
...,...,...,...,...
12931,CRI,2004,2.259396,MH_12
12932,BRN,2015,1.193121,MH_12
12933,KEN,2014,9.083175,MH_12
12934,COD,2013,23.642500,MH_12


In [102]:
print(who_df.isna().sum())


country_code           0
year                   0
mental_health_value    0
indicator_code         0
dtype: int64


In [103]:
# Save as CSV
who_df.to_csv("Raw Data//who_mental_health.csv", index=False)


In [104]:
import requests
import pandas as pd

# Define the indicators we want
indicators = {
    "GDP_per_capita": "NY.GDP.PCAP.CD",
    "Unemployment": "SL.UEM.TOTL.ZS",
    "School_enrollment": "SE.SEC.ENRR"
}

rows = []

# Fetch data for each indicator
for name, code in indicators.items():
    url = f"http://api.worldbank.org/v2/country/all/indicator/{code}?format=json&per_page=20000"
    response = requests.get(url)
    data = response.json()
    
    # Actual data is in the second element
    records = data[1]
    for item in records:
        if item["value"] is not None:
            rows.append({
                "country_code": item["country"]["id"],
                "year": int(item["date"]),
                "indicator": name,
                "value": item["value"]
            })

# Create DataFrame
wb_df = pd.DataFrame(rows)

# Pivot indicators to columns
wb_df_pivot = wb_df.pivot_table(index=["country_code", "year"],
                                columns="indicator",
                                values="value").reset_index()

# Optional: Remove multi-index if exists
wb_df_pivot.columns.name = None

# Filter years to match WHO data (e.g., 2000–2023)
wb_df_pivot = wb_df_pivot[(wb_df_pivot['year'] >= 2000) & (wb_df_pivot['year'] <= 2023)]

# Drop rows where all indicator values are NaN
wb_df_clean = wb_df_pivot.dropna(subset=["GDP_per_capita", "Unemployment", "School_enrollment"], how='all')

# Save cleaned dataset
wb_df_clean.to_csv("Raw Data//world_bank_data_clean.csv", index=False)
wb_df_clean.to_json("Raw Data//world_bank_data_clean.json", orient="records", indent=4)

# Inspect
print(wb_df_clean.head())
print(wb_df_clean.isna().sum())  # Check missing values


   country_code  year  GDP_per_capita  School_enrollment  Unemployment
39           1A  2000     2664.125201          59.216282     12.622723
40           1A  2001     2536.692326          60.490860     12.498202
41           1A  2002     2483.111875          61.226109     12.506228
42           1A  2003     2696.672125          62.769958     12.449697
43           1A  2004     3133.701879          64.075691     11.427116
country_code            0
year                    0
GDP_per_capita        108
School_enrollment    1707
Unemployment          665
dtype: int64


In [105]:
wb_df_pivot

Unnamed: 0,country_code,year,GDP_per_capita,School_enrollment,Unemployment
39,1A,2000,2664.125201,59.216282,12.622723
40,1A,2001,2536.692326,60.490860,12.498202
41,1A,2002,2483.111875,61.226109,12.506228
42,1A,2003,2696.672125,62.769958,12.449697
43,1A,2004,3133.701879,64.075691,11.427116
...,...,...,...,...,...
14960,ZW,2019,2184.329239,,7.373000
14961,ZW,2020,2059.674454,,8.617000
14962,ZW,2021,2613.605421,,9.540000
14963,ZW,2022,2536.400502,,10.087000


In [106]:
print(wb_df_pivot.isna().sum())


country_code            0
year                    0
GDP_per_capita        108
School_enrollment    1707
Unemployment          665
dtype: int64


In [107]:
wb_df_filled = wb_df_clean.copy()

wb_df_filled["GDP_per_capita"] = wb_df_filled["GDP_per_capita"].fillna(
    wb_df_filled["GDP_per_capita"].mean()
)

wb_df_filled["School_enrollment"] = wb_df_filled["School_enrollment"].fillna(
    wb_df_filled["School_enrollment"].mean()
)

wb_df_filled["Unemployment"] = wb_df_filled["Unemployment"].fillna(
    wb_df_filled["Unemployment"].mean()
)

# Verify
print(wb_df_filled.isna().sum())
wb_df_filled.head()


country_code         0
year                 0
GDP_per_capita       0
School_enrollment    0
Unemployment         0
dtype: int64


Unnamed: 0,country_code,year,GDP_per_capita,School_enrollment,Unemployment
39,1A,2000,2664.125201,59.216282,12.622723
40,1A,2001,2536.692326,60.49086,12.498202
41,1A,2002,2483.111875,61.226109,12.506228
42,1A,2003,2696.672125,62.769958,12.449697
43,1A,2004,3133.701879,64.075691,11.427116


In [108]:
print(wb_df_filled.isna().sum())



country_code         0
year                 0
GDP_per_capita       0
School_enrollment    0
Unemployment         0
dtype: int64


In [109]:
wb_df_filled.to_json("Raw data//world_bank_data_filled.json", orient="records", indent=4)
wb_df_filled.to_csv("Raw data//world_bank_data_filled.csv", index=False)


In [93]:
print(who_df.dtypes)
print(wb_df_filled.dtypes)


country_code            object
year                     int64
mental_health_value    float64
indicator_code          object
dtype: object
country_code          object
year                   int64
GDP_per_capita       float64
School_enrollment    float64
Unemployment         float64
dtype: object


In [110]:
print("WHO codes:", who_df["country_code"].unique()[:10])
print("WB codes:", wb_df_filled["country_code"].unique()[:10])


WHO codes: ['LTU' 'JOR' 'SSD' 'SYR' 'ZWE' 'TUN' 'BRB' 'TKM' 'CHL' 'TCD']
WB codes: ['1A' '1W' '4E' '7E' '8S' 'AD' 'AE' 'AF' 'AG' 'AL']


In [111]:
final_df['country_code']  # from WHO
wb_df_filled['country_code']  # from World Bank


39       1A
40       1A
41       1A
42       1A
43       1A
         ..
14960    ZW
14961    ZW
14962    ZW
14963    ZW
14964    ZW
Name: country_code, Length: 6301, dtype: object

In [112]:
who_df = who_df[who_df["country_code"].str.isalpha() & (who_df["country_code"].str.len() == 3)]
wb_df_filled = wb_df_filled[wb_df_filled["country_code"].str.isalpha() & (wb_df_filled["country_code"].str.len() == 3)]


In [113]:
final_df = pd.merge(
    who_df, 
    wb_df_filled, 
    on=["country_code", "year"],
    how="inner"
)

print(final_df.shape)
print(final_df.head())


(0, 7)
Empty DataFrame
Columns: [country_code, year, mental_health_value, indicator_code, GDP_per_capita, School_enrollment, Unemployment]
Index: []


In [114]:
final_df_outer = pd.merge(
    who_df, 
    wb_df_filled, 
    on=["country_code", "year"], 
    how="outer"
)

# Fill missing socioeconomic data with column mean
for col in ["GDP_per_capita", "School_enrollment", "Unemployment"]:
    final_df_outer[col] = final_df_outer[col].fillna(final_df_outer[col].mean())

print(final_df_outer.shape)


(12540, 7)


In [115]:
final_df_outer

Unnamed: 0,country_code,year,mental_health_value,indicator_code,GDP_per_capita,School_enrollment,Unemployment
0,AFG,2000,4.934659,MH_12,,,
1,AFG,2000,9.609622,MH_12,,,
2,AFG,2000,7.185717,MH_12,,,
3,AFG,2001,9.712121,MH_12,,,
4,AFG,2001,4.996167,MH_12,,,
...,...,...,...,...,...,...,...
12535,ZWE,2020,14.211690,MH_12,,,
12536,ZWE,2020,36.060025,MH_12,,,
12537,ZWE,2021,25.423327,MH_12,,,
12538,ZWE,2021,15.788454,MH_12,,,


In [116]:
wb_df_filled = wb_df_filled[
    wb_df_filled["country_code"].str.isalpha() & (wb_df_filled["country_code"].str.len() == 3)
]


In [117]:
for col in ["GDP_per_capita", "School_enrollment", "Unemployment"]:
    wb_df_filled[col] = wb_df_filled[col].fillna(wb_df_filled[col].mean())


In [118]:
final_df = pd.merge(
    who_df,
    wb_df_filled,
    on=["country_code", "year"],
    how="inner"  # keeps only matching countries & years
)

print(final_df.shape)
print(final_df.head())


(0, 7)
Empty DataFrame
Columns: [country_code, year, mental_health_value, indicator_code, GDP_per_capita, School_enrollment, Unemployment]
Index: []


In [119]:
final_df = pd.merge(
    who_df,
    wb_df_filled,
    on=["country_code", "year"],
    how="outer"
)

# Fill missing values with column mean
for col in ["GDP_per_capita", "School_enrollment", "Unemployment"]:
    final_df[col] = final_df[col].fillna(final_df[col].mean())


In [120]:
final_df

Unnamed: 0,country_code,year,mental_health_value,indicator_code,GDP_per_capita,School_enrollment,Unemployment
0,AFG,2000,4.934659,MH_12,,,
1,AFG,2000,9.609622,MH_12,,,
2,AFG,2000,7.185717,MH_12,,,
3,AFG,2001,9.712121,MH_12,,,
4,AFG,2001,4.996167,MH_12,,,
...,...,...,...,...,...,...,...
12535,ZWE,2020,14.211690,MH_12,,,
12536,ZWE,2020,36.060025,MH_12,,,
12537,ZWE,2021,25.423327,MH_12,,,
12538,ZWE,2021,15.788454,MH_12,,,
