In [2]:
import pandas as pd

who_df = pd.read_csv("data//processed//who_mental_health_cleaned.csv")
who_df

# Load raw data another dataaset 
wb_df = pd.read_csv("data//processed//world_bank_data_clean.csv")
wb_df


Unnamed: 0,country_name,year,GDP_per_capita,School_enrollment,Unemployment
0,Afghanistan,2000,174.930991,79.720263,7.897
1,Afghanistan,2001,138.706822,14.040410,7.973
2,Afghanistan,2002,178.954088,79.720263,7.867
3,Afghanistan,2003,198.871116,13.959530,7.844
4,Afghanistan,2004,221.763654,19.214380,7.794
...,...,...,...,...,...
6296,Zimbabwe,2019,2184.329239,79.720263,7.373
6297,Zimbabwe,2020,2059.674454,79.720263,8.617
6298,Zimbabwe,2021,2613.605421,79.720263,9.540
6299,Zimbabwe,2022,2536.400502,79.720263,10.087


In [3]:
from thefuzz import process
import pandas as pd

# wb_df and who_df are your DataFrames
# List of WB country names
wb_names_list = wb_df["country_name"].unique()

# Function to match WHO country names to WB names
def match_name_to_wb(who_country):
    match, score = process.extractOne(who_country, wb_names_list)
    if score >= 90:  # high similarity threshold
        return match
    else:
        return None


In [4]:

# Apply fuzzy matching
who_df["country_name_matched"] = who_df["country_name"].apply(match_name_to_wb)

In [5]:

# Merge on matched names + year
final_df = pd.merge(
    who_df,
    wb_df,
    left_on=["country_name_matched", "year"],
    right_on=["country_name", "year"],
    how="inner"
)

print("Merged dataset shape:", final_df.shape)
print(final_df.head())

Merged dataset shape: (5202, 9)
  country_name_x  year  mental_health_value indicator_code  \
0            JOR  2010             1.670238          MH_12   
1            SYR  2001             1.809799          MH_12   
2            TUN  2011             1.596429          MH_12   
3            AGO  2017            20.410024          MH_12   
4            UGA  2000            24.813049          MH_12   

   country_name_matched        country_name_y  GDP_per_capita  \
0                Jordan                Jordan     3718.465716   
1  Syrian Arab Republic  Syrian Arab Republic     1186.829135   
2               Tunisia               Tunisia     4420.647722   
3   Trinidad and Tobago   Trinidad and Tobago    17566.099970   
4              Portugal              Portugal    11526.372067   

   School_enrollment  Unemployment  
0          87.364441        12.500  
1          44.838951        11.630  
2          89.341469        18.334  
3          79.720263         3.389  
4         108.37288

In [6]:
# Missing values
print("Final merged dataset missing values:\n", final_df.isna().sum())

Final merged dataset missing values:
 country_name_x          0
year                    0
mental_health_value     0
indicator_code          0
country_name_matched    0
country_name_y          0
GDP_per_capita          0
School_enrollment       0
Unemployment            0
dtype: int64


In [7]:
# Number of unmatched countries
unmatched_countries = who_df[who_df["country_name_matched"].isna()]
print("Number of WHO countries not matched to WB:", unmatched_countries.shape[0])


Number of WHO countries not matched to WB: 7722


In [8]:

# Duplicates
print("Duplicates in merged dataset:", final_df.duplicated(subset=["country_name_y","year"]).sum())


Duplicates in merged dataset: 3621


In [9]:
final_df.to_csv("data/WHO_WorldBank_merged.csv", index=False)


In [10]:
final_df

Unnamed: 0,country_name_x,year,mental_health_value,indicator_code,country_name_matched,country_name_y,GDP_per_capita,School_enrollment,Unemployment
0,JOR,2010,1.670238,MH_12,Jordan,Jordan,3718.465716,87.364441,12.500
1,SYR,2001,1.809799,MH_12,Syrian Arab Republic,Syrian Arab Republic,1186.829135,44.838951,11.630
2,TUN,2011,1.596429,MH_12,Tunisia,Tunisia,4420.647722,89.341469,18.334
3,AGO,2017,20.410024,MH_12,Trinidad and Tobago,Trinidad and Tobago,17566.099970,79.720263,3.389
4,UGA,2000,24.813049,MH_12,Portugal,Portugal,11526.372067,108.372887,3.806
...,...,...,...,...,...,...,...,...,...
5197,KAZ,2012,44.878845,MH_12,Kazakhstan,Kazakhstan,12018.796653,99.018019,5.290
5198,JOR,2017,0.881077,MH_12,Jordan,Jordan,4065.616287,88.875829,18.120
5199,POL,2010,15.545104,MH_12,French Polynesia,French Polynesia,22494.806706,79.720263,12.133
5200,UKR,2020,32.485972,MH_12,Ukraine,Ukraine,3709.769287,84.428334,9.475
