In [4]:
import pandas as pd

# Read the merged source CSV into a DataFrame.
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
file_path = '/Users/pranjalts/Desktop/RESEARCH/HALE/HALE/dataset/merge.csv'
data = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')

In [5]:
# Per-column count of null entries in the raw dataset, printed under a header.
missing_values = data.isna().sum()
print("Missing values:", missing_values, sep="\n")

Missing values:
Period                                                                                                                                                        0
ParentLocation                                                                                                                                                0
Location                                                                                                                                                      0
Dim1                                                                                                                                                          0
HALE_Birth                                                                                                                                                    0
HALE_60                                                                                                                                                       0
infant mortality rate (b

In [6]:
# Keep only observations whose Period lies in 2000-2019 (both ends inclusive).
data_filtered = data[data['Period'].between(2000, 2019)]

# Drop sparsely populated columns.
# NOTE: `thresh` is the minimum number of NON-null values a column must have to
# be kept. With 0.6 * n_rows this keeps columns that are at least 60% populated,
# i.e. it drops columns with MORE THAN 40% missing values (not 60%).
# NOTE(review): if the intent really is "drop columns with more than 60%
# missing", the threshold should be 0.4 * len(data_filtered) - confirm.
threshold = 0.6 * len(data_filtered)

# .copy() yields an independent frame: a later cell assigns into this frame via
# .loc, which must not operate on a slice of `data_filtered`.
data_filtered_cleaned = data_filtered.dropna(thresh=threshold, axis=1).copy()

In [7]:
# Per-column null counts once the period filter and sparse-column drop are applied.
missing_values = data_filtered_cleaned.isna().sum()
print("Missing values after filter:", missing_values, sep="\n")

Missing values after filter:
Period                                                                                                                                                       0
ParentLocation                                                                                                                                               0
Location                                                                                                                                                     0
Dim1                                                                                                                                                         0
HALE_Birth                                                                                                                                                   0
HALE_60                                                                                                                                                      0
infant mortality 

In [8]:
# Backward fill for missing values ONLY in the year 2000, using later data from
# the same country.
def backfill_base_year(frame, base_year=2000, group_col='Location', time_col='Period'):
    """Fill gaps in the ``base_year`` rows from later rows of the same group.

    For every group (country) whose ``base_year`` rows contain nulls, each
    missing column is filled with the first NON-null value found in that
    group's later rows, taken in ascending ``time_col`` order. Rows of other
    years are never modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified. Its index is assumed to be unique.
    base_year : int, default 2000
        The period whose rows are filled.
    group_col : str, default 'Location'
        Column identifying the group that values may be borrowed from.
    time_col : str, default 'Period'
        Column holding the period used for ordering.

    Returns
    -------
    pandas.DataFrame
        A copy of ``frame`` with the ``base_year`` rows filled where possible.
    """
    # Work on a copy so the input is left untouched and the cell can be re-run.
    filled = frame.copy()

    # NOTE(review): rows are grouped by country only. If a country has several
    # rows per year (e.g. one per 'Dim1' category), values may be borrowed
    # across those categories - confirm whether 'Dim1' should be a group key.
    for _, group_rows in frame.groupby(group_col, sort=False):
        base_rows = group_rows[group_rows[time_col] == base_year]
        if not base_rows.isnull().values.any():
            continue  # nothing missing in the base year for this group

        # Stable sort keeps the file order among rows of the same period.
        later_rows = group_rows[group_rows[time_col] > base_year].sort_values(
            time_col, kind='stable'
        )
        if later_rows.empty:
            continue  # no future data to borrow from

        # After bfill(), row 0 holds, per column, the first non-null value
        # found in any later period (not merely the next row's value).
        first_available = later_rows.bfill().iloc[0]
        filled.loc[base_rows.index] = base_rows.fillna(first_available)

    return filled


data_filtered_cleaned = backfill_base_year(data_filtered_cleaned)

# Round values after backward fill to ensure consistency
data_filtered_cleaned = data_filtered_cleaned.round(2)

In [9]:
# Per-column null counts that remain after the year-2000 backward fill.
missing_values = data_filtered_cleaned.isna().sum()
print("Missing values after backward filling:", missing_values, sep="\n")

Missing values after backward filling:
Period                                                                                                                                                       0
ParentLocation                                                                                                                                               0
Location                                                                                                                                                     0
Dim1                                                                                                                                                         0
HALE_Birth                                                                                                                                                   0
HALE_60                                                                                                                                                      0
infant 

In [10]:
# Apply linear interpolation for the remaining missing values, separately
# within each country's time series.
# NOTE(review): if a country has several rows per year (e.g. one per 'Dim1'
# category), add that column to `interp_group_keys` so the series are not mixed.
interp_group_keys = ['Location']

# Linear interpolation works on row position, so each country's rows must be in
# chronological order first. The stable sort keeps file order among ties.
ordered = data_filtered_cleaned.sort_values(interp_group_keys + ['Period'], kind='stable')

# Interpolate numeric measurement columns only: text columns cannot be linearly
# interpolated, and 'Period' is the time axis itself.
interp_cols = list(
    ordered.select_dtypes(include='number').columns.difference(interp_group_keys + ['Period'])
)

# Start from a copy so the input frame, its index and its row order are kept;
# the interpolated values are aligned back onto it by index label.
# (Assumes the index is unique, as produced by read_csv.)
data_interpolated = data_filtered_cleaned.copy()
if interp_cols:
    data_interpolated[interp_cols] = (
        ordered.groupby(interp_group_keys)[interp_cols]
        .transform(lambda series: series.interpolate(method='linear'))
    )

# Round values after interpolation
data_interpolated = data_interpolated.round(2)




In [11]:
# Per-column null counts that remain after per-country interpolation.
missing_values = data_interpolated.isna().sum()
print("Missing values after interpolation:", missing_values, sep="\n")

Missing values after interpolation:
Period                                                                                                                                                       0
ParentLocation                                                                                                                                               0
Location                                                                                                                                                     0
Dim1                                                                                                                                                         0
HALE_Birth                                                                                                                                                   0
HALE_60                                                                                                                                                      0
infant mor

In [12]:
# Last-resort imputation: any value still missing is replaced by the mean of
# its column, computed over the whole interpolated dataset (numeric columns only).
column_means = data_interpolated.mean(numeric_only=True)
data_filled = data_interpolated.fillna(value=column_means)

# Keep two decimal places, as in the earlier stages.
data_filled_rounded = data_filled.round(decimals=2)

In [13]:
# Per-column null counts that remain after global mean imputation.
missing_values = data_filled_rounded.isna().sum()
print("Missing values after global mean imputation:", missing_values, sep="\n")

Missing values after global mean imputation:
Period                                                                                                                                                     0
ParentLocation                                                                                                                                             0
Location                                                                                                                                                   0
Dim1                                                                                                                                                       0
HALE_Birth                                                                                                                                                 0
HALE_60                                                                                                                                                    0
infant mortal

In [15]:
# Save the cleaned dataset (the DataFrame index is not written to the file).
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
output_path = '/Users/pranjalts/Desktop/RESEARCH/HALE/HALE/dataset/final1.csv'
data_filled_rounded.to_csv(output_path, index=False)

# Report the path that was actually written; the previous message named
# 'cleaned_dataset.csv', which is not the file this cell creates.
print(f"Data preprocessing complete! The cleaned dataset with rounded values is saved as '{output_path}'.")

Data preprocessing complete! The cleaned dataset with rounded values is saved as 'cleaned_dataset.csv'.
