## Part 3 - Feature Engineering

In [9]:
import os

import numpy as np
import pandas as pd


In [10]:
# Output directory. The trailing slash matters: the next line builds the
# file path by plain string concatenation.
output_path = "../output/"
# Demographics table, read from the output directory.
df_demographics = pd.read_csv(output_path + 'demographics_data.csv')
# GDP-per-capita and population tables, read from the parent directory.
df_gdp = pd.read_csv('../gdp_per_capita_2021.csv')
df_pop = pd.read_csv('../population_2021.csv')


## 5.1 New Feature

In [19]:
def create_total_gdp(df_final):
    """
    Adds a TotalGDP column to df_final and returns the same frame.

    TotalGDP is the row-wise product of the 'GDP_per_capita_PPP' and
    'Population' columns. The new column is written onto the frame that
    was passed in; no copy is made.
    """
    print("Creating Total GDP feature...")

    # Total GDP = GDP per capita PPP x Population, computed row by row
    per_capita = df_final['GDP_per_capita_PPP']
    headcount = df_final['Population']
    df_final['TotalGDP'] = per_capita.mul(headcount)

    print("Total GDP feature created.\n")
    return df_final


## 5.2 Log Transforms

In [21]:
def apply_log_transforms(df_final):
    """
    Adds log10 versions of GDP per capita PPP and Population to df_final.

    Two columns are written onto the frame that was passed in, which is
    then returned:
      - 'LogGDPperCapita' from 'GDP_per_capita_PPP'
      - 'LogPopulation'   from 'Population'

    Values below 1e-10 are raised to 1e-10 before the logarithm is taken,
    so zero or negative inputs come out as -10. A warning is printed for
    each source column that contains a value <= 0.
    """
    print("Applying log transformations...")

    # (source column, warning text, output column)
    plan = (
        (
            'GDP_per_capita_PPP',
            "Warning: Some GDP per capita values are non-positive. These will be clipped to a small positive value.",
            'LogGDPperCapita',
        ),
        (
            'Population',
            "Warning: Some Population values are non-positive. These will be clipped to a small positive value.",
            'LogPopulation',
        ),
    )

    # First pass: only inspect the inputs and report non-positive values
    for source_col, warning_text, _ in plan:
        has_non_positive = (df_final[source_col] <= 0).any()
        if has_non_positive:
            print(warning_text)

    # Second pass: clip to the positive floor, then take log10
    for source_col, _, log_col in plan:
        floored = df_final[source_col].clip(lower=1e-10)
        df_final[log_col] = np.log10(floored)

    print("Log transformations completed.\n")
    return df_final

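Create feature matrix (defined here, but it reads the `Normalized_*` columns, so call it only after the scaling step in 5.3)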

In [22]:
def create_feature_matrix(df_final, output_path):
    """
    Builds the final feature matrix and saves it as X.npy inside output_path.

    Rows are ordered by the sorted index of df_final. Columns are, in this
    order, 'Normalized_LifeExpectancy_Both', 'Normalized_LogGDPperCapita'
    and 'Normalized_LogPopulation'.

    Parameters:
        df_final: DataFrame that already contains the three Normalized_* columns.
        output_path: directory to write X.npy into; a trailing separator is
            optional because the file path is built with os.path.join.

    Returns:
        The feature matrix as a NumPy array of shape (n_rows, 3).

    Raises:
        KeyError: if any of the Normalized_* columns is missing.
    """
    print("Creating final feature matrix...")

    # Normalized columns in the required order
    features = ['LifeExpectancy_Both', 'LogGDPperCapita', 'LogPopulation']
    normalized_columns = [f"Normalized_{feature}" for feature in features]

    # Fail with an actionable message instead of pandas' generic indexing error
    missing_columns = [col for col in normalized_columns if col not in df_final.columns]
    if missing_columns:
        raise KeyError(
            f"Missing normalized feature columns {missing_columns}; "
            "run normalize_features before create_feature_matrix."
        )

    # Sort by index so the row order of X is deterministic
    X = df_final.sort_index()[normalized_columns].to_numpy()

    # os.path.join keeps the file inside output_path whether or not the
    # directory string ends with a separator
    matrix_path = os.path.join(output_path, 'X.npy')
    np.save(matrix_path, X)

    print(f"Feature matrix with shape {X.shape} saved to {matrix_path}\n")
    return X

## 5.3 Scaling

In [23]:
def normalize_features(df_final, output_path):
    """
    Applies z-score normalization to selected features

    For each of 'LifeExpectancy_Both', 'LogGDPperCapita' and 'LogPopulation'
    a 'Normalized_<feature>' column is written onto df_final, computed as
    (value - column mean) / column standard deviation. The standard
    deviation is the pandas default (sample standard deviation).

    If a column has no spread (standard deviation is 0 or undefined, e.g. a
    constant column or a single-row frame), dividing would fill the column
    with NaN/inf. In that case the normalized values are set to 0 instead
    (missing inputs stay missing) and a warning is printed.

    Parameters:
        df_final: DataFrame containing the three feature columns.
        output_path: not used by this function; kept so existing calls
            that pass it keep working.

    Returns:
        df_final, with the three Normalized_* columns added.
    """
    print("Applying z-score normalization...")

    # Select the three columns to normalize
    features = ['LifeExpectancy_Both', 'LogGDPperCapita', 'LogPopulation']

    # Calculate mean and standard deviation for each feature
    feature_means = df_final[features].mean()
    feature_stds = df_final[features].std()

    # Apply z-score normalization
    for feature in features:
        normalized_feature = f"Normalized_{feature}"
        centered = df_final[feature] - feature_means[feature]
        spread = feature_stds[feature]

        if pd.isna(spread) or spread == 0:
            # No variation to scale by: emit zeros rather than NaN/inf.
            # Multiplying by 0.0 keeps NaN where the input itself was missing.
            print(f"Warning: {feature} has no spread (std={spread}); normalized values set to 0.")
            df_final[normalized_feature] = centered * 0.0
        else:
            df_final[normalized_feature] = centered / spread

    print("Z-score normalization completed.\n")
    return df_final


## 5.4 Data Integration (inner join)

In [24]:
def data_integration(df_demographics, df_gdp, df_pop, output_path):
    """
    Combines the three datasets into a single analysis dataset using inner join

    Steps:
      1. Index each input by 'Country' (inputs already indexed that way are
         used as they are). The frames passed in are not modified.
      2. Inner-join demographics, GDP and population on that index.
      3. Write the countries present in at least one input but absent from
         the join result to 'lost_countries.csv' inside output_path.
      4. Fill missing numeric values with the column mean of the joined
         frame, then drop rows with a missing non-numeric value.

    Parameters:
        df_demographics, df_gdp, df_pop: DataFrames that either have a
            'Country' column or are already indexed by 'Country'.
        output_path: directory for lost_countries.csv; a trailing separator
            is optional because the path is built with os.path.join.

    Returns:
        The joined and cleaned DataFrame, indexed by Country.
    """
    print("Performing data integration...\n")

    def _indexed_by_country(frame):
        # set_index without inplace returns a new frame, so the caller's
        # object keeps its original index and columns
        if frame.index.name == "Country":
            return frame
        return frame.set_index("Country")

    df_demographics = _indexed_by_country(df_demographics)
    df_gdp = _indexed_by_country(df_gdp)
    df_pop = _indexed_by_country(df_pop)

    # Perform an inner join on Country
    df_final = df_demographics.join(df_gdp, how="inner").join(df_pop, how="inner")

    # Record how many countries remain after the merge
    countries_after_merge = len(df_final)
    print(f"Number of countries after merging: {countries_after_merge}")

    # Lost countries = present in at least one input, absent from the join
    all_countries = set(df_demographics.index) | set(df_gdp.index) | set(df_pop.index)
    lost_countries = all_countries - set(df_final.index)

    lost_countries_path = os.path.join(output_path, 'lost_countries.csv')
    lost_countries_df = pd.DataFrame({'Country': sorted(lost_countries)})
    lost_countries_df.to_csv(lost_countries_path, index=False)
    print(f"Saved {len(lost_countries)} lost countries to {lost_countries_path}")

    # Check for any remaining missing values
    missing_values = df_final.isnull().sum()
    print("Missing values in merged dataset:")
    print(missing_values)

    # Replace missing numeric values with column mean
    numeric_columns = df_final.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        n_missing = df_final[col].isnull().sum()
        if n_missing > 0:  # Only process if column has nulls
            print(f"Replacing {n_missing} missing values in {col} with mean")
            # Assign the filled column back. Calling fillna(inplace=True) on
            # df_final[col] acts on a temporary object and is not guaranteed
            # to update df_final (it does nothing under pandas Copy-on-Write).
            df_final[col] = df_final[col].fillna(df_final[col].mean())

    # Remove rows with missing categorical values
    categorical_columns = df_final.select_dtypes(exclude=['number']).columns
    if len(categorical_columns) > 0 and df_final[categorical_columns].isnull().sum().sum() > 0:
        before_rows = len(df_final)
        df_final = df_final.dropna(subset=categorical_columns)
        after_rows = len(df_final)
        print(f"Removed {before_rows - after_rows} rows with missing categorical values")

    print("Data integration completed.\n")
    return df_final

# data_integration(df_demographics, df_gdp, df_pop, output_path)

Main function: run feature engineering

In [25]:
def run_feature_engineering(df_demographics, df_gdp, df_pop, output_path):
    """
    Runs the complete feature engineering pipeline

    Order of operations: data integration, Total GDP feature, log
    transforms, z-score normalization, then feature-matrix creation.

    Returns a tuple (df_final, X): the engineered DataFrame and the matrix
    returned by create_feature_matrix.
    """
    # Steps 1-4: integrate, then feed the frame through each transform in turn
    df_final = (
        data_integration(df_demographics, df_gdp, df_pop, output_path)
        .pipe(create_total_gdp)
        .pipe(apply_log_transforms)
        .pipe(normalize_features, output_path)
    )

    # Step 5: build and save the feature matrix
    X = create_feature_matrix(df_final, output_path)

    return df_final, X

# Run the whole pipeline on the three frames loaded at the top of the notebook.
# df_final receives the engineered DataFrame, X the feature matrix.
df_final, X = run_feature_engineering(df_demographics, df_gdp, df_pop, output_path)

Performing data integration...

Number of countries after merging: 173
Saved 108 lost countries to ../output/lost_countries.csv
Missing values in merged dataset:
LifeExpectancy_Both           0
LifeExpectancy_Female         0
LifeExpectancy_Male           0
UrbanPopulation_Percentage    0
UrbanPopulation_Absolute      5
PopulationDensity             0
GDP_per_capita_PPP            0
Population                    0
dtype: int64
Replacing 5 missing values in UrbanPopulation_Absolute with mean
Data integration completed.

Creating Total GDP feature...
Total GDP feature created.

Applying log transformations...
Log transformations completed.

Applying z-score normalization...
Z-score normalization completed.

Creating final feature matrix...
Feature matrix with shape (173, 3) saved to ../output/X.npy

