In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# LOAD ALL PROCESSING CLASSES
from ama_tlbx.data import LifeExpectancyDataset, LECol

from ama_tlbx.analysis import (
    IQROutlierDetector,
    IsolationForestOutlierDetector,
    ZScoreOutlierDetector,
)
from ama_tlbx.plotting import (
    plot_correlation_heatmap,
    plot_explained_variance,
    plot_loadings_heatmap,
    plot_target_correlations,
    plot_top_correlated_pairs,
)


In [2]:
DATA_DIR = Path.cwd().parent / "_data"
if not DATA_DIR.exists():
    DATA_DIR = Path.cwd() / "_data"

assert DATA_DIR.exists(), f"Data directory {DATA_DIR} does not exist."

dataset = LifeExpectancyDataset.from_csv(
    DATA_DIR / "life_expectancy_data.csv",
    aggregate_by_country=True,  # Average across years per country
    drop_missing_target=True,  # Remove rows without life expectancy
)

print(f"   Shape: {dataset.df.shape}")
print(f"   Countries: {dataset.df.shape[0]}")
print(f"   Features: {dataset.df.shape[1] - len(dataset.identifier_columns)}")

print("\nColumn normalization examples:")
print(f"   'Life expectancy ' -> '{LECol.LIFE_EXPECTANCY}'")
print(f"   ' BMI ' -> '{LECol.BMI}'")
print(f"   'Income composition of resources' → '{LECol.INCOME_COMPOSITION}'")

dataset.df

   Shape: (183, 22)
   Countries: 183
   Features: 19

Column normalization examples:
   'Life expectancy ' -> 'life_expectancy'
   ' BMI ' -> 'bmi'
   'Income composition of resources' → 'income_composition_of_resources'


Unnamed: 0,country,adult_mortality,alcohol,bmi,diphtheria,gdp,hepatitis_b,hiv_aids,income_composition_of_resources,infant_deaths,...,percentage_expenditure,polio,population,schooling,thinness_1_19_years,thinness_5_9_years,total_expenditure,under_five_deaths,status,year
0,Afghanistan,269.0625,0.014375,15.51875,52.3125,340.015425,64.562500,0.10000,0.415375,78.2500,...,34.960110,48.3750,9.972260e+06,8.21250,16.58125,15.58125,8.252500,107.5625,Developing,2015-01-01
1,Albania,45.0625,4.848750,49.06875,98.0625,2119.726679,98.000000,0.10000,0.709875,0.6875,...,193.259091,98.1250,6.969116e+05,12.13750,1.61875,1.70000,5.945625,0.9375,Developing,2015-01-01
2,Algeria,108.1875,0.406667,48.74375,91.8750,2847.853392,78.000000,0.10000,0.694875,20.3125,...,236.185241,91.7500,2.164983e+07,12.71250,6.09375,5.97500,4.604000,23.5000,Developing,2015-01-01
3,Angola,328.5625,5.740667,18.01875,47.6875,1975.143045,70.222222,2.36875,0.458375,83.7500,...,102.100268,46.1250,1.014710e+07,8.04375,6.19375,6.66875,3.919333,132.6250,Developing,2015-01-01
4,Antigua and Barbuda,127.5000,7.949333,38.42500,98.3125,9759.305728,98.266667,0.12500,0.488625,0.0000,...,1001.585226,96.9375,1.274585e+07,8.84375,3.42500,3.37500,4.791333,0.0000,Developing,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Venezuela (Bolivarian Republic of),163.0000,7.420000,54.48750,68.5000,7389.061605,66.250000,0.10000,0.726812,9.3750,...,0.000000,74.6875,1.274585e+07,12.78750,1.65000,1.56250,4.998667,10.7500,Developing,2015-01-01
179,Viet Nam,126.5625,3.087333,11.18750,91.7500,7389.061605,87.538462,0.14375,0.627062,29.1875,...,0.000000,94.9375,1.274585e+07,11.51250,14.92500,15.62500,5.977333,36.5000,Developing,2015-01-01
180,Yemen,211.8125,0.047333,33.48750,72.6250,7389.061605,55.687500,0.10000,0.475500,39.3750,...,0.000000,67.1250,1.274585e+07,8.50625,13.83125,13.75000,5.005333,51.6250,Developing,2015-01-01
181,Zambia,354.3125,2.239333,17.45000,74.2500,811.811841,69.818182,11.93125,0.498437,33.4375,...,89.650407,64.3750,6.260246e+06,11.21250,6.88125,6.76250,5.824000,52.3750,Developing,2015-01-01


In [3]:
from ama_tlbx.analysis import (
    ColumnConcatenator,
)


# SELECT COLUMNS FOR CONCAT
selected_columns = [
    LECol.HEPATITIS_B,
    LECol.POLIO,
    LECol.DIPHTHERIA,
]

# INITIALISE THE CONCATINATOR
column_concatinator = ColumnConcatenator(dataset)

trimmed_dataset = column_concatinator.concatenate(
    columns=selected_columns,
    new_column_name="Immunisation Rate"
)

print("trimmed dataset")
print(trimmed_dataset.df["Immunisation Rate"])

trimmed dataset
0      55.032880
1      98.062408
2      87.303285
3      54.538586
4      97.850537
         ...    
178    69.776710
179    91.410879
180    65.305836
181    69.592487
182    73.819366
Name: Immunisation Rate, Length: 183, dtype: float64


In [4]:
from ama_tlbx.analysis import (
    ColumnConcatenator,
)

corr_analyzer = dataset.make_correlation_analyzer(
    columns=selected_columns,
    standardized=True,  # Use standardized data
    include_target=True,
)
corr_result = corr_analyzer.compute(top_n_pairs=10)
plot_correlation_heatmap(corr_result)

ValueError: Target column 'life_expectancy' not found in data