In [2]:
# inspect_pipeline.py
from preprocessors import (
    LowerCaseStrings,
    StringConverter,
    YearExtractor,
    IQRCapper,
    ConstructionYearTransformer,
    ObjectToNumericConverter,
    AgeCalculator,
    FrequencyEncoder,
    RegionCodeCombiner,
    ColumnDropper,
    AgePipeline,
    GeoContextImputer
)
import joblib
import numpy as np

# Restore the fitted pipeline from disk. joblib.load unpickles the file, so it
# should only ever be pointed at an artifact you trust.
pipeline = joblib.load("full_pipeline.joblib")

# Look up the two steps of interest by their registered names:
# "preprocessor" (the ColumnTransformer) and "feature_selection" (the selector).
preprocessor, selector = (
    pipeline.named_steps[step_name]
    for step_name in ("preprocessor", "feature_selection")
)

# Feature names emitted by the preprocessor; the recorded output shows prefixed
# names such as "age__age", "num__amount_tsh", "cat__source_class_surface".
all_proc_feature_names = preprocessor.get_feature_names_out()

# Boolean mask from the selector (expected to line up one-to-one with the names
# above); indexing with it leaves only the features that were kept.
mask = selector.get_support()
important_features = np.asarray(all_proc_feature_names)[mask]

# Report: size of the feature space before selection, plus a short sample.
print("\n🚨 Total features after preprocessing (before selection):", len(all_proc_feature_names))
print("   Example:", all_proc_feature_names[:10], "…\n")

# Report: how many features survived selection, then one bullet per feature.
print("🚨 Number of features kept by SelectFromModel:", len(important_features))
print("🚨 These are the features that the RandomForest sees:\n")
for feat in important_features:
    print("   •", feat)



🚨 Total features after preprocessing (before selection): 116
   Example: ['age__age' 'num__amount_tsh' 'cat__source_class_surface'
 'cat__source_class_unknown' 'cat__water_quality_fluoride'
 'cat__water_quality_fluoride abandoned' 'cat__water_quality_milky'
 'cat__water_quality_salty' 'cat__water_quality_salty abandoned'
 'cat__water_quality_soft'] …

🚨 Number of features kept by SelectFromModel: 88
🚨 These are the features that the RandomForest sees:

   • age__age
   • num__amount_tsh
   • cat__source_class_surface
   • cat__water_quality_milky
   • cat__water_quality_salty
   • cat__water_quality_salty abandoned
   • cat__water_quality_soft
   • cat__water_quality_unknown
   • cat__payment_type_monthly
   • cat__payment_type_never pay
   • cat__payment_type_on failure
   • cat__payment_type_other
   • cat__payment_type_per bucket
   • cat__payment_type_unknown
   • cat__extraction_type_class_handpump
   • cat__extraction_type_class_motorpump
   • cat__extraction_type_class_other
  

In [1]:
import pandas as pd

# Read the training-set values CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/Training_Set_Values.csv")

In [7]:
# 1. Turn empty-string subvillage entries into missing values (pd.NA) so the
#    imputation step below treats them like any other gap.
df['subvillage'] = df['subvillage'].replace({'': pd.NA})

# 2. Function to fill NaN with fallback hierarchy
def fill_subvillage(row, frame=None):
    """Return the row's subvillage, imputing it when it is missing.

    A missing subvillage is replaced by the most frequent subvillage among the
    rows that share the same value for, in order of preference:

    1. ``ward``
    2. ``lga``
    3. ``district_code``

    The first level that yields at least one non-missing subvillage wins. When
    several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used. If no level produces a candidate the literal
    string ``'unknown'`` is returned.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` produced by ``DataFrame.apply``)
        Must provide ``'subvillage'``, ``'ward'``, ``'lga'`` and
        ``'district_code'``.
    frame : pandas.DataFrame, optional
        Table the modes are computed from. Defaults to the notebook-level
        ``df``, which keeps ``df.apply(fill_subvillage, axis=1)`` working
        unchanged.

    Returns
    -------
    The original subvillage when present, otherwise the imputed value or
    ``'unknown'``.
    """
    # Nothing to impute: hand back the existing value untouched.
    if not pd.isna(row['subvillage']):
        return row['subvillage']

    # Fall back to the global frame only when the caller did not supply one.
    source = df if frame is None else frame

    # NOTE(review): the last level matches on district_code alone. The recorded
    # outputs in this notebook show the same district code under more than one
    # region, so this level pools rows across regions -- confirm that is intended.
    for key in ('ward', 'lga', 'district_code'):
        candidates = source.loc[source[key] == row[key], 'subvillage'].mode()
        if not candidates.empty:
            return candidates.iloc[0]

    return 'unknown'

# 3. Run the imputation row by row and store the result back in the column.
df['subvillage'] = df.apply(fill_subvillage, axis='columns')

In [4]:
# Print the DataFrame's plain-text representation (the recorded output is
# abbreviated by pandas with "..." rows and wrapped column groups).
print(df)

          id  amount_tsh date_recorded           funder  gps_height  \
0      69572      6000.0    2011-03-14            Roman        1390   
1       8776         0.0    2013-03-06          Grumeti        1399   
2      34310        25.0    2013-02-25     Lottery Club         686   
3      67743         0.0    2013-01-28           Unicef         263   
4      19728         0.0    2011-07-13      Action In A           0   
...      ...         ...           ...              ...         ...   
59395  60739        10.0    2013-05-03  Germany Republi        1210   
59396  27263      4700.0    2011-05-07      Cefa-njombe        1212   
59397  37057         0.0    2011-04-11              NaN           0   
59398  31282         0.0    2011-03-08            Malec           0   
59399  26348         0.0    2011-03-23       World Bank         191   

          installer  longitude   latitude              wpt_name  num_private  \
0             Roman  34.938093  -9.856322                  none    

In [3]:
import pandas as pd

# 1) Load the complete training-set values; the path is relative to the notebook.
df = pd.read_csv("../data/Training_Set_Values.csv")

# 2) Keep just the geographic columns and collapse exact duplicate rows.
#    latitude/longitude are part of the key, so rows only merge when every
#    listed column, coordinates included, is identical.
geo_cols = [
    "region",
    "region_code",
    "district_code",
    "lga",
    "ward",
    "subvillage",
    "basin",
    "latitude",
    "longitude",
]
geo_lookup = df.loc[:, geo_cols].drop_duplicates(ignore_index=True)

# 3) Quick sanity check: row count and the first ten entries.
print("Total unique rows in lookup:", len(geo_lookup))
print(geo_lookup.iloc[:10])

# 4) Write the lookup table next to the source data, without the index column.
geo_lookup.to_csv("../data/geo_lookup.csv", index=False)


Total unique rows in lookup: 58663
      region  region_code  district_code              lga        ward  \
0     Iringa           11              5           Ludewa    Mundindi   
1       Mara           20              2        Serengeti       Natta   
2    Manyara           21              4        Simanjiro     Ngorika   
3     Mtwara           90             63         Nanyumbu    Nanyumbu   
4     Kagera           18              1          Karagwe  Nyakasimbi   
5      Tanga            4              8           Mkinga         Moa   
6  Shinyanga           17              3  Shinyanga Rural      Samuye   
7  Shinyanga           17              3           Kahama      Chambo   
8     Tabora           14              6     Tabora Urban    Itetemia   
9     Kagera           18              1          Karagwe      Kaisho   

        subvillage                    basin   latitude  longitude  
0         Mnyusi B               Lake Nyasa  -9.856322  34.938093  
1          Nyamara       

In [2]:
import pandas as pd

# 1) Load the full training CSV
df = pd.read_csv("../data/Training_Set_Values.csv")

# 2) Filter for one region (compared case-insensitively) and one district code.
#    The targets are named once so the filter and the printed label cannot drift
#    apart.
target_region = "mwanza"
target_district_code = 1
mask = (df["region"].str.lower() == target_region) & (df["district_code"] == target_district_code)
subset = df.loc[mask]

# 3) Extract and print the unique subvillage names (dropping any NaN)
subvillages = subset["subvillage"].dropna().unique().tolist()
print(f"Subvillages in {target_region}, district {target_district_code}:", sorted(subvillages))


Subvillages in mwanza, district 1: ['Alugoma', 'Amgera', 'Amrumo', 'Amuhama', 'Amwama', 'Apembe', 'Azimio', 'Azimio A', 'Bahati', 'Bondeni', 'Bubange', 'Bugando', 'Buganza', 'Bugengere', 'Bugomwa', 'Bugoye', 'Buhaganzara', 'Bukesela', 'Bukimwi', 'Bukindo Nkokolo', 'Bukongokati', 'Bukulungila', 'Bukumi', 'Bukungu', 'Bunguru', 'Busele', 'Busenda', 'Busiri Center', 'Busumagu', 'Butiama', 'Butiliti', 'Buyanja', 'Buyoga', 'Buzegwe Ofisini', 'Bwisya', 'Bwiyombe', 'Ccm', 'Center', 'Chamatule', 'Chamlindi', 'Chang`Ombe', 'Chankobe', 'Chemba', 'Chemchemu', 'Chirago', 'Corner', 'Ebhugwe', 'Elimu', 'Emama', 'Gallu', 'Habare', 'Halwaya', 'Halwego Kati', 'Halweya B', 'Hamkuno', 'Harulalo', 'Huduma', 'Igatengwa', 'Iringo', 'Irondo Mgharibi', 'Jabatundu', 'Jamnono', 'Jiranimwema', 'Jiwekuu', 'Kabakala', 'Kabakara', 'Kabakarz', 'Kabanga', 'Kabasuma', 'Kagera', 'Kagulembela', 'Kakerege', 'Kakoma', 'Kambarage', 'Kamengo', 'Kamote', 'Kanisani', 'Karibusele', 'Kasahunga', 'Kasalu', 'Kaseniziwani', 'Kasozu

In [11]:
# Add a lower-cased copy of the region name to df; it is the grouping key below.
df["region_lower"] = df["region"].str.lower()

# Map each lower-cased region to the list of distinct basins observed for it.
# Rows missing either the region or the basin are left out before grouping.
region_to_basins = {
    region: list(basins)
    for region, basins in (
        df.dropna(subset=["region_lower", "basin"])
        .groupby("region_lower")["basin"]
        .unique()
        .items()
    )
}


In [13]:
# Print the region_to_basins dict (lower-cased region name -> list of basins).
print(region_to_basins)

{'arusha': ['Pangani', 'Internal', 'Lake Victoria'], 'dar es salaam': ['Wami / Ruvu'], 'dodoma': ['Wami / Ruvu', 'Internal', 'Rufiji'], 'iringa': ['Lake Nyasa', 'Rufiji'], 'kagera': ['Lake Victoria', 'Lake Tanganyika'], 'kigoma': ['Lake Tanganyika'], 'kilimanjaro': ['Pangani', 'Internal'], 'lindi': ['Ruvuma / Southern Coast', 'Rufiji'], 'manyara': ['Pangani', 'Internal', 'Wami / Ruvu'], 'mara': ['Lake Victoria'], 'mbeya': ['Lake Nyasa', 'Rufiji', 'Lake Rukwa'], 'morogoro': ['Rufiji', 'Wami / Ruvu'], 'mtwara': ['Ruvuma / Southern Coast'], 'mwanza': ['Lake Victoria', 'Lake Tanganyika'], 'pwani': ['Wami / Ruvu', 'Rufiji'], 'rukwa': ['Lake Tanganyika', 'Lake Rukwa'], 'ruvuma': ['Lake Nyasa', 'Ruvuma / Southern Coast', 'Rufiji'], 'shinyanga': ['Internal', 'Lake Tanganyika', 'Lake Victoria'], 'singida': ['Internal', 'Rufiji', 'Lake Rukwa', 'Lake Tanganyika'], 'tabora': ['Lake Tanganyika', 'Internal', 'Lake Rukwa', 'Rufiji'], 'tanga': ['Pangani', 'Wami / Ruvu']}


In [4]:
import pandas as pd

# Reload the training-set values.
df = pd.read_csv(filepath_or_buffer="../data/Training_Set_Values.csv")

# Print every distinct, non-missing source_type value in sorted order.
print(sorted(set(df["source_type"].dropna())))


['borehole', 'dam', 'other', 'rainwater harvesting', 'river/lake', 'shallow well', 'spring']


In [2]:
from preprocessors import (
    LowerCaseStrings,
    StringConverter,
    YearExtractor,
    IQRCapper,
    ConstructionYearTransformer,
    ObjectToNumericConverter,
    AgeCalculator,
    FrequencyEncoder,
    RegionCodeCombiner,
    ColumnDropper,
    AgePipeline,
    GeoContextImputer,
)

import joblib

# Load the saved pipeline once. The preprocessors import at the top of this cell
# is presumably what lets the custom transformer classes resolve during
# unpickling. joblib.load is pickle-based, so only load files you trust.
pipeline = joblib.load("full_pipeline.joblib")

# Inspect which raw column names the ColumnTransformer expects:
print(pipeline.named_steps["preprocessor"].feature_names_in_)


['amount_tsh' 'date_recorded' 'funder' 'gps_height' 'installer'
 'longitude' 'latitude' 'wpt_name' 'basin' 'subvillage' 'region_code'
 'district_code' 'lga' 'ward' 'population' 'public_meeting'
 'scheme_management' 'scheme_name' 'permit' 'construction_year'
 'extraction_type_class' 'management' 'management_group' 'payment'
 'payment_type' 'water_quality' 'quantity' 'source_type' 'source_class'
 'waterpoint_type' 'region_with_code']
['amount_tsh' 'date_recorded' 'funder' 'gps_height' 'installer'
 'longitude' 'latitude' 'wpt_name' 'basin' 'subvillage' 'region_code'
 'district_code' 'lga' 'ward' 'population' 'public_meeting'
 'scheme_management' 'scheme_name' 'permit' 'construction_year'
 'extraction_type_class' 'management' 'management_group' 'payment'
 'payment_type' 'water_quality' 'quantity' 'source_type' 'source_class'
 'waterpoint_type' 'region_with_code']


In [1]:
import json

# Load the district-level GeoJSON and show the property names of its first
# feature. The context manager closes the file handle as soon as parsing is
# done, and the encoding is pinned to UTF-8 instead of the platform default.
with open("../data/gadm41_TZA_shp/gadm41_TZA_2.json", encoding="utf-8") as geojson_file:
    geo_district = json.load(geojson_file)
print(geo_district["features"][0]["properties"].keys())


dict_keys(['GID_2', 'GID_0', 'COUNTRY', 'GID_1', 'NAME_1', 'NL_NAME_1', 'NAME_2', 'VARNAME_2', 'NL_NAME_2', 'TYPE_2', 'ENGTYPE_2', 'CC_2', 'HASC_2'])


In [3]:
# Rename the per-district summary columns to the names handed to the choropleth.
# NOTE(review): `agg_district` is not defined in any cell visible here (this cell's
# recorded output is a NameError), and no `import folium` is visible either; both
# must exist before this cell can run on a fresh kernel.
district_summary_df = agg_district.rename(columns={
    "district_code": "ID_2",
    "pct_functional": "pct_functional"  # maps the column to itself, so this entry changes nothing
})

# Build a choropleth that joins district_summary_df to the GeoJSON on "ID_2".
# NOTE(review): the property keys printed for this GeoJSON elsewhere in the notebook
# (GID_2, NAME_2, CC_2, ...) do not include "ID_2" -- TODO confirm the join key.
# NOTE(review): the returned Choropleth is neither assigned nor added to a map here.
folium.Choropleth(
    geo_data=geo_district,
    data=district_summary_df,
    columns=["ID_2","pct_functional"],
    key_on="feature.properties.ID_2",

)


NameError: name 'agg_district' is not defined