In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestRegressor

# Load the two source workbooks: the world-level table the models are
# trained on, and the India table that receives the predicted covariates.
world_df = pd.read_excel(io="deduplicated_by_mode.xlsx")
india_df = pd.read_excel(io="india_imputed_full_covariates.xlsx")

def clean_screening_year(val):
    """Convert a raw screening-start cell into a number.

    Args:
        val: Raw cell value (number, numeric string, free text, or missing).

    Returns:
        ``float(val)`` when the value converts cleanly; ``0`` when it is the
        text "not started" (case-insensitive, surrounding whitespace
        ignored); ``np.nan`` for anything else.
    """
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of becoming NaN.
        if str(val).strip().lower() == "not started":
            return 0
        return np.nan
# Normalise the screening-start column in place, one cell at a time.
world_df["Start of Screening coverage (year)"] = (
    world_df["Start of Screening coverage (year)"].apply(clean_screening_year)
)

def map_male_circumcision(val):
    """Map a male-circumcision band or number to a single numeric value.

    Textual bands are replaced by a representative value: a string
    containing ``<`` becomes 10, a string containing a hyphen or en dash
    (a range) becomes 50, and a string containing ``>`` becomes 90.
    Anything else is converted with ``float``.

    Args:
        val: Raw cell value (band text, number, numeric string, or missing).

    Returns:
        10, 50 or 90 for a recognised band, ``float(val)`` for a plain
        number, or ``np.nan`` when the value cannot be interpreted.
    """
    if isinstance(val, str):
        val = val.strip()
        if not any(ch.isdigit() for ch in val):
            # Text without a single digit cannot be a band or a number.
            # In particular a lone "-" / "–" placeholder must not be read
            # as a range and turned into 50.
            return np.nan
        if '<' in val:
            return 10
        if '–' in val or '-' in val:
            return 50
        if '>' in val:
            return 90
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Conversion failures mean "unknown"; other exceptions propagate.
        return np.nan
# Replace the circumcision bands/numbers with numeric values in place.
world_df["Male circumcision (WHO 2007)"] = (
    world_df["Male circumcision (WHO 2007)"].apply(map_male_circumcision)
)

# Column-name crosswalk between the two tables.
#   key   -> column name looked up in india_df
#   value -> the corresponding column name looked up in world_df
# Pairs whose columns exist in both tables are used as model predictors,
# and the insertion order here is the order of those predictor columns.
feature_map = {
    "Total_fertility_rate_2018_Total": "Total Fertility rate (2017)",
    "Contraception_Any method (%)": "Contraception use (updated 2019)",
    "All women age 15-49 years who are anaemic (%)": "Anemia prevalence among women of reproductive age (% of women ages 15-49)",
    "Pregnant women age 15-49 years who are anaemic (<11.0 g/dl) (%)": "Anemia prevalence among pregnant women (%)",
    "Non-pregnant women age 15-49 years who are anaemic (<12.0 g/dl) (%)": "Anemia prevalence among non-pregnant women (% of women ages 15-49)",
    "HIV_2018-19": "HIV Prevalence (in adults)",
    "Marital_age_2020_All Ages": "Mean marital age",
    "Diabetes": "Diabetes Prevalence",
    "Hypertension": "Hypertension",
    "Life_Expectancy_SRS 2016-20 Female": "Life expectancy at birth",
    "Mean_schoolingUrban_Female": "Mean years of schooling",
    "2017-18_Expected_Years_Female": "Expected years of schooling",
    "2011-12_Income_Female": "Gross national income (GNI) per capita",
    "Upto 2010": "HPV vaccination introduction",
    "TB_2020_Total_Notified": "Incidence of TB"
}

# Column inventories of both tables.
world_features = world_df.columns.tolist()
indian_features = india_df.columns.tolist()

# World-side names of every mapping whose India-side column is present.
all_possible_mapped = list(feature_map.keys())
mapped_world_cols = [
    world_name
    for india_name, world_name in feature_map.items()
    if india_name in india_df.columns
]

# Candidate covariates to impute: numeric world columns that are not already
# mapped from India data and whose name contains none of the excluded
# keywords (the keyword match is case-sensitive).
exclude_keywords = ['prevalence', 'cases', 'Country', 'Continent', 'Sample size']
missing_world_covariates = [
    col
    for col in world_features
    if not (
        col in mapped_world_cols
        or world_df[col].dtype not in ('float64', 'int64')
        or any(key in col for key in exclude_keywords)
    )
]

# Covariates that must be imputed even if the automatic filter dropped them
# (for example because their name contains an excluded keyword).
manual_additions = [
    "Start of Screening coverage (year)",
    "Male circumcision (WHO 2007)",
    "Multiple pregnancies (%age)",
    "Smoking Prevalence (Current smoking prevalence females, 2016)",
    "Anemia prevalence among women of reproductive age (% of women ages 15-49)",
    "Sexual Initiation age",
]
for col in manual_additions:
    if col not in world_df.columns:
        continue  # nothing to train on for this covariate
    if col in missing_world_covariates:
        continue  # already selected by the automatic filter
    missing_world_covariates.append(col)

# Predictors are the mapped covariates present in BOTH tables. They do not
# depend on the target, so resolve them once instead of on every iteration.
available_indian_cols = [
    k for k in feature_map
    if k in india_df.columns and feature_map[k] in world_df.columns
]
world_input_cols = [feature_map[k] for k in available_indian_cols]

# Maps each successfully imputed world covariate to its predicted India values.
imputed_features = {}
for target_col in missing_world_covariates:
    if target_col in world_input_cols:
        # Selecting the target next to itself would duplicate the column, so
        # the target would leak into the predictors and the training frame
        # would no longer match the India feature frame. Skip explicitly
        # instead of relying on the generic failure path below.
        print(f"⚠️ Skipped {target_col}: already used as a predictor")
        continue
    try:
        # Train only on world rows where every predictor and the target
        # are known.
        temp_df = world_df[world_input_cols + [target_col]].dropna()
        if len(temp_df) < 5:
            print(f"⚠️ Skipped {target_col}: insufficient data ({len(temp_df)} rows)")
            continue
        X_train = temp_df[world_input_cols]
        y_train = temp_df[target_col]

        # Fixed seed keeps the imputed values reproducible between runs.
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Rename the India columns to their world names so the feature
        # frame lines up with what the model was trained on.
        X_india = india_df[available_indian_cols].copy()
        X_india.columns = world_input_cols
        imputed_features[target_col] = model.predict(X_india)
        print(f"✅ Imputed: {target_col}")
    except Exception as e:
        # Best effort: one failing covariate must not stop the others.
        print(f"❌ Could not impute {target_col}: {e}")

def categorize(val):
    """Bucket a male-circumcision value into "Low", "Medium" or "High".

    Args:
        val: Numeric value, possibly missing.

    Returns:
        "Low" below 20, "Medium" from 20 to 80 inclusive, "High" above 80,
        and ``np.nan`` when the value is missing. Without the explicit
        missing-value check a NaN fails both comparisons and would be
        labelled "High".
    """
    if pd.isna(val):
        return np.nan
    if val < 20:
        return "Low"
    elif val <= 80:
        return "Medium"
    else:
        return "High"


if "Male circumcision (WHO 2007)" in india_df.columns or "Male circumcision (WHO 2007)" in imputed_features:
    # Prefer a column already present in the India table; otherwise fall
    # back to the freshly imputed values.
    if "Male circumcision (WHO 2007)" not in india_df.columns:
        india_df["Male circumcision (WHO 2007)"] = imputed_features["Male circumcision (WHO 2007)"]
    india_df["Male circumcision category"] = india_df["Male circumcision (WHO 2007)"].apply(categorize)
    # One indicator column per category that occurs; rows with a missing
    # category get no indicator set.
    dummies = pd.get_dummies(india_df["Male circumcision category"], prefix="Male circumcision category")
    india_df = pd.concat([india_df, dummies], axis=1)

imputed_df = pd.DataFrame(imputed_features)

# An imputed covariate may already exist in the India table (for instance
# after being copied there for categorisation). Concatenating it again would
# produce two columns with the same name, so the existing column wins.
already_present = [col for col in imputed_df.columns if col in india_df.columns]
if already_present:
    print(f"⚠️ Not re-adding columns already in India data: {already_present}")

# Reset the India index so rows align positionally with the predictions.
india_extended = pd.concat(
    [india_df.reset_index(drop=True), imputed_df.drop(columns=already_present)],
    axis=1,
)

# Single source of truth for the file name, so the confirmation message
# always reports the file that was actually written.
output_path = "india_imputed_full_covariates7.xlsx"
india_extended.to_excel(output_path, index=False)
print(f"✅ Saved file: '{output_path}'")


✅ Imputed: Physicians per 1,000 people
✅ Imputed: Mean targeted age
✅ Imputed: Population estimate
✅ Imputed: Total Fertility rate (2017)
✅ Imputed: Contraception use (updated 2019)
✅ Imputed: HIV Prevalence (in adults)
✅ Imputed: Sexual Initiation age
✅ Imputed: Mean marital age
✅ Imputed: Multiple pregnancies (%age)
✅ Imputed: Male circumcision (WHO 2007)
✅ Imputed: Condom Use
✅ Imputed: Start of Screening coverage (year)
✅ Imputed: HPV vaccination introduction
✅ Imputed: Age adjusted incidence (standardized rates)
✅ Imputed: Number of deaths (all ages, 2021)
✅ Imputed: Mortality rates (age standardized)
✅ Imputed: Human Development Index (HDI)
✅ Imputed: Life expectancy at birth
✅ Imputed: Expected years of schooling
✅ Imputed: Mean years of schooling
✅ Imputed: Gross national income (GNI) per capita
✅ Imputed: Incidence of TB
✅ Imputed: Diabetes Prevalence
✅ Imputed: HPV Vaccine
✅ Imputed: Coverage ever screened of women 30 - 49 years (%)
✅ Imputed: Coverage in last 5 years of wome