In [3]:
import os
import re

import joblib
import numpy as np
import openpyxl
import pandas as pd
import plotly.graph_objects as go
from scipy.interpolate import interp1d
from sklearn.linear_model import LinearRegression

In [12]:
def load_and_process_files(boys_file, girls_file, is_age_based=False, data_dir="data"):
    """Load the boys' and girls' WHO Excel tables and stack them into one frame.

    Parameters
    ----------
    boys_file, girls_file : str
        Workbook file names, resolved relative to ``data_dir``.
    is_age_based : bool, default False
        When True, an ``Agemos`` column is added by dividing the first column
        by 30.4375 (the same days-per-month divisor the rest of this notebook
        uses), and ``Agemos`` is returned as the metric column name.
        NOTE(review): this assumes the first column holds age in days - confirm
        against the workbooks.
    data_dir : str, default "data"
        Directory that contains the workbooks.

    Returns
    -------
    tuple
        ``(combined_frame, metric_col_name)``. The frame holds the boys' rows
        followed by the girls' rows, with a ``Sex`` column set to ``'Male'`` /
        ``'Female'``. The metric column name is ``'Agemos'`` for age-based
        tables, otherwise the name of the first column of the boys' workbook.
    """
    # header=0: the first worksheet row is used as the column names.
    df_b = pd.read_excel(f"{data_dir}/{boys_file}", header=0)
    df_g = pd.read_excel(f"{data_dir}/{girls_file}", header=0)

    # The x-axis column (age or length/height) is taken from the boys' file;
    # the girls' file is presumed to use the same column name.
    primary_col = df_b.columns[0]

    df_b['Sex'] = 'Male'
    df_g['Sex'] = 'Female'

    df_combined = pd.concat([df_b, df_g], ignore_index=True)

    metric_col_name = primary_col
    if is_age_based:
        df_combined['Agemos'] = df_combined[primary_col] / 30.4375
        metric_col_name = 'Agemos'

    return df_combined, metric_col_name

# Load the four WHO reference tables. Only the age-indexed tables (weight-for-age
# and length/height-for-age) request the extra 'Agemos' column.
wfa_df, wfa_metric = load_and_process_files(
    'wfa-boys-percentiles-expanded-tables.xlsx',
    'wfa-girls-percentiles-expanded-tables.xlsx',
    is_age_based=True,
)
print(f"Loaded Weight-for-Age data successfully. Shape: {wfa_df.shape}")

lhfa_df, lhfa_metric = load_and_process_files(
    'lhfa-boys-percentiles-expanded-tables.xlsx',
    'lhfa-girls-percentiles-expanded-tables.xlsx',
    is_age_based=True,
)
print(f"Loaded Length/Height-for-Age data successfully. Shape: {lhfa_df.shape}")

# The weight-for-length / weight-for-height tables keep their first column as the metric.
wfl_df, wfl_metric = load_and_process_files(
    'wfl-boys-percentiles-expanded-tables.xlsx',
    'wfl-girls-percentiles-expanded-tables.xlsx',
)
print(f"Loaded Weight-for-Length data successfully. Shape: {wfl_df.shape}")

wfh_df, wfh_metric = load_and_process_files(
    'wfh-boys-percentiles-expanded-tables.xlsx',
    'wfh-girls-percentiles-expanded-tables.xlsx',
)
print(f"Loaded Weight-for-Height data successfully. Shape: {wfh_df.shape}")

Loaded Weight-for-Age data successfully. Shape: (3714, 21)
Loaded Length/Height-for-Age data successfully. Shape: (3714, 21)
Loaded Weight-for-Length data successfully. Shape: (1302, 20)
Loaded Weight-for-Height data successfully. Shape: (1102, 20)


In [13]:
def create_and_save_interpolators(df, metric_col, model_name, model_dir='models'):
    """Build one linear interpolator per sex and percentile column, then save them.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference table with a ``Sex`` column (``'Male'`` / ``'Female'``), the
        ``metric_col`` column, and percentile columns whose names are strings
        starting with ``'P'``.
    metric_col : str
        Column used as the interpolation x-axis.
    model_name : str
        Prefix of the output file: ``<model_dir>/<model_name>_interpolators.joblib``.
    model_dir : str, default 'models'
        Output directory; created if it does not exist yet.

    Notes
    -----
    The saved object is a dict ``{sex: {percentile_column: interp1d}}``. Each
    interpolator extrapolates linearly outside the tabulated range instead of
    raising.
    """
    p_cols = [col for col in df.columns if isinstance(col, str) and col.startswith('P')]

    interpolators = {'Male': {}, 'Female': {}}
    for gender_str in interpolators:
        df_gender = df[df['Sex'] == gender_str].sort_values(by=metric_col)
        x_values = df_gender[metric_col]
        for col in p_cols:
            interpolators[gender_str][col] = interp1d(
                x_values,
                df_gender[col],
                bounds_error=False,
                fill_value="extrapolate",
            )

    # joblib.dump does not create missing parent directories, so do it here;
    # otherwise a fresh checkout without the output folder fails on the first save.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, f'{model_name}_interpolators.joblib')
    joblib.dump(interpolators, model_path)
    print(f"Successfully saved {model_name} model.")

# Everything below writes into 'models/'; create it up front so a fresh checkout
# does not fail on the first joblib.dump.
os.makedirs('models', exist_ok=True)

# Create interpolator models
create_and_save_interpolators(wfa_df, wfa_metric, 'wfa')
create_and_save_interpolators(lhfa_df, lhfa_metric, 'lhfa')
create_and_save_interpolators(wfl_df, wfl_metric, 'wfl')
create_and_save_interpolators(wfh_df, wfh_metric, 'wfh')

# Create and save the predictive model
# Pick the percentile column whose numeric label is closest to 50 as the "median" curve.
p_cols_wfa = [col for col in wfa_df.columns if isinstance(col, str) and col.startswith('P')]
p_vals_wfa_numeric = [float(re.findall(r'(\d+\.?\d*)', p_col)[0]) for p_col in p_cols_wfa]
median_idx = (np.abs(np.array(p_vals_wfa_numeric) - 50)).argmin()
median_p_col = p_cols_wfa[median_idx]
print(f"Using '{median_p_col}' as the median for the predictive model.")

# Number of rows to look ahead for the target value.
# NOTE(review): assumes one row per day, ordered by age within each sex, so
# 182 rows is roughly six months (182 / 30.4375) - confirm against the table.
HORIZON_ROWS = 182

# Training frame: age, median weight now, and median weight HORIZON_ROWS later.
# The last HORIZON_ROWS rows of each sex have no target and are dropped.
df_wfa_avg = (
    wfa_df[['Agemos', median_p_col, 'Sex']]
    .rename(columns={median_p_col: 'P50'})
    .assign(Weight_T_plus_6=lambda frame: frame.groupby('Sex')['P50'].shift(-HORIZON_ROWS))
    .dropna()
)

X = df_wfa_avg[['Agemos', 'P50']]
y = df_wfa_avg['Weight_T_plus_6']

lr_model = LinearRegression()
lr_model.fit(X, y)

model_path = 'models/growth_predictor.joblib'
joblib.dump(lr_model, model_path)
print("Successfully saved predictive model.")

Successfully saved wfa model.
Successfully saved lhfa model.
Successfully saved wfl model.
Successfully saved wfh model.
Using 'P50' as the median for the predictive model.
Successfully saved predictive model.


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import plotly.graph_objects as go

# --- 1. Get Predictions from the Model ---
# The model is scored on the very rows it was fitted on, so the numbers below
# describe goodness of fit rather than performance on unseen data.
X_eval = df_wfa_avg[['Agemos', 'P50']]
y_true = df_wfa_avg['Weight_T_plus_6']
y_pred = lr_model.predict(X_eval)


# --- 2. Calculate and Print the Regression Report ---
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("--- Regression Model Evaluation Report ---")
print(f"R-squared (R²):           {r2:.4f}")
print(f"Mean Absolute Error (MAE):  {mae:.4f} kg")
print(f"Mean Squared Error (MSE):   {mse:.4f}")


# --- 3. Plot the Graph: Actual vs. Predicted Values ---
# Endpoints of the y = x reference line, spanning the observed target range.
fit_range = [y_true.min(), y_true.max()]

fig = go.Figure(data=[
    # One marker per row: actual target on x, model output on y.
    go.Scatter(
        x=y_true,
        y=y_pred,
        mode='markers',
        name='Actual vs. Predicted',
        marker=dict(color='blue', opacity=0.7),
    ),
    # Points on this dashed diagonal would be perfect predictions.
    go.Scatter(
        x=fit_range,
        y=fit_range,
        mode='lines',
        name='Perfect Fit Line',
        line=dict(color='red', dash='dash'),
    ),
])

fig.update_layout(
    title='<b>Model Performance: Actual vs. Predicted Future Weight</b>',
    xaxis_title='Actual Future Weight (kg)',
    yaxis_title='Predicted Future Weight (kg)',
    template='plotly_white',
)

fig.show()

--- Regression Model Evaluation Report ---
R-squared (R²):           0.9962
Mean Absolute Error (MAE):  0.1449 kg
Mean Squared Error (MSE):   0.0347


In [15]:
# This cell is for testing the models and generating all reports/plots in the notebook

# --- 1. Define Helper Functions ---
# We need to recreate the percentile calculation logic here for testing
def _percentile_label_to_value(label):
    """Turn a percentile column name such as 'P3' or 'P999' into its numeric percentile.

    NOTE(review): the WHO expanded tables are understood to name the 0.1st and
    99.9th percentiles 'P01' and 'P999' - confirm against the workbook headers.
    Reading those digits literally gives 1 and 999, which duplicates 'P1' and
    pushes results above 100, so they are rescaled here.
    """
    digits = re.findall(r'(\d+\.?\d*)', label)[0]
    value = float(digits)
    if '.' in digits:
        return value
    if len(digits) > 1 and digits.startswith('0'):
        # Leading zero means "zero point ...": 'P01' -> 0.1
        return float(f"0.{digits[1:]}")
    if value > 100:
        # More than two digits above 100 means an implied decimal: 'P999' -> 99.9
        return value / 10 ** (len(digits) - 2)
    return value


def get_percentile_notebook(metric_value, measurement, gender, model_dict):
    """Estimate the percentile of ``measurement`` at ``metric_value`` for one sex.

    Parameters
    ----------
    metric_value : float
        Position on the reference x-axis, passed to every interpolator.
    measurement : float
        The observed value to place among the percentile curves.
    gender : str
        Key into ``model_dict`` (``'Male'`` / ``'Female'`` elsewhere in this notebook).
    model_dict : dict
        ``{gender: {percentile_column_name: callable}}``; each callable maps
        ``metric_value`` to that curve's value.

    Returns
    -------
    float
        Percentile rounded to two decimals, linearly interpolated between the
        neighbouring curves. Measurements outside all curves are clamped to the
        lowest / highest available percentile label.

    Raises
    ------
    IndexError
        If a column name contains no digits.
    """
    curves = model_dict[gender]
    p_labels = [_percentile_label_to_value(name) for name in curves]
    # float() collapses the 0-d arrays returned by interp1d into plain scalars.
    p_values = [float(curve(metric_value)) for curve in curves.values()]

    # np.interp needs ascending x-coordinates, so sort by curve value.
    sorted_pairs = sorted(zip(p_values, p_labels))
    p_values_sorted, p_labels_sorted = zip(*sorted_pairs)

    percentile = np.interp(measurement, p_values_sorted, p_labels_sorted)
    return round(float(percentile), 2)

def get_classification(percentile):
    """Map a percentile to a risk label.

    Below 3 is "At Risk / Low", 3 through 97 inclusive is "Normal", and
    anything else is "At Risk / High".
    """
    if percentile < 3:
        label = "At Risk / Low"
    elif percentile <= 97:
        label = "Normal"
    else:
        label = "At Risk / High"
    return label

def plot_chart_notebook(df, x_metric, y_metric, x_val, y_val, gender, title):
    """Show the reference percentile curves for one sex with the child's point on top.

    Draws whichever of P3, P15, P50, P85 and P97 exist in ``df`` as dashed
    light-gray lines against ``x_metric``, then marks ``(x_val, y_val)`` with a
    red star. ``x_metric`` and ``y_metric`` are also used verbatim as the axis
    titles, and ``title`` (plus the sex) as the chart title.
    """
    reference_cols = ('P3', 'P15', 'P50', 'P85', 'P97')
    rows = df.loc[df['Sex'] == gender]

    # One trace per available reference curve, in the table's own column order.
    traces = [
        go.Scatter(
            x=rows[x_metric],
            y=rows[name],
            mode='lines',
            name=f'{name} Curve',
            line_dash='dash',
            line_color='lightgray',
        )
        for name in df.columns
        if name in reference_cols
    ]

    # The child's measurement goes last so it is drawn above the curves.
    traces.append(
        go.Scatter(
            x=[x_val],
            y=[y_val],
            mode='markers',
            name='Child',
            marker=dict(color='red', size=12, symbol='star'),
        )
    )

    chart = go.Figure(data=traces)
    chart.update_layout(
        title=f'<b>{title} ({gender})</b>',
        xaxis_title=x_metric,
        yaxis_title=y_metric,
        template='plotly_white',
    )
    chart.show()


# --- 2. Define a Test Case ---
test_gender = 'Female'
test_age_days = 450 # ~15 months
test_weight_kg = 9.5
test_height_cm = 78.0
test_age_months = test_age_days / 30.4375

# Age (in days) below which the weight-for-length table is used instead of
# weight-for-height.
WFL_MAX_AGE_DAYS = 730

print("---  एनालिटिक्स रिपोर्ट (Analytics Report) ---")
print(f"Sample Case: Gender={test_gender}, Age={test_age_days} days, Weight={test_weight_kg}kg, Height={test_height_cm}cm\n")

# --- 3. Load All Models ---
# NOTE: joblib files are pickles. Only load artifacts produced by this notebook,
# never files from an untrusted source.
models = {
    'wfa': joblib.load('models/wfa_interpolators.joblib'),
    'lhfa': joblib.load('models/lhfa_interpolators.joblib'),
    'wfl': joblib.load('models/wfl_interpolators.joblib'),
    'wfh': joblib.load('models/wfh_interpolators.joblib'),
    'predictor': joblib.load('models/growth_predictor.joblib')
}
print("All models loaded successfully.\n")


# --- 4. Generate Reports and Plots ---

# Report 1: Weight-for-Age
print("--- Weight-for-Age (WFA) Report ---")
wfa_perc = get_percentile_notebook(test_age_months, test_weight_kg, test_gender, models['wfa'])
wfa_class = get_classification(wfa_perc)
print(f"Percentile: {wfa_perc}%")
print(f"Classification: {wfa_class}")
plot_chart_notebook(wfa_df, 'Agemos', 'P50', test_age_months, test_weight_kg, test_gender, "Weight-for-Age")

# Report 2: Length/Height-for-Age
print("\n--- Length/Height-for-Age (LHFA) Report ---")
lhfa_perc = get_percentile_notebook(test_age_months, test_height_cm, test_gender, models['lhfa'])
lhfa_class = get_classification(lhfa_perc)
print(f"Percentile: {lhfa_perc}%")
print(f"Classification: {lhfa_class}")
plot_chart_notebook(lhfa_df, 'Agemos', 'P50', test_age_months, test_height_cm, test_gender, "Length/Height-for-Age")

# Report 3: Weight-for-Length/Height
print("\n--- Weight-for-Length/Height (WFLH) Report ---")
if test_age_days < WFL_MAX_AGE_DAYS:
    wflh_model_key = 'wfl'
    wflh_df_to_plot = wfl_df
    wflh_metric_col = wfl_metric
    wflh_title = "Weight-for-Length"
    print("Info: Using Weight-for-Length data (child is under 2 years).")
else:
    wflh_model_key = 'wfh'
    wflh_df_to_plot = wfh_df
    wflh_metric_col = wfh_metric
    # The chart title must follow the table actually used; it was previously
    # always "Weight-for-Length", even on this branch.
    wflh_title = "Weight-for-Height"
    print("Info: Using Weight-for-Height data (child is 2 years or older).")

wflh_perc = get_percentile_notebook(test_height_cm, test_weight_kg, test_gender, models[wflh_model_key])
wflh_class = get_classification(wflh_perc)
print(f"Percentile: {wflh_perc}%")
print(f"Classification: {wflh_class}")
plot_chart_notebook(wflh_df_to_plot, wflh_metric_col, 'P50', test_height_cm, test_weight_kg, test_gender, wflh_title)

# Report 4: Predictive Model
print("\n--- Future Growth Prediction Report ---")
# The regressor was fitted on a DataFrame with columns ['Agemos', 'P50'], so the
# input uses the same column names; a bare nested list triggers scikit-learn's
# "X does not have valid feature names" warning. The child's own weight is
# supplied in the 'P50' slot, as before.
predictor_input = pd.DataFrame([[test_age_months, test_weight_kg]], columns=['Agemos', 'P50'])
predicted_weight = models['predictor'].predict(predictor_input)[0]
print(f"Predicted weight at {test_age_months + 6:.1f} months: {predicted_weight:.2f} kg")

---  एनालिटिक्स रिपोर्ट (Analytics Report) ---
Sample Case: Gender=Female, Age=450 days, Weight=9.5kg, Height=78.0cm

All models loaded successfully.

--- Weight-for-Age (WFA) Report ---
Percentile: 48.18%
Classification: Normal



--- Length/Height-for-Age (LHFA) Report ---
Percentile: 59.92%
Classification: Normal



--- Weight-for-Length/Height (WFLH) Report ---
Info: Using Weight-for-Length data (child is under 2 years).
Percentile: 41.03%
Classification: Normal



--- Future Growth Prediction Report ---
Predicted weight at 20.8 months: 11.07 kg



X does not have valid feature names, but LinearRegression was fitted with feature names

