In [1]:
!aws s3 cp s3://wellifyyyy/model-artifacts/local-training/prediction_model.joblib ./
!aws s3 cp s3://wellifyyyy/model-artifacts/local-training/recommendation_model.joblib ./
!aws s3 cp s3://wellifyyyy/model-artifacts/local-training/model1_features.joblib ./


download: s3://wellifyyyy/model-artifacts/local-training/prediction_model.joblib to ./prediction_model.joblib
download: s3://wellifyyyy/model-artifacts/local-training/recommendation_model.joblib to ./recommendation_model.joblib
download: s3://wellifyyyy/model-artifacts/local-training/model1_features.joblib to ./model1_features.joblib


In [22]:
import os
import json
import joblib
import pandas as pd
import boto3
from datetime import datetime
import numpy as np
import uuid
import io
from decimal import Decimal 

# ==============================================================================
#  !!! CRITICAL SETUP STEPS !!!
#
#  1. The prediction_model.joblib, recommendation_model.joblib, and model1_features.joblib
#     files MUST be downloaded to the current directory (./) for this script to run.
#  2. The script ASSUMES your environment has IAM permissions to access S3
#     (s3:GetObject, s3:ListBucket) and DynamoDB (dynamodb:PutItem).
#
# ==============================================================================

# --- CONFIGURATION & PATHS ---
PREDICTION_MODEL_PATH = "./prediction_model.joblib"
RECOMMENDATION_MODEL_PATH = "./recommendation_model.joblib"
MODEL1_FEATURES_PATH = "./model1_features.joblib"
DYNAMODB_TABLE_NAME = "Final_predictions"

# --- S3 INPUT CONFIGURATION ---
S3_BUCKET_NAME = "wellifyyyy"
S3_PREFIX = "processed_for_sagemaker/"

# --- CONSTANTS ---
FALLBACK_EMAIL = "missing_email@wellifyy.com"

# The 24 definitive input feature names
EXPECTED_INPUT_FEATURES = [
    'age', 'gender', 'height_cm', 'weight_kg',
    'Smoking_status', 'Alcohol_consumption', 'physical_activity_level',
    'Sleep_hours_per_day', 'Stress_level', 'Diet_quality',
    'cholesterol_level', 'glucose_level', 'blood_pressure_sys',
    'blood_pressure_dia', 'region', 'BMI_derived',
    'Obesity_category', 'Hypertension_flag', 'Prediabetes_Diabetes_flag',
    'Physical_inactivity_flag', 'Cardiac_risk_index', 'Stress_index',
    'Diet_risk_index', 'Lung_risk_index'
]

# The 4 target columns the model is *erroneously* demanding in the input
TARGET_COLUMN_NAMES = [
    'Heart_Score_pct',
    'Hypertension_Risk_Score_pct',
    'Diabetes_Risk_Score_pct',
    'Lung_Issue_Score_pct'
]

# The full 28-column list the model artifact expects
FULL_EXPECTED_COLUMNS = EXPECTED_INPUT_FEATURES + TARGET_COLUMN_NAMES


# --- UTILITY FUNCTIONS ---

def convert_floats_to_decimals(obj):
    """
    Recursively converts float values within a dict or list to Decimal objects, 
    required for DynamoDB Number types. This function correctly skips strings 
    (like those in the recommendation list), ensuring they are preserved.
    """
    if isinstance(obj, float):
        # Convert float to Decimal via string representation for precision
        try:
            return Decimal(str(obj))
        except:
            return Decimal("0.0")
    elif isinstance(obj, dict):
        return {k: convert_floats_to_decimals(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        # Recursively process elements in the list
        return [convert_floats_to_decimals(elem) for elem in obj]
    else:
        return obj


def load_all_csv_data(bucket_name, s3_prefix):
    """
    Loads all rows from all CSV files, extracts metadata, and prepares feature payloads.
    """
    all_payloads = []
    skipped_records_count = 0

    RENAME_MAP = {
        'smoking_status': 'Smoking_status',
        'alcohol_consumption': 'Alcohol_consumption',
        'sleep_hours_per_day': 'Sleep_hours_per_day',
        'stress_level': 'Stress_level',
        'diet_quality': 'Diet_quality',
    }

    STRING_OR_CATEGORICAL_KEYS = [
        'gender', 'region', 'Diet_quality', 'Smoking_status',
        'Alcohol_consumption', 'Obesity_category', 'Hypertension_flag',
        'Prediabetes_Diabetes_flag', 'Physical_inactivity_flag'
    ]

    try:
        s3 = boto3.client('s3')
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)
        all_csv_keys = [item['Key'] for item in response.get('Contents', []) if item['Key'].endswith('.csv')]

        if not all_csv_keys:
            print("Warning: No CSV files found.")
            return all_payloads, skipped_records_count

        master_df = pd.DataFrame()
        for key in all_csv_keys:
            obj = s3.get_object(Bucket=bucket_name, Key=key)
            csv_data_body = obj['Body'].read().decode('utf-8')
            data_buffer = io.StringIO(csv_data_body)
            df_file = pd.read_csv(data_buffer, dtype={
                'submissionId': str,
                'email': str,
                'timestamp': str
            })
            master_df = pd.concat([master_df, df_file], ignore_index=True)

        for index, row in master_df.iterrows():
            try:
                record = row.to_dict()
                features = {}

                # --- EXTRACT CRITICAL METADATA ---
                submission_id = str(record.pop('submissionId', uuid.uuid4()))
                user_email = str(record.pop('email', FALLBACK_EMAIL))
                original_timestamp = str(record.pop('timestamp', datetime.utcnow().isoformat()))

                # Apply renaming
                for csv_key, model_key in RENAME_MAP.items():
                    if csv_key in record:
                        features[model_key] = record.pop(csv_key)

                features.update(record)

                # Cast numerical features to float explicitly
                for key, value in features.items():
                    if key not in STRING_OR_CATEGORICAL_KEYS:
                        try:
                            features[key] = float(value) if pd.notna(value) else 0.0
                        except (ValueError, TypeError):
                            features[key] = 0.0

                # Ensure categorical features are strings
                for key in STRING_OR_CATEGORICAL_KEYS:
                    if key in features:
                        features[key] = str(features[key])

                payload = {
                    "metadata": {
                        "submission_id": submission_id,
                        "email": user_email,
                        "timestamp": original_timestamp
                    },
                    "features": features
                }
                all_payloads.append(payload)

            except Exception as row_error:
                print(f"⚠️ SKIPPING RECORD at index {index}: {row_error}")
                skipped_records_count += 1

        return all_payloads, skipped_records_count

    except Exception as e:
        print(f"CRITICAL ERROR: Failed to process CSV data: {e}")

    return all_payloads, skipped_records_count


def _load_models(pred_path, rec_path, features_path):
    """Loads both models and the feature list from the specified local paths."""
    print("Loading models and feature list...")
    prediction_model, recommendation_model, model1_features = None, None, None

    try:
        prediction_model = joblib.load(pred_path)
    except FileNotFoundError:
        print(f"CRITICAL ERROR: Prediction model not found at {pred_path}")

    try:
        recommendation_model = joblib.load(rec_path)
    except FileNotFoundError:
        print(f"CRITICAL ERROR: Recommendation model not found at {rec_path}")

    try:
        model1_features = joblib.load(features_path)
    except FileNotFoundError:
        print(f"CRITICAL ERROR: Feature list not found at {features_path}. Using hardcoded fallback list.")

    if not all([prediction_model, recommendation_model]):
        print("Models failed to load. Script cannot proceed.")
        return None, None, None

    print("Models loaded successfully.")
    return prediction_model, recommendation_model, model1_features


def run_inference(payload, pred_model, rec_model, full_expected_columns):
    """
    Performs actual inference and constructs a fully flattened DynamoDB record,
    keeping recommendations as a list of structured objects.
    """

    submission_id = payload['metadata'].get('submission_id')
    user_email = payload['metadata'].get('email')
    original_timestamp = payload['metadata'].get('timestamp')
    input_features = payload['features']

    # 1. Prepare the 28-column DataFrame for model input
    row_data = {}
    for column_name in full_expected_columns:
        if column_name in input_features:
            row_data[column_name] = input_features[column_name]
        elif column_name in TARGET_COLUMN_NAMES:
            row_data[column_name] = 0.0
        else:
            # Fallback for unexpected missing input features
            row_data[column_name] = 0.0 

    X_full_input = pd.DataFrame([row_data], columns=full_expected_columns)


    # 2. --- ACTUAL MODEL PREDICTION LOGIC ---

    # 2a. Prediction Model Call (Multi-output)
    try:
        # Note: The model expects 28 columns (4 target columns must be present, though unused)
        raw_predictions = pred_model.predict(X_full_input)[0] 

        heart_risk = float(raw_predictions[0]) * 100
        hypertension_risk = float(raw_predictions[1]) * 100
        diabetes_risk = float(raw_predictions[2]) * 100
        respiratory_risk = float(raw_predictions[3]) * 100

        max_risk = max(heart_risk, hypertension_risk, diabetes_risk, respiratory_risk)
        risk_level = "High" if max_risk > 65 else ("Medium" if max_risk > 30 else "Low")

    except Exception as e:
        raise Exception(f"Model Prediction Failed for {submission_id}: {e}")

    # 2b. Recommendation Model Call & Generation (ENHANCED LOGIC)
    
    # Run the recommendation model (even if it's a dummy/pipeline step)
    rec_model.predict(X_full_input)

    final_recommendations_list = []
    
    # Extract key input values for easier logic
    stress_level = float(input_features.get('Stress_level', 0.0))
    sleep_hours = float(input_features.get('Sleep_hours_per_day', 0.0))
    physical_activity_level = float(input_features.get('physical_activity_level', 0.0))
    smoking_status = input_features.get('Smoking_status', '').lower()
    bmi_derived = float(input_features.get('BMI_derived', 0.0))
    glucose_level = float(input_features.get('glucose_level', 0.0))
    diet_quality = input_features.get('Diet_quality', '').lower()
    hypertension_flag = int(input_features.get('Hypertension_flag', 0))

    
    # --- RULE SET 1: HIGH-SEVERITY RECOMMENDATIONS (Based on Scores & Flags) ---
    
    # 1. Cardiovascular / Blood Pressure Focus
    if heart_risk > 50 or hypertension_risk > 50 or hypertension_flag == 1:
        final_recommendations_list.append({
            "type": "Cardiovascular", 
            "text": "Your cardiovascular risk is elevated. **Prioritize sodium reduction** and aim for 150 minutes of moderate-intensity cardio each week (e.g., brisk walking)."
        })

    # 2. Metabolic / Diabetes Focus
    if diabetes_risk > 45 or glucose_level >= 100:
        final_recommendations_list.append({
            "type": "Metabolic Health", 
            "text": "**Monitor blood sugar closely** and reduce consumption of refined carbohydrates and sugary drinks. Incorporate high-fiber foods (beans, whole grains)."
        })
        
    # 3. Respiratory / Smoking Cessation
    if respiratory_risk > 40 and 'never' not in smoking_status:
        final_recommendations_list.append({
            "type": "Respiratory Health", 
            "text": "Your lung health shows significant risk. **Immediate cessation of smoking/vaping is paramount.** Seek professional resources to help quit."
        })
    elif 'current' in smoking_status or 'former' in smoking_status:
         final_recommendations_list.append({
            "type": "Respiratory Health", 
            "text": "**Even occasional smoking increases risk.** Focus on breathing exercises and minimizing exposure to pollutants."
        })


    # --- RULE SET 2: LIFESTYLE & WELLNESS IMPROVEMENTS (Based on Features) ---

    # 4. Weight Management
    if bmi_derived >= 30: # Obese
        final_recommendations_list.append({
            "type": "Weight Management", 
            "text": "Your **BMI indicates obesity**. Consult with a doctor or registered dietitian to create a sustainable, calorie-managed plan focusing on whole foods."
        })
    elif bmi_derived >= 25: # Overweight
        final_recommendations_list.append({
            "type": "Weight Management", 
            "text": "Focus on **portion control** and ensure 2-3 structured resistance training sessions weekly to build muscle and boost metabolism."
        })

    # 5. Physical Activity
    if physical_activity_level < 2.5: # Highly inactive
        final_recommendations_list.append({
            "type": "Fitness", 
            "text": "Your activity level is low. Start small: **aim for 10-minute bursts of walking** three times a day. Gradually increase to 30 minutes daily."
        })

    # 6. Stress & Sleep
    if stress_level >= 7 or sleep_hours < 6.5:
        final_recommendations_list.append({
            "type": "Mental Wellness", 
            "text": "**Improve sleep hygiene** (consistent bedtime, cool dark room) and dedicate 15 minutes daily to a stress-reducing activity like meditation or deep breathing."
        })
        
    # 7. Diet Quality
    if diet_quality == 'poor':
        final_recommendations_list.append({
            "type": "Nutrition", 
            "text": "**Significantly improve diet quality.** Focus on eating a colorful variety of vegetables and limiting ultra-processed foods."
        })


    # --- RULE SET 3: FALLBACK ---
    if not final_recommendations_list:
        final_recommendations_list.append({
            "type": "Wellness", 
            "text": "Overall excellent health profile. Maintain your current balanced lifestyle and schedule your annual health screenings."
        })


    # 3. Final DynamoDB Record Structure (FULLY FLATTENED except for the recommendation list)
    current_utc_time = datetime.utcnow().isoformat()
    
    dynamodb_record = {
        # --- METADATA ---
        "submissionId": submission_id,
        "email": user_email,
        "timestamp": original_timestamp,
        "processed_timestamp": current_utc_time,
        
        # --- ALL 24 INPUT FEATURES ---
    }
    dynamodb_record.update(input_features)
    
    # --- ALL 5 PREDICTION FIELDS (FLATTENED) ---
    dynamodb_record.update({
        "risk_level": risk_level,
        "heart_risk_score_pct": f"{heart_risk:.2f}",
        "hypertension_risk_score_pct": f"{hypertension_risk:.2f}",
        "diabetes_risk_score_pct": f"{diabetes_risk:.2f}",
        "respiratory_risk_score_pct": f"{respiratory_risk:.2f}",
    })
    
    # --- RECOMMENDATIONS (LIST OF MAPS - stored under "recommendations_list") ---
    dynamodb_record["recommendations_list"] = final_recommendations_list


    # 4. Dashboard Output Structure (For sample summary display)
    dashboard_output = {
        "metadata": payload['metadata'],
        "health_scores": {
            "risk_level": risk_level,
            "heart_risk_score_percent": f"{heart_risk:.2f}%",
            "hypertension_risk_score_percent": f"{hypertension_risk:.2f}%",
            "diabetes_risk_score_percent": f"{diabetes_risk:.2f}%",
            "respiratory_risk_score_percent": f"{respiratory_risk:.2f}%"
        },
        "recommendations": final_recommendations_list
    }

    return dynamodb_record, dashboard_output


def _save_to_dynamodb(record, table_name):
    """Saves a single record to DynamoDB after converting floats to Decimals."""
    try:
        # Use a high-level resource object which handles Python types automatically
        dynamodb = boto3.resource('dynamodb')
        table = dynamodb.Table(table_name)

        # CRITICAL: Convert all Python floats (for numerical features) to Decimal
        processed_record = convert_floats_to_decimals(record)

        if not processed_record.get('submissionId'):
             print(f"❌ DYNAMODB WRITE ERROR: submissionId is missing or empty for record. Skipping write.")
             return False

        table.put_item(Item=processed_record)
        return True
    except Exception as e:
        print(f"❌ DYNAMODB WRITE ERROR for {record.get('submissionId', 'UNKNOWN')}: {e}")
        return False


if __name__ == "__main__":

    # 1. Load Models and Feature List
    prediction_model, recommendation_model, model1_features_list = _load_models(
        PREDICTION_MODEL_PATH,
        RECOMMENDATION_MODEL_PATH,
        MODEL1_FEATURES_PATH
    )

    # 2. Define the definitive feature list (Use global FULL_EXPECTED_COLUMNS)
    print(f"Input DataFrame structure FORCED (Total 28 columns): {FULL_EXPECTED_COLUMNS}")


    # 3. Load ALL records from ALL S3 CSVs under the prefix
    if prediction_model is not None and recommendation_model is not None:
        all_inference_payloads, skipped_load_count = load_all_csv_data(S3_BUCKET_NAME, S3_PREFIX)

        if not all_inference_payloads:
            print("\n*** BATCH INFERENCE FAILED. No valid records processed. ***")
        else:
            dynamodb_writes_successful = 0
            last_dashboard_output = None

            print(f"\nTotal Payloads Ready for Inference: {len(all_inference_payloads)}")
            print("Starting batch inference and DynamoDB upload...")

            for payload in all_inference_payloads:
                safe_id = payload.get('metadata', {}).get('submission_id', 'UNKNOWN_ID')
                try:
                    dynamodb_record, dashboard_output = run_inference(
                        payload,
                        prediction_model,
                        recommendation_model,
                        FULL_EXPECTED_COLUMNS,
                    )

                    if _save_to_dynamodb(dynamodb_record, DYNAMODB_TABLE_NAME):
                        dynamodb_writes_successful += 1
                        last_dashboard_output = dashboard_output

                except Exception as inference_error:
                    print(f"\n❌ INFERENCE/MODEL ERROR for submissionId: {safe_id}: {inference_error}")


            # 4. Print Summary and Sample Output
            print("\n" + "="*70)
            print("                         BATCH INFERENCE SUMMARY")
            print("="*70)
            print(f"➡️ Records Loaded from S3: {len(all_inference_payloads) + skipped_load_count}")
            print(f"➡️ Records Skipped During Load (Bad Data): {skipped_load_count}")
            print(f"✅ Payloads Sent for Inference: {len(all_inference_payloads)}")
            print(f"✅ Records Successfully Saved to DynamoDB: {dynamodb_writes_successful}")
            print("="*70)

            if last_dashboard_output:
                print("\nSAMPLE DASHBOARD-READY API RESPONSE (Last Successful Record):")
                print("----------------------------------------------------------")
                print(json.dumps(last_dashboard_output, indent=4))

            print("\n" + "="*70)


Loading models and feature list...
Models loaded successfully.
Input DataFrame structure FORCED (Total 28 columns): ['age', 'gender', 'height_cm', 'weight_kg', 'Smoking_status', 'Alcohol_consumption', 'physical_activity_level', 'Sleep_hours_per_day', 'Stress_level', 'Diet_quality', 'cholesterol_level', 'glucose_level', 'blood_pressure_sys', 'blood_pressure_dia', 'region', 'BMI_derived', 'Obesity_category', 'Hypertension_flag', 'Prediabetes_Diabetes_flag', 'Physical_inactivity_flag', 'Cardiac_risk_index', 'Stress_index', 'Diet_risk_index', 'Lung_risk_index', 'Heart_Score_pct', 'Hypertension_Risk_Score_pct', 'Diabetes_Risk_Score_pct', 'Lung_Issue_Score_pct']

Total Payloads Ready for Inference: 12
Starting batch inference and DynamoDB upload...

                         BATCH INFERENCE SUMMARY
➡️ Records Loaded from S3: 12
➡️ Records Skipped During Load (Bad Data): 0
✅ Payloads Sent for Inference: 12
✅ Records Successfully Saved to DynamoDB: 12

SAMPLE DASHBOARD-READY API RESPONSE (Last 

In [3]:
pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import joblib
import os
import tabulate

# --- Configuration ---
MODEL_DIR = "model"
PREDICTOR_SCORE_MODEL_PATH = os.path.join(MODEL_DIR, "prediction_scores_model.joblib")
RISK_LEVEL_MODEL_PATH = os.path.join(MODEL_DIR, "risk_level_model.joblib")
RECOMMENDATION_MODEL_PATH = os.path.join(MODEL_DIR, "recommendation_model.joblib")
RECOMMENDATION_ENCODERS_PATH = os.path.join(MODEL_DIR, "recommendation_label_encoders.joblib")

# Define the feature columns used in the training script
PRED_FEATURE_COLS = [
    'Sleep duration', 'Avg Cholesterol', 'Avg Glucose', 'Avg Systolic BP', 'Avg Diastolic BP',
    'Smoking status', 'Alcohol consumption', 'Physical activity level',
    'Stress level', 'Diet quality', 'Family history of chronic disease'
]

REC_FEATURE_COLS = [
    'Hypertension_Risk_Score_pct', 'Diabetes_Risk_Score_pct', 'Lung_Issue_Score_pct',
    'Heart_Score_pct', 'Stress_level', 'Physical_inactivity_flag',
    'Sleep_hours_per_day', 'Diet_quality', 'Smoking_status', 'Alcohol_consumption'
]

# Define the target columns for the prediction score model
PRED_SCORE_TARGET_COLS = [
    'overall_risk_score_pct', 'heart_risk_score_pct',
    'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 'respiratory_risk_score_pct'
]

# Define the target columns for the recommendation model
REC_TARGET_COLS = ['Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec', 'Other_Rec']


def load_models():
    """Loads all trained models and label encoders."""
    try:
        print("Loading trained models and encoders...")
        prediction_scores_model = joblib.load(PREDICTOR_SCORE_MODEL_PATH)
        risk_level_model = joblib.load(RISK_LEVEL_MODEL_PATH)
        recommendation_model = joblib.load(RECOMMENDATION_MODEL_PATH)
        recommendation_label_encoders = joblib.load(RECOMMENDATION_ENCODERS_PATH)
        print("Models loaded successfully.")
        return prediction_scores_model, risk_level_model, recommendation_model, recommendation_label_encoders
    except FileNotFoundError as e:
        print(f"Error: Could not find model file. Please ensure models are saved in the '{MODEL_DIR}' directory.")
        print(f"Missing file: {e}")
        return None, None, None, None


def get_dummy_input_data():
    """
    Creates a dummy DataFrame to simulate new user input.
    In a real application, this would come from an API request.
    """
    print("\n--- Creating Dummy Input Data ---")
    
    # 1. Base inputs needed for Risk Prediction
    risk_input = {
        'Sleep duration': [6.5],
        'Avg Cholesterol': [210],
        'Avg Glucose': [115],
        'Avg Systolic BP': [145],
        'Avg Diastolic BP': [90],
        'Smoking status': ['Current'],
        'Alcohol consumption': ['Heavy'],
        'Physical activity level': ['Low'],
        'Stress level': ['High'],
        'Diet quality': ['Poor'],
        'Family history of chronic disease': ['Yes']
    }
    
    X_pred_new = pd.DataFrame(risk_input)
    
    # Add 'Family history of chronic disease' if missing from new input (as done in training script)
    if 'Family history of chronic disease' not in X_pred_new.columns:
        X_pred_new['Family history of chronic disease'] = 'No'

    return X_pred_new


def generate_predictions(X_pred_new, models, encoders):
    """
    Runs all predictions and aggregates the final output.
    
    Args:
        X_pred_new (pd.DataFrame): The new input data for a user.
        models (tuple): Loaded models (prediction_scores_model, risk_level_model, recommendation_model).
        encoders (dict): Loaded recommendation label encoders.
        
    Returns:
        pd.DataFrame: A DataFrame containing all predicted columns.
    """
    prediction_scores_model, risk_level_model, recommendation_model, recommendation_label_encoders = models
    
    if prediction_scores_model is None:
        return pd.DataFrame()
    
    # --- 1. Risk Score and Level Prediction ---
    print("\n--- 1. Predicting Risk Scores and Level ---")
    
    # Ensure the input features are in the correct order/set for the prediction model
    X_pred_input = X_pred_new[PRED_FEATURE_COLS]
    
    # Predict Risk Scores (RandomForestRegressor)
    y_scores_pred = prediction_scores_model.predict(X_pred_input)
    results_df = pd.DataFrame(y_scores_pred, columns=PRED_SCORE_TARGET_COLS)
    
    # Predict Risk Level (RandomForestClassifier)
    y_level_pred = risk_level_model.predict(X_pred_input)
    results_df['risk_level'] = y_level_pred[0] # Assuming single row input
    
    print(f"Predicted Overall Risk: {results_df['risk_level'].iloc[0]} ({results_df['overall_risk_score_pct'].iloc[0]:.2f}%)")


    # --- 2. Data Preparation for Recommendation Model ---
    print("\n--- 2. Preparing Data for Recommendation Model ---")
    
    # The recommendation model requires specific features, some of which are *derived*
    # from the risk scores and original inputs.
    
    # Extract relevant scores and initial inputs for the recommendation model input format
    rec_input_data = {
        'Hypertension_Risk_Score_pct': results_df['hypertension_risk_score_pct'].iloc[0],
        'Diabetes_Risk_Score_pct': results_df['diabetes_risk_score_pct'].iloc[0],
        'Heart_Score_pct': results_df['heart_risk_score_pct'].iloc[0],
        # Approximating Lung_Issue_Score_pct with respiratory_risk_score_pct
        'Lung_Issue_Score_pct': results_df['respiratory_risk_score_pct'].iloc[0], 
        'Stress_level': X_pred_new['Stress level'].iloc[0],
        'Sleep_hours_per_day': X_pred_new['Sleep duration'].iloc[0],
        'Diet_quality': X_pred_new['Diet quality'].iloc[0],
        'Smoking_status': X_pred_new['Smoking status'].iloc[0],
        'Alcohol_consumption': X_pred_new['Alcohol consumption'].iloc[0],
        # Creating Physical_inactivity_flag (Need logic based on training data - using a simple proxy here)
        'Physical_inactivity_flag': 'Yes' if X_pred_new['Physical activity level'].iloc[0] == 'Low' else 'No'
    }
    
    X_rec_new = pd.DataFrame([rec_input_data], columns=REC_FEATURE_COLS)
    
    # --- 3. Recommendation Prediction ---
    print("--- 3. Predicting Recommendations ---")

    # Predict Recommendations (MultiOutputClassifier)
    # Ensure the input features are in the correct order/set for the recommendation model
    X_rec_input = X_rec_new[REC_FEATURE_COLS]
    y_recs_encoded_pred = recommendation_model.predict(X_rec_input)
    
    # Decode the predictions
    decoded_recs = {}
    for i, col in enumerate(REC_TARGET_COLS):
        le = recommendation_label_encoders[col]
        # Predict always returns an array, take the first (and only) element for single row prediction
        decoded_recs[col] = le.inverse_transform(y_recs_encoded_pred[:, i])[0]
        
    # Append recommendations to the results DataFrame
    rec_df = pd.DataFrame([decoded_recs])
    results_df = pd.concat([results_df, rec_df], axis=1)

    print("Recommendation Predictions:")
    for key, value in decoded_recs.items():
        print(f"  {key}: {value}")
        
    return results_df


if __name__ == "__main__":
    
    # Load all necessary components
    models = load_models()
    if models[0] is None:
        exit() # Exit if model loading failed

    # Get sample input data
    X_pred_new = get_dummy_input_data()
    
    # Generate final predictions
    final_predictions_df = generate_predictions(X_pred_new, models, models[3])
    
    print("\n=======================================================")
    print("           FINAL PREDICTION OUTPUT STRUCTURE")
    print("=======================================================")
    print(final_predictions_df.transpose().to_markdown(numalign="left", stralign="left"))
    print("=======================================================")
    
    # Display the list of columns generated
    print("\nCOLUMNS GENERATED:")
    print(final_predictions_df.columns.tolist())

Loading trained models and encoders...
Models loaded successfully.

--- Creating Dummy Input Data ---

--- 1. Predicting Risk Scores and Level ---
Predicted Overall Risk: High (100.00%)

--- 2. Preparing Data for Recommendation Model ---
--- 3. Predicting Recommendations ---
Recommendation Predictions:
  Exercise_Rec: Commit to at least 30 minutes of moderate exercise, five times a week. Include daily brisk walking or light cardio to manage blood pressure.
  Nutrition_Rec: Dramatically reduce your salt intake. Limit sugar and boost your fiber intake (whole grains, vegetables). Increase your intake of fruits and vegetables and cut back on processed foods.
  Sleep_Rec: No Recommendation
  Stress_Rec: No Recommendation
  Other_Rec: Schedule regular blood pressure checks. Work towards maintaining a healthy, stable weight.

           FINAL PREDICTION OUTPUT STRUCTURE
|                             | 0                                                                                           

In [14]:
import pandas as pd
import numpy as np
import joblib
import os
import boto3

# --- Configuration ---
DATA_BUCKET = "s3://wellifyyyy"
# This is the path to the raw data you want to process
RAW_HEALTH_DATA_PATH = os.path.join(DATA_BUCKET, "processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv")
# This is the output path for the final predictions
OUTPUT_S3_PATH = os.path.join(DATA_BUCKET, "final_predictions/final_predictions.csv")

MODEL_DIR = "model"
PREDICTOR_SCORE_MODEL_PATH = os.path.join(MODEL_DIR, "prediction_scores_model.joblib")
RISK_LEVEL_MODEL_PATH = os.path.join(MODEL_DIR, "risk_level_model.joblib")
RECOMMENDATION_MODEL_PATH = os.path.join(MODEL_DIR, "recommendation_model.joblib")
RECOMMENDATION_ENCODERS_PATH = os.path.join(MODEL_DIR, "recommendation_label_encoders.joblib")

# --- Model Input/Output Definitions (DO NOT CHANGE THESE) ---

# Expected standardized feature names used by your TRAINED Prediction Models
PRED_FEATURE_COLS = [
    'Sleep duration', 'Avg Cholesterol', 'Avg Glucose', 'Avg Systolic BP', 'Avg Diastolic BP',
    'Smoking status', 'Alcohol consumption', 'Physical activity level',
    'Stress level', 'Diet quality', 'Family history of chronic disease'
]

# Expected standardized feature names used by your TRAINED Recommendation Model
REC_FEATURE_COLS = [
    'Hypertension_Risk_Score_pct', 'Diabetes_Risk_Score_pct', 'Lung_Issue_Score_pct',
    'Heart_Score_pct', 'Stress_level', 'Physical_inactivity_flag',
    'Sleep_hours_per_day', 'Diet_quality', 'Smoking_status', 'Alcohol_consumption'
]

# Output columns from the prediction scores model
PRED_SCORE_TARGET_COLS = [
    'overall_risk_score_pct', 'heart_risk_score_pct',
    'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 'respiratory_risk_score_pct'
]

# Output columns from the recommendation model
REC_TARGET_COLS = ['Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec', 'Other_Rec']


def load_models():
    """Loads all trained models and encoders."""
    try:
        print("Loading trained models and encoders...")
        prediction_scores_model = joblib.load(PREDICTOR_SCORE_MODEL_PATH)
        risk_level_model = joblib.load(RISK_LEVEL_MODEL_PATH)
        recommendation_model = joblib.load(RECOMMENDATION_MODEL_PATH)
        recommendation_label_encoders = joblib.load(RECOMMENDATION_ENCODERS_PATH)
        print("Models loaded successfully.")
        return prediction_scores_model, risk_level_model, recommendation_model, recommendation_label_encoders
    except FileNotFoundError as e:
        print(f"Error: Could not find model file. Ensure models are saved in the '{MODEL_DIR}' directory.")
        raise


def map_numeric_to_categorical(series, scale_map):
    """Maps a pandas Series of numerical strings to categorical labels based on a dictionary."""
    # Convert string to float/int for comparison, coercing errors to NaN
    numeric_values = pd.to_numeric(series, errors='coerce')
    
    # Function to apply the mapping
    def mapper(value):
        if pd.isna(value):
            return 'Unknown'
        for category, (low, high) in scale_map.items():
            if low <= value <= high:
                return category
        # Default to the lowest category if outside defined range
        return 'Low'

    return numeric_values.apply(mapper)


def preprocess_data_for_prediction(df):
    """
    Handles column renaming, feature selection, and necessary diagnostic checks 
    to prepare the raw data for model input.
    """
    df_processed = df.copy()

    # --- 1. CRITICAL COLUMN RENAMING ---
    column_mapping = {
        # Numerical Features
        'sleep_hours_per_day': 'Sleep duration',
        'cholesterol_level': 'Avg Cholesterol',
        'glucose_level': 'Avg Glucose',
        'blood_pressure_sys': 'Avg Systolic BP',
        'blood_pressure_dia': 'Avg Diastolic BP',
        
        # Categorical Features
        'smoking_status': 'Smoking status',
        'alcohol_consumption': 'Alcohol consumption',
        'physical_activity_level': 'Physical activity level',
        'stress_level': 'Stress level',
        'diet_quality': 'Diet quality',
    }
    
    valid_mapping = {k: v for k, v in column_mapping.items() if k in df_processed.columns}
    df_processed.rename(columns=valid_mapping, inplace=True)
    
    # Add 'Family history of chronic disease' (missing in raw data)
    if 'Family history of chronic disease' not in df_processed.columns:
        df_processed['Family history of chronic disease'] = 'No'

    # Select only the necessary standardized features
    X_pred_new = df_processed[PRED_FEATURE_COLS].copy()
    
    # --- 2. DIAGNOSTIC CHECKS & IMPUTATION ---
    
    # (Removed verbose diagnostic prints for brevity, keeping only the essential logic)
    
    # --- Mandatory Type Correction and Imputation ---
    num_cols_to_check = ['Sleep duration', 'Avg Cholesterol', 'Avg Glucose', 'Avg Systolic BP', 'Avg Diastolic BP']
    
    # Convert numerics and handle NaNs
    for col in num_cols_to_check:
        X_pred_new[col] = pd.to_numeric(X_pred_new[col], errors='coerce')
    X_pred_new[num_cols_to_check] = X_pred_new[num_cols_to_check].fillna(X_pred_new[num_cols_to_check].mean())
    
    # Fill categorical NaNs with a safe default
    categorical_cols_raw = [col for col in PRED_FEATURE_COLS if col not in num_cols_to_check]
    X_pred_new[categorical_cols_raw] = X_pred_new[categorical_cols_raw].fillna('Unknown')
    
    print("\nData Imputation and Type Coercion Complete.")
    
    # --- 3. CRITICAL FIX: NUMERIC-TO-CATEGORICAL MAPPING ---
    # Convert numerical ratings (which were wrongly being passed as strings '1.0', '3', etc.) 
    # back into categorical labels ('Low', 'Moderate', 'High') expected by the OHE.
    
    print("\n--- CRITICAL FIX: Mapping Numerical Strings to Categorical Labels ---")

    # Define the mapping scale (assuming 1-10 scale for both features)
    SCALE_MAP = {
        'Low': (1.0, 3.9), 
        'Moderate': (4.0, 7.9), 
        'High': (8.0, 10.0)
    }

    # Apply mapping for Physical activity level
    X_pred_new['Physical activity level'] = map_numeric_to_categorical(
        X_pred_new['Physical activity level'], SCALE_MAP
    )
    
    # Apply mapping for Stress level
    X_pred_new['Stress level'] = map_numeric_to_categorical(
        X_pred_new['Stress level'], SCALE_MAP
    )
    
    # The derived feature 'Physical_inactivity_flag' in the prediction function 
    # relies on 'Low' being present, so this mapping fixes the downstream issue as well.
    
    print("Mapped 'Physical activity level' and 'Stress level' to 'Low', 'Moderate', or 'High'.")
    
    # --- 4. FINAL DIAGNOSTIC CHECK: CATEGORICAL VALUES ---
    print("\n--- FINAL DIAGNOSTIC CHECK: CATEGORICAL VALUES ---")
    categorical_cols_to_check = [col for col in PRED_FEATURE_COLS if X_pred_new[col].dtype == 'object']
    
    for col in categorical_cols_to_check:
        unique_values = X_pred_new[col].unique()
        print(f"Unique values in '{col}': {unique_values}")
    
    print("\nIf scores are still 100%, the problem is a label mismatch in a different feature (e.g., 'Smoking status' labels).")
    # -------------------------------------------------------------------------
    
    # Keep essential identifiers for final output
    info_cols = [col for col in ['submissionId', 'email', 'timestamp'] if col in df.columns]
    info_df = df[info_cols].copy()
    
    return X_pred_new, info_df

def calculate_synthetic_risk_scores(X_input):
    """
    DEBUG FUNCTION: Calculates simple, varied risk scores based on health metrics
    to confirm the rest of the pipeline (Risk Level + Recommendation) is functional,
    as the primary prediction scores model appears to be corrupted (all 100%).
    """
    synthetic_df = pd.DataFrame(index=X_input.index)
    
    # Calculate a simple health score based on deviations from ideal
    # A base score of 10 is the lowest risk. Max score is 100.
    base_score = 10 
    
    # --- 1. Hypertension Risk Score ---
    # Max risk if BP > 140/90
    sys_high_risk = np.where(X_input['Avg Systolic BP'] > 140, 40, 0)
    dia_high_risk = np.where(X_input['Avg Diastolic BP'] > 90, 40, 0)
    
    # Calculate score (min 10, max 80 + base = 90)
    htn_score = base_score + sys_high_risk + dia_high_risk
    # Add smaller penalty for pre-hypertension (120-139 / 80-89)
    htn_score += np.where((X_input['Avg Systolic BP'] >= 120) & (X_input['Avg Systolic BP'] <= 140), 10, 0)
    
    synthetic_df['hypertension_risk_score_pct'] = np.clip(htn_score, 10, 100)

    # --- 2. Diabetes Risk Score ---
    # High risk if Glucose > 125 (Diabetic)
    # Moderate risk if 100-125 (Prediabetic)
    
    glucose_risk = np.select(
        [X_input['Avg Glucose'] >= 126, X_input['Avg Glucose'] >= 100],
        [80, 40],
        default=base_score
    )
    synthetic_df['diabetes_risk_score_pct'] = np.clip(glucose_risk, 10, 100)

    # --- 3. Heart Risk Score (uses Cholesterol, BP, Smoking) ---
    # Max risk for high Cholesterol (>240) + Smoking
    
    cholesterol_risk = np.where(X_input['Avg Cholesterol'] >= 240, 40, 
                       np.where(X_input['Avg Cholesterol'] >= 200, 20, 0))
                       
    smoking_risk = np.where(X_input['Smoking status'] != 'never_smoked', 20, 0)
    
    heart_score = synthetic_df['hypertension_risk_score_pct'] * 0.5 + cholesterol_risk + smoking_risk
    synthetic_df['heart_risk_score_pct'] = np.clip(heart_score, 10, 100)

    # --- 4. Respiratory Risk Score (Placeholder for now, use Smoking/Activity) ---
    resp_score = np.where(X_input['Smoking status'] == 'current_smoker', 70, 
                 np.where(X_input['Smoking status'] == 'former_smoker', 40, 15))
    
    synthetic_df['respiratory_risk_score_pct'] = np.clip(resp_score, 10, 100)
    
    # --- 5. Overall Risk Score (Average of the above) ---
    synthetic_df['overall_risk_score_pct'] = synthetic_df[[
        'hypertension_risk_score_pct', 
        'diabetes_risk_score_pct', 
        'heart_risk_score_pct', 
        'respiratory_risk_score_pct'
    ]].mean(axis=1).round(2)
    
    # Rename columns to match the target columns
    synthetic_df = synthetic_df.rename(columns={
        'hypertension_risk_score_pct': PRED_SCORE_TARGET_COLS[2],
        'diabetes_risk_score_pct': PRED_SCORE_TARGET_COLS[3],
        'heart_risk_score_pct': PRED_SCORE_TARGET_COLS[1],
        'respiratory_risk_score_pct': PRED_SCORE_TARGET_COLS[4],
        'overall_risk_score_pct': PRED_SCORE_TARGET_COLS[0],
    })
    
    print("\n[DEBUG] Replacing uniform 100% scores with synthetic scores for testing downstream pipeline.")
    return synthetic_df


def generate_predictions_bulk(X_pred_new, models, encoders):
    """
    Runs all predictions for the entire input dataset.
    """
    prediction_scores_model, risk_level_model, recommendation_model, recommendation_label_encoders = models
    
    N = len(X_pred_new)
    print(f"\n--- Starting Bulk Prediction for {N} Records ---")

    # --- 1. Risk Score and Level Prediction ---
    print("1/3: Predicting Risk Scores...")
    try:
        y_scores_pred = prediction_scores_model.predict(X_pred_new)
        results_df = pd.DataFrame(y_scores_pred, columns=PRED_SCORE_TARGET_COLS, index=X_pred_new.index)
        
        # --- DEBUG CHECK: If all scores are 100 (or near 100), the model is likely broken ---
        # Check if the average of the first four core scores is > 99 for ALL records
        is_broken = (results_df[PRED_SCORE_TARGET_COLS[:4]].mean(axis=1) > 99).all()
        
        if is_broken:
            print("\n!!! WARNING: PREDICTION SCORES UNIFORM 100%. MODEL IS LIKELY BROKEN. !!!")
            print("Switching to synthetic risk scores for debugging Risk Level and Recommendation steps.")
            results_df = calculate_synthetic_risk_scores(X_pred_new)
        # ---------------------------------------------------------------------------------
        
    except Exception as e:
        print("\n!!! CRITICAL PREDICTION ERROR !!!")
        print(f"Prediction failed. Error details: {e}")
        results_df = pd.DataFrame(np.nan, index=X_pred_new.index, columns=PRED_SCORE_TARGET_COLS)
        results_df['risk_level'] = "ERROR"
        return results_df
        
    print("2/3: Predicting Risk Levels...")
    
    # Derive risk level based on the overall risk score (using standard thresholds)
    def derive_risk_level(score):
        if score >= 67:
            return 'High'
        elif score >= 34:
            return 'Moderate'
        else:
            return 'Low'

    results_df['risk_level'] = results_df['overall_risk_score_pct'].apply(derive_risk_level)
    
    print("Risk Score and Level Prediction Complete.")

    # --- 2. Data Preparation for Recommendation Model ---
    print("3/3: Predicting Recommendations...")
    
    # Create the required feature set for the recommendation model (REC_FEATURE_COLS)
    rec_input_df = X_pred_new.reset_index(drop=True).copy()
    results_df_reset = results_df.reset_index(drop=True)
    
    # The recommendation model relies on the scores we just calculated (real or synthetic)
    rec_input_df['Hypertension_Risk_Score_pct'] = results_df_reset['hypertension_risk_score_pct']
    rec_input_df['Diabetes_Risk_Score_pct'] = results_df_reset['diabetes_risk_score_pct']
    rec_input_df['Heart_Score_pct'] = results_df_reset['heart_risk_score_pct']
    rec_input_df['Lung_Issue_Score_pct'] = results_df_reset['respiratory_risk_score_pct']
    
    # Re-use original features and ensure they are strings for OHE in recommendation model
    rec_input_df['Stress_level'] = rec_input_df['Stress level'].astype(str) 
    rec_input_df['Sleep_hours_per_day'] = rec_input_df['Sleep duration'] 
    rec_input_df['Diet_quality'] = rec_input_df['Diet quality'].astype(str)
    rec_input_df['Smoking_status'] = rec_input_df['Smoking status'].astype(str)
    rec_input_df['Alcohol_consumption'] = rec_input_df['Alcohol consumption'].astype(str)
    
    # Create the derived feature 'Physical_inactivity_flag'
    rec_input_df['Physical_inactivity_flag'] = np.where(
        rec_input_df['Physical activity level'] == 'Low', 'Yes', 'No'
    )
    
    X_rec_new = rec_input_df[REC_FEATURE_COLS]
    
    # --- 3. Recommendation Prediction ---
    y_recs_encoded_pred = recommendation_model.predict(X_rec_new)
    
    # Decode the predictions
    decoded_recs = {}
    for i, col in enumerate(REC_TARGET_COLS):
        le = recommendation_label_encoders[col]
        decoded_recs[col] = le.inverse_transform(y_recs_encoded_pred[:, i])
        
    rec_df = pd.DataFrame(decoded_recs, index=X_pred_new.index)
    
    # Final merge
    final_predictions_df = pd.concat([results_df, rec_df], axis=1)

    print("Recommendation Prediction Complete.")
    return final_predictions_df


def output_final_predictions_bulk(df, path):
    """Prints the final DataFrame instead of saving to S3 for debugging."""
    print(f"\n--- FINAL PREDICTIONS FOR DEBUGGING ({len(df)} Records) ---")
    
    # Final list of columns to ensure consistent output order
    final_output_cols = (
        ['submissionId', 'email', 'timestamp'] + PRED_SCORE_TARGET_COLS + 
        ['risk_level'] + REC_TARGET_COLS + PRED_FEATURE_COLS
    )
    
    # Select and reorder columns
    final_cols = [col for col in final_output_cols if col in df.columns]
    
    # Print to console for inspection
    print(df[final_cols].to_markdown())
    print(f"Total {len(df)} records processed and displayed.")


if __name__ == "__main__":
    
    # 0. Load Data and Models
    try:
        raw_df = pd.read_csv(RAW_HEALTH_DATA_PATH)
        print("\n=======================================================")
        print("RAW DATA COLUMNS FROM S3:")
        print(raw_df.columns.tolist())
        print("=======================================================\n")
    except Exception as e:
        print(f"Error loading raw data from S3: {e}")
        exit()

    models = load_models()
    if models[0] is None:
        exit() 

    # 1. Preprocess and Prepare Input (Includes Imputation, Type Checks, and Mappings)
    X_pred_input, info_df = preprocess_data_for_prediction(raw_df)
    
    # 2. Generate Predictions
    predictions_df = generate_predictions_bulk(X_pred_input, models, models[3])
    
    # 3. Assemble Final Output DataFrame
    final_output_df = pd.concat([info_df, predictions_df, X_pred_input], axis=1)
    
    # 4. Output results for debugging (S3 export disabled)
    output_final_predictions_bulk(final_output_df, OUTPUT_S3_PATH)

    print("\n--- Script Complete ---")



RAW DATA COLUMNS FROM S3:
['submissionId', 'email', 'timestamp', 'age', 'gender', 'height_cm', 'weight_kg', 'smoking_status', 'alcohol_consumption', 'physical_activity_level', 'sleep_hours_per_day', 'stress_level', 'diet_quality', 'cholesterol_level', 'glucose_level', 'blood_pressure_sys', 'blood_pressure_dia', 'region', 'BMI_derived', 'Obesity_category', 'Hypertension_flag', 'Prediabetes_Diabetes_flag', 'Physical_inactivity_flag', 'Cardiac_risk_index', 'Stress_index', 'Diet_risk_index', 'Lung_risk_index']

Loading trained models and encoders...
Models loaded successfully.

Data Imputation and Type Coercion Complete.

--- CRITICAL FIX: Mapping Numerical Strings to Categorical Labels ---
Mapped 'Physical activity level' and 'Stress level' to 'Low', 'Moderate', or 'High'.

--- FINAL DIAGNOSTIC CHECK: CATEGORICAL VALUES ---
Unique values in 'Smoking status': ['never_smoked' 'current_smoker' 'former_smoker']
Unique values in 'Alcohol consumption': ['moderate' 'heavy' 'light' 'none']
Uniqu

In [9]:
import pandas as pd
import os

# --- Configuration ---
# 1. PATH TO YOUR CURRENT INPUT DATA (The data yielding 100% risk scores)
INPUT_DATA_PATH = "s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv"

# 2. PATHS TO YOUR ORIGINAL TRAINING DATA FILES
# *** CRITICAL: FINALIZED PATHS ***
# Training file used for prediction scores (as confirmed by your last message)
TRAINING_DATA_PATH_1 = "s3://wellifyyyy/training-data/synthetic_health_inputs_10000.csv" 
# Training file used for recommendations/targets
TRAINING_DATA_PATH_2 = "s3://wellifyyyy/training-data/recommendation_labeled_data.csv" 
# ******************************

# Numerical columns to focus on for risk score comparison
RISK_FEATURES = [
    'cholesterol_level', 'glucose_level', 'blood_pressure_sys', 
    'blood_pressure_dia', 'sleep_hours_per_day'
]

def load_and_analyze_data(path, name):
    """Loads the entire file for accurate statistics, and prints diagnostics."""
    print(f"============================================================")
    print(f"--- Analysis for: {name} ---")
    print(f"Source Path: {path}")
    print(f"============================================================")
    
    try:
        # Load the entire DataFrame for accurate statistics
        df = pd.read_csv(path)
        
        # --- Column Analysis ---
        print("\n[1] Column Names:")
        print(df.columns.tolist())
        
        # --- First 5 Rows ---
        print(f"\n[2] First 5 Rows (Features focused on risk):\n")
        # Select risk features + a few identifiers for context
        display_cols = [col for col in ['submissionId', 'age', 'gender'] + RISK_FEATURES if col in df.columns]
        
        # If no common columns are found, just display the head
        if not display_cols:
            print("No common risk features found. Displaying full head:")
            print(df.head(5).to_markdown(index=False))
        else:
            print(df[display_cols].head(5).to_markdown(index=False))

        # --- Descriptive Statistics (for all numerical columns) ---
        print("\n[3] Summary Statistics (Mean, Min, Max of all numerical columns):\n")
        print(df.describe().to_markdown())
        
    except FileNotFoundError:
        print(f"ERROR: File not found at {path}. Please check the S3 path.")
    except Exception as e:
        print(f"An unexpected error occurred while processing {path}: {e}")

if __name__ == "__main__":
    
    # 1. Analyze the current input data
    load_and_analyze_data(INPUT_DATA_PATH, "CURRENT INPUT DATA (S3)")
    
    # 2. Analyze the first training data file (Prediction Model Source)
    load_and_analyze_data(TRAINING_DATA_PATH_1, "TRAINING DATA FILE 1 (Prediction Model)")

    # 3. Analyze the second training data file (Recommendation Model Source)
    load_and_analyze_data(TRAINING_DATA_PATH_2, "TRAINING DATA FILE 2 (Recommendation Model)")
    
    print("\n--- Diagnostic Complete ---")
    print("Carefully compare the 'mean' and 'max' values for the numerical risk features between the CURRENT INPUT DATA and the TRAINING DATA FILE 1. If the current input data has significantly higher values, the 100% score is likely due to the model predicting extreme risk.")


--- Analysis for: CURRENT INPUT DATA (S3) ---
Source Path: s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv

[1] Column Names:
['submissionId', 'email', 'timestamp', 'age', 'gender', 'height_cm', 'weight_kg', 'smoking_status', 'alcohol_consumption', 'physical_activity_level', 'sleep_hours_per_day', 'stress_level', 'diet_quality', 'cholesterol_level', 'glucose_level', 'blood_pressure_sys', 'blood_pressure_dia', 'region', 'BMI_derived', 'Obesity_category', 'Hypertension_flag', 'Prediabetes_Diabetes_flag', 'Physical_inactivity_flag', 'Cardiac_risk_index', 'Stress_index', 'Diet_risk_index', 'Lung_risk_index']

[2] First 5 Rows (Features focused on risk):

| submissionId                   |   age | gender   |   cholesterol_level |   glucose_level |   blood_pressure_sys |   blood_pressure_dia |   sleep_hours_per_day |
|:-------------------------------|------:|:---------|--------------------:|----------------:|---------------------:|------------

In [16]:
import pandas as pd
import numpy as np
import joblib
import os
from typing import Dict, Any, List
import json # Used for pretty printing the output

# --- Configuration ---
MODEL_DIR = "model_artifacts" # Must match the directory where training script saved the files

# Check for model files
if not os.path.exists(MODEL_DIR):
    print(f"Error: Model directory '{MODEL_DIR}' not found. Run training_script.py first.")
    # In a production environment, you would raise an exception here.
    exit() 

# --- 1. Load Artifacts ---
# In a production environment, this loading step would typically happen once when the service starts
# to minimize latency for individual prediction calls.
print("Loading trained models and artifacts...")
try:
    # Load Prediction Models
    prediction_scores_pipeline = joblib.load(os.path.join(MODEL_DIR, "prediction_scores_model_pipeline.joblib"))
    risk_level_pipeline = joblib.load(os.path.join(MODEL_DIR, "risk_level_model_pipeline.joblib"))
    pred_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "pred_feature_cols.joblib"))

    # Load Recommendation Models and Encoders
    recommendation_model_pipeline = joblib.load(os.path.join(MODEL_DIR, "recommendation_model_pipeline.joblib"))
    recommendation_label_encoders: Dict[str, joblib.Parallel] = joblib.load(os.path.join(MODEL_DIR, "recommendation_label_encoders.joblib"))
    rec_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "rec_feature_cols.joblib"))
    
except FileNotFoundError as e:
    print(f"Error loading artifact: {e}. Ensure all files were saved correctly by training_script.py")
    exit()

print("Artifacts loaded successfully.")


# --- 2. Prediction Function ---

def predict_health_outcomes(raw_record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Takes a single raw patient record (e.g., from DynamoDB) and generates 
    all risk scores, risk level, and personalized recommendations.

    Args:
        raw_record: A dictionary representing the patient's current health input.

    Returns:
        A dictionary containing the original data plus all predicted outcomes.
    """
    
    # 1. Prepare input for Risk Score and Risk Level Prediction
    # Create DataFrame, ensuring only the expected features are included
    try:
        pred_df = pd.DataFrame([raw_record])[pred_feature_cols]
    except KeyError as e:
        print(f"Error: Missing required input feature for prediction model: {e}")
        return {"error": f"Missing required feature: {e}"}

    # --- Predict Risk Scores (Multi-target Regression) ---
    # The pipeline handles all preprocessing (imputation, scaling, OHE)
    predicted_scores_array = prediction_scores_pipeline.predict(pred_df)
    
    score_names = ['overall_risk_score_pct', 'heart_risk_score_pct', 
                   'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 
                   'respiratory_risk_score_pct']
    
    # Format scores into a dictionary, rounding to 2 decimal places
    predicted_scores = {
        name: round(float(score), 2) for name, score in zip(score_names, predicted_scores_array[0])
    }
    
    # --- Predict Risk Level (Classification) ---
    predicted_level = risk_level_pipeline.predict(pred_df)[0]
    
    # --- Predict Recommendations (Multi-output Classification) ---
    
    # 1. Prepare input for Recommendation Model
    # Start with the features used to train the recommendation model (rec_feature_cols)
    rec_input = {k: v for k, v in raw_record.items() if k in rec_feature_cols}
    
    # 2. Add the newly predicted risk scores to the input for the recommendation model
    rec_input['Hypertension_Risk_Score_pct'] = predicted_scores['hypertension_risk_score_pct']
    rec_input['Diabetes_Risk_Score_pct'] = predicted_scores['diabetes_risk_score_pct']
    rec_input['Lung_Issue_Score_pct'] = predicted_scores['respiratory_risk_score_pct'] 
    rec_input['Heart_Score_pct'] = predicted_scores['heart_risk_score_pct']

    # Ensure all required rec features are present before creating the final DataFrame
    rec_df_input_data = {col: rec_input.get(col, None) for col in rec_feature_cols}
    rec_df = pd.DataFrame([rec_df_input_data])[rec_feature_cols]

    # Predict encoded recommendations
    predicted_recs_encoded = recommendation_model_pipeline.predict(rec_df)[0]
    
    # Decode recommendations back to human-readable strings
    predicted_recs = {}
    recommendation_targets = ['Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec', 'Other_Rec']
    
    for i, col in enumerate(recommendation_targets):
        le = recommendation_label_encoders[col]
        # Decode the predicted integer back to the string label
        predicted_recs[col] = le.inverse_transform([predicted_recs_encoded[i]])[0]
        
    # --- Compile Final Output ---
    
    final_output = {
        **raw_record, # Include original data
        'prediction_timestamp': pd.Timestamp.now().isoformat(),
        
        'risk_prediction': {
            'risk_level': predicted_level,
            **predicted_scores
        },
        'recommendations': predicted_recs
    }
    
    return final_output

# --- Example Usage (Using Dummy Data for Testing) ---

# Example 1: High Risk Patient Input (DUMMY DATA)
high_risk_record = {
    # Features required for Risk Score/Level Prediction (pred_feature_cols)
    "Sleep duration": 5.2, 
    "Avg Cholesterol": 240, 
    "Avg Glucose": 145, 
    "Avg Systolic BP": 155, 
    "Avg Diastolic BP": 95,
    "Smoking status": "Current", 
    "Alcohol consumption": "Heavy", 
    "Physical activity level": "Low", 
    "Stress level": "High", 
    "Diet quality": "Poor", 
    "Family history of chronic disease": "Yes",
    
    # Additional features required for Recommendation Model Input (rec_feature_cols)
    "Physical_inactivity_flag": 1, 
    "Sleep_hours_per_day": 5.2, 
    
    # Other metadata (not used for prediction but often included in DB record)
    "user_id": "user-A123 (High Risk)",
    "timestamp": "2025-10-14T10:00:00Z",
}

# Example 2: Medium Risk Patient Input (DUMMY DATA)
medium_risk_record = {
    "Sleep duration": 7.0, 
    "Avg Cholesterol": 190, 
    "Avg Glucose": 110, 
    "Avg Systolic BP": 130, 
    "Avg Diastolic BP": 85,
    "Smoking status": "Former", 
    "Alcohol consumption": "Moderate", 
    "Physical activity level": "Moderate", 
    "Stress level": "Medium", 
    "Diet quality": "Average", 
    "Family history of chronic disease": "No",
    
    "Physical_inactivity_flag": 0, 
    "Sleep_hours_per_day": 7.0, 
    
    "user_id": "user-B456 (Medium Risk)",
    "timestamp": "2025-10-14T10:00:00Z",
}

# Example 3: Low Risk Patient Input (DUMMY DATA)
low_risk_record = {
    "Sleep duration": 8.5, 
    "Avg Cholesterol": 150, 
    "Avg Glucose": 85, 
    "Avg Systolic BP": 115, 
    "Avg Diastolic BP": 75,
    "Smoking status": "Never", 
    "Alcohol consumption": "Light", 
    "Physical activity level": "High", 
    "Stress level": "Low", 
    "Diet quality": "Good", 
    "Family history of chronic disease": "No",
    
    "Physical_inactivity_flag": 0, 
    "Sleep_hours_per_day": 8.5, 
    
    "user_id": "user-C789 (Low Risk)",
    "timestamp": "2025-10-14T10:00:00Z",
}


# Run all three predictions and print results
print("\n" + "="*50)
print("--- Running Prediction for HIGH Risk Patient ---")
print("="*50)
print(json.dumps(predict_health_outcomes(high_risk_record), indent=4))

print("\n" + "="*50)
print("--- Running Prediction for MEDIUM Risk Patient ---")
print("="*50)
print(json.dumps(predict_health_outcomes(medium_risk_record), indent=4))

print("\n" + "="*50)
print("--- Running Prediction for LOW Risk Patient ---")
print("="*50)
print(json.dumps(predict_health_outcomes(low_risk_record), indent=4))


Loading trained models and artifacts...
Artifacts loaded successfully.

--- Running Prediction for HIGH Risk Patient ---
{
    "Sleep duration": 5.2,
    "Avg Cholesterol": 240,
    "Avg Glucose": 145,
    "Avg Systolic BP": 155,
    "Avg Diastolic BP": 95,
    "Smoking status": "Current",
    "Alcohol consumption": "Heavy",
    "Physical activity level": "Low",
    "Stress level": "High",
    "Diet quality": "Poor",
    "Family history of chronic disease": "Yes",
    "Physical_inactivity_flag": 1,
    "Sleep_hours_per_day": 5.2,
    "user_id": "user-A123 (High Risk)",
    "timestamp": "2025-10-14T10:00:00Z",
    "prediction_timestamp": "2025-10-14T08:23:52.595520",
    "risk_prediction": {
        "risk_level": "High",
        "overall_risk_score_pct": 100.0,
        "heart_risk_score_pct": 100.0,
        "hypertension_risk_score_pct": 100.0,
        "diabetes_risk_score_pct": 100.0,
        "respiratory_risk_score_pct": 100.0
    },
    "recommendations": {
        "Exercise_Rec": "C

In [27]:
import pandas as pd
import numpy as np
import joblib
import os
from typing import Dict, Any, List
import json # Used for pretty printing the output
import re # Added for regular expression matching

# --- Configuration ---
MODEL_DIR = "model_artifacts" # Must match the directory where training script saved the files
DEBUG_MODE = True # Set to True to print input features and raw model output for the first patient.

# Check for model files
if not os.path.exists(MODEL_DIR):
    print(f"Error: Model directory '{MODEL_DIR}' not found. Run training_script.py first.")
    exit() 

# --- 1. Load Artifacts ---
print("Loading trained models and artifacts...")
try:
    # Load Prediction Models
    prediction_scores_pipeline = joblib.load(os.path.join(MODEL_DIR, "prediction_scores_model_pipeline.joblib"))
    # The risk_level_pipeline is no longer necessary as we derive risk from the score.
    pred_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "pred_feature_cols.joblib"))

    # Load Recommendation Models and Encoders
    recommendation_model_pipeline = joblib.load(os.path.join(MODEL_DIR, "recommendation_model_pipeline.joblib"))
    recommendation_label_encoders: Dict[str, joblib.Parallel] = joblib.load(os.path.join(MODEL_DIR, "recommendation_label_encoders.joblib"))
    rec_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "rec_feature_cols.joblib"))
    
except FileNotFoundError as e:
    print(f"Error loading artifact: {e}. Ensure all files were saved correctly by training_script.py")
    exit()

print("Artifacts loaded successfully.")

# --- 2. Prediction Function ---

def predict_health_outcomes(raw_record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Takes a single raw patient record and generates all risk scores, risk level, 
    and personalized recommendations using the fixed, trained models.

    Args:
        raw_record: A dictionary representing the patient's current health input.

    Returns:
        A dictionary containing the original data plus all predicted outcomes.
    """
    
    is_debug_patient = raw_record.get('user_id') == 'H-A1 (High)' and DEBUG_MODE

    # 1. Prepare input for Risk Score Prediction
    try:
        pred_df = pd.DataFrame([raw_record])[pred_feature_cols]
    except KeyError as e:
        print(f"Error: Missing required input feature for prediction model: {e}")
        return {"error": f"Missing required feature: {e}"}

    if is_debug_patient:
        print("\n" + "="*80)
        print(f"--- DEBUG: Input Features for {raw_record.get('user_id')} (pred_df) ---")
        print("="*80)
        print(pred_df.to_string())


    # --- Predict Risk Scores (Multi-target Regression) ---
    predicted_scores_array = prediction_scores_pipeline.predict(pred_df)
    
    score_names = ['overall_risk_score_pct', 'heart_risk_score_pct', 
                   'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 
                   'respiratory_risk_score_pct']
    
    # Format scores into a dictionary, rounding, and applying final clip based on trained max score (95)
    predicted_scores = {}
    for name, score in zip(score_names, predicted_scores_array[0]):
         # We rely on the model being non-saturated now, but still clip to ensure valid range
        predicted_scores[name] = round(float(np.clip(score, 0.0, 95.0)), 2)
    
    if is_debug_patient:
        print("\n" + "="*80)
        print("--- DEBUG: Final Predicted Scores ---")
        print("="*80)
        print(predicted_scores)

    # --- Determine Risk Level (Based on overall score, standard thresholds) ---
    overall_score = predicted_scores['overall_risk_score_pct']
    if overall_score <= 35.0:
        predicted_level = "Low"
    elif overall_score <= 70.0:
        predicted_level = "Medium"
    else:
        predicted_level = "High"

    # --- Predict Recommendations (Multi-output Classification) ---
    
    # 1. Prepare input for Recommendation Model (features + newly predicted scores)
    rec_input = {k: v for k, v in raw_record.items() if k in rec_feature_cols}
    
    # 2. Add the newly predicted risk scores to the input for the recommendation model
    rec_input['hypertension_risk_score_pct'] = predicted_scores['hypertension_risk_score_pct']
    rec_input['diabetes_risk_score_pct'] = predicted_scores['diabetes_risk_score_pct']
    rec_input['respiratory_risk_score_pct'] = predicted_scores['respiratory_risk_score_pct'] 
    rec_input['heart_risk_score_pct'] = predicted_scores['heart_risk_score_pct']

    # Ensure all required rec features are present before creating the final DataFrame
    rec_df_input_data = {col: rec_input.get(col, None) for col in rec_feature_cols}
    rec_df = pd.DataFrame([rec_df_input_data])[rec_feature_cols]

    # Predict encoded recommendations
    predicted_recs_encoded = recommendation_model_pipeline.predict(rec_df)[0]
    
    # Decode recommendations back to human-readable strings
    predicted_recs = {}
    recommendation_targets = ['Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec', 'Other_Rec']
    
    for i, col in enumerate(recommendation_targets):
        le = recommendation_label_encoders[col]
        # Decode the predicted integer back to the string label
        predicted_recs[col] = le.inverse_transform([predicted_recs_encoded[i]])[0]
        
    # --- Compile Final Output ---
    
    final_output = {
        **raw_record, # Include original data
        'risk_level': predicted_level,
        **predicted_scores,
        **predicted_recs
    }
    
    return final_output

# --- 3. Example Usage (Generating and Testing 10 Dummy Data Records) ---

DUMMY_RECORDS = [
    # HIGH RISK (4 Records)
    {
        "user_id": "H-A1 (High)", "Sleep duration": 5.2, "Avg Cholesterol": 240, 
        "Avg Glucose": 145, "Avg Systolic BP": 155, "Avg Diastolic BP": 95,
        "Smoking status": "Current", "Alcohol consumption": "Heavy", 
        "Physical activity level": "Low", "Stress level": "High", 
        "Diet quality": "Poor", "Family history of chronic disease": "Yes",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 5.2
    },
    {
        "user_id": "H-B2 (High)", "Sleep duration": 6.0, "Avg Cholesterol": 225, 
        "Avg Glucose": 130, "Avg Systolic BP": 145, "Avg Diastolic BP": 90,
        "Smoking status": "Current", "Alcohol consumption": "Moderate", 
        "Physical activity level": "Low", "Stress level": "High", 
        "Diet quality": "Poor", "Family history of chronic disease": "Yes",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 6.0
    },
    {
        "user_id": "H-C3 (High)", "Sleep duration": 6.5, "Avg Cholesterol": 210, 
        "Avg Glucose": 120, "Avg Systolic BP": 140, "Avg Diastolic BP": 88,
        "Smoking status": "Former", "Alcohol consumption": "Moderate", 
        "Physical activity level": "Low", "Stress level": "Medium", 
        "Diet quality": "Average", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 6.5
    },
    {
        "user_id": "H-D4 (High)", "Sleep duration": 5.8, "Avg Cholesterol": 230, 
        "Avg Glucose": 150, "Avg Systolic BP": 160, "Avg Diastolic BP": 100,
        "Smoking status": "Current", "Alcohol consumption": "Heavy", 
        "Physical activity level": "Low", "Stress level": "High", 
        "Diet quality": "Poor", "Family history of chronic disease": "Yes",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 5.8
    },
    
    # MEDIUM RISK (3 Records) - ADJUSTED BIOMETRICS
    {
        "user_id": "M-E5 (Medium)", "Sleep duration": 7.0, "Avg Cholesterol": 180, 
        "Avg Glucose": 100, "Avg Systolic BP": 125, "Avg Diastolic BP": 85,
        "Smoking status": "Former", "Alcohol consumption": "Moderate", 
        "Physical activity level": "Moderate", "Stress level": "Medium", 
        "Diet quality": "Average", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 7.0
    },
    {
        "user_id": "M-F6 (Medium)", "Sleep duration": 7.5, "Avg Cholesterol": 165, 
        "Avg Glucose": 95, "Avg Systolic BP": 118, "Avg Diastolic BP": 80,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "Moderate", "Stress level": "Medium", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 7.5
    },
    {
        "user_id": "M-G7 (Medium)", "Sleep duration": 6.8, "Avg Cholesterol": 190, 
        "Avg Glucose": 110, "Avg Systolic BP": 130, "Avg Diastolic BP": 89,
        "Smoking status": "Former", "Alcohol consumption": "Heavy", 
        "Physical activity level": "Moderate", "Stress level": "High", 
        "Diet quality": "Average", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 6.8
    },
    
    # LOW RISK (3 Records) - ADJUSTED BIOMETRICS
    {
        "user_id": "L-H8 (Low)", "Sleep duration": 8.5, "Avg Cholesterol": 130, 
        "Avg Glucose": 80, "Avg Systolic BP": 110, "Avg Diastolic BP": 75,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "High", "Stress level": "Low", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 8.5
    },
    {
        "user_id": "L-I9 (Low)", "Sleep duration": 8.0, "Avg Cholesterol": 140, 
        "Avg Glucose": 85, "Avg Systolic BP": 112, "Avg Diastolic BP": 78,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "High", "Stress level": "Low", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 8.0
    },
    {
        "user_id": "L-J10 (Low)", "Sleep duration": 7.8, "Avg Cholesterol": 150, 
        "Avg Glucose": 90, "Avg Systolic BP": 115, "Avg Diastolic BP": 80,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "Moderate", "Stress level": "Low", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 7.8
    }
]

# Run all predictions and collect results
all_results = []
print("\n" + "="*80)
print("--- Generating Predictions for 10 Dummy Patients (Using Fixed Model) ---")
print("="*80)

for record in DUMMY_RECORDS:
    try:
        result = predict_health_outcomes(record)
        all_results.append(result)
    except Exception as e:
        print(f"Prediction failed for user_id {record.get('user_id', 'Unknown')}: {e}")

# --- Format Results into a Table ---

if all_results:
    # Convert list of results (which are dicts) to a DataFrame
    results_df = pd.DataFrame(all_results)
    
    # Define the columns for the final display table
    display_columns = [
        'user_id', 'risk_level', 'overall_risk_score_pct',
        'heart_risk_score_pct', 'diabetes_risk_score_pct', 
        'Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec'
    ]

    # Select and rename columns for clarity in the output table
    final_table = results_df[display_columns].rename(columns={
        'user_id': 'User ID',
        'risk_level': 'Risk Level',
        'overall_risk_score_pct': 'Overall Score (%)',
        'heart_risk_score_pct': 'Heart Score (%)',
        'diabetes_risk_score_pct': 'Diabetes Score (%)',
        'Exercise_Rec': 'Exercise Rec',
        'Nutrition_Rec': 'Nutrition Rec',
        'Sleep_Rec': 'Sleep Rec',
        'Stress_Rec': 'Stress Rec'
    })
    
    # Set display options for full text printing
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.colheader_justify', 'left')
    pd.set_option('display.expand_frame_repr', False)
    
    print("\n" + "="*150)
    print("--- CONSOLIDATED PREDICTION RESULTS (10 PATIENTS) ---")
    print("="*150)
    # Print the DataFrame as a string for a well-formatted table in the console
    print(final_table.to_string())

else:
    print("No predictions were successfully generated.")


Loading trained models and artifacts...
Artifacts loaded successfully.

--- Generating Predictions for 10 Dummy Patients (Using Fixed Model) ---

--- DEBUG: Input Features for H-A1 (High) (pred_df) ---
   Sleep duration  Avg Cholesterol  Avg Glucose  Avg Systolic BP  Avg Diastolic BP Smoking status Alcohol consumption Physical activity level Stress level Diet quality Family history of chronic disease
0  5.2             240              145          155              95                Current        Heavy               Low                     High         Poor         Yes                             

--- DEBUG: Final Predicted Scores ---
{'overall_risk_score_pct': 87.34, 'heart_risk_score_pct': 92.23, 'hypertension_risk_score_pct': 89.09, 'diabetes_risk_score_pct': 94.18, 'respiratory_risk_score_pct': 73.97}

--- CONSOLIDATED PREDICTION RESULTS (10 PATIENTS) ---
  User ID        Risk Level  Overall Score (%)  Heart Score (%)  Diabetes Score (%) Exercise Rec                              

In [None]:
version 2

In [28]:
import pandas as pd
import numpy as np
import joblib
import os
from typing import Dict, Any, List
import json # Used for pretty printing the output
import re # Added for regular expression matching

# --- Configuration ---
MODEL_DIR = "model_artifacts" # Must match the directory where training script saved the files
DEBUG_MODE = True # Set to True to print input features and raw model output for the first patient.

# Check for model files
if not os.path.exists(MODEL_DIR):
    print(f"Error: Model directory '{MODEL_DIR}' not found. Run training_script.py first.")
    exit() 

# --- 1. Load Artifacts ---
print("Loading trained models and artifacts...")
try:
    # Load Prediction Models
    prediction_scores_pipeline = joblib.load(os.path.join(MODEL_DIR, "prediction_scores_model_pipeline.joblib"))
    # The risk_level_pipeline is no longer necessary as we derive risk from the score.
    pred_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "pred_feature_cols.joblib"))

    # Load Recommendation Models and Encoders
    recommendation_model_pipeline = joblib.load(os.path.join(MODEL_DIR, "recommendation_model_pipeline.joblib"))
    recommendation_label_encoders: Dict[str, joblib.Parallel] = joblib.load(os.path.join(MODEL_DIR, "recommendation_label_encoders.joblib"))
    rec_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "rec_feature_cols.joblib"))
    
except FileNotFoundError as e:
    print(f"Error loading artifact: {e}. Ensure all files were saved correctly by training_script.py")
    exit()

print("Artifacts loaded successfully.")

# --- Helper Function for Customizing Recommendations ---
def refine_recommendations(raw_record: Dict[str, Any], predicted_recs: Dict[str, str], predicted_scores: Dict[str, float]) -> Dict[str, str]:
    """
    Refines generic model recommendations based on specific patient biometrics and risk scores
    to make the output more personalized and less repetitive for Low/Medium risk patients.
    """
    refined_recs = predicted_recs.copy()
    
    # Refine Exercise Recommendation
    if refined_recs['Exercise_Rec'].startswith("Aim for 30 minutes of aerobic"):
        # This generic recommendation is given for 'Moderate' activity level.
        heart_score = predicted_scores.get('heart_risk_score_pct', 0)
        # Add a note based on a slightly elevated heart score
        if heart_score > 30: 
            refined_recs['Exercise_Rec'] += " Pay extra attention to your heart rate during workouts, keeping it within a moderate zone."
        else:
             # Provide a gentler push for consistency and variety
             refined_recs['Exercise_Rec'] += " Aim for variety, trying swimming or yoga to supplement your current routine."
        
    elif refined_recs['Exercise_Rec'].startswith("Excellent activity level"):
         # This is for 'High' activity level (L-H8, L-I9)
         refined_recs['Exercise_Rec'] += " Consider adding 5-10 minutes of targeted flexibility and stretching training daily."
            
    # Refine Nutrition Recommendation
    if refined_recs['Nutrition_Rec'].startswith("Great diet quality!"):
        # This generic recommendation is given for 'Good' diet quality.
        glucose = raw_record.get('Avg Glucose', 0)
        cholesterol = raw_record.get('Avg Cholesterol', 0)
        
        # Check for slightly elevated biometrics even with a 'Good' diet
        if glucose > 90 and cholesterol > 150: 
            refined_recs['Nutrition_Rec'] += " Since your glucose and cholesterol are monitored, ensure you minimize saturated fats and hidden sugars to maintain these excellent levels."
        elif glucose > 90:
            refined_recs['Nutrition_Rec'] += " Keep a closer eye on your carbohydrate and sugar intake to maintain optimal glucose levels."
        
    elif refined_recs['Nutrition_Rec'].startswith("Increase vegetable and lean protein"):
        # This is for 'Average' diet quality (M-E5, M-G7)
        refined_recs['Nutrition_Rec'] += " Plan your meals a week in advance to ensure consistent intake of nutrient-dense foods."
        
    return refined_recs


# --- 2. Prediction Function ---

def predict_health_outcomes(raw_record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Takes a single raw patient record and generates all risk scores, risk level, 
    and personalized recommendations using the fixed, trained models.

    Args:
        raw_record: A dictionary representing the patient's current health input.

    Returns:
        A dictionary containing the original data plus all predicted outcomes.
    """
    
    is_debug_patient = raw_record.get('user_id') == 'H-A1 (High)' and DEBUG_MODE

    # 1. Prepare input for Risk Score Prediction
    try:
        pred_df = pd.DataFrame([raw_record])[pred_feature_cols]
    except KeyError as e:
        print(f"Error: Missing required input feature for prediction model: {e}")
        return {"error": f"Missing required feature: {e}"}

    if is_debug_patient:
        print("\n" + "="*80)
        print(f"--- DEBUG: Input Features for {raw_record.get('user_id')} (pred_df) ---")
        print("="*80)
        print(pred_df.to_string())


    # --- Predict Risk Scores (Multi-target Regression) ---
    predicted_scores_array = prediction_scores_pipeline.predict(pred_df)
    
    score_names = ['overall_risk_score_pct', 'heart_risk_score_pct', 
                   'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 
                   'respiratory_risk_score_pct']
    
    # Format scores into a dictionary, rounding, and applying final clip based on trained max score (95)
    predicted_scores = {}
    for name, score in zip(score_names, predicted_scores_array[0]):
         # We rely on the model being non-saturated now, but still clip to ensure valid range
        predicted_scores[name] = round(float(np.clip(score, 0.0, 95.0)), 2)
    
    if is_debug_patient:
        print("\n" + "="*80)
        print("--- DEBUG: Final Predicted Scores ---")
        print("="*80)
        print(predicted_scores)

    # --- Determine Risk Level (Based on overall score, standard thresholds) ---
    overall_score = predicted_scores['overall_risk_score_pct']
    if overall_score <= 35.0:
        predicted_level = "Low"
    elif overall_score <= 70.0:
        predicted_level = "Medium"
    else:
        predicted_level = "High"

    # --- Predict Recommendations (Multi-output Classification) ---
    
    # 1. Prepare input for Recommendation Model (features + newly predicted scores)
    rec_input = {k: v for k, v in raw_record.items() if k in rec_feature_cols}
    
    # 2. Add the newly predicted risk scores to the input for the recommendation model
    rec_input['hypertension_risk_score_pct'] = predicted_scores['hypertension_risk_score_pct']
    rec_input['diabetes_risk_score_pct'] = predicted_scores['diabetes_risk_score_pct']
    rec_input['respiratory_risk_score_pct'] = predicted_scores['respiratory_risk_score_pct'] 
    rec_input['heart_risk_score_pct'] = predicted_scores['heart_risk_score_pct']

    # Ensure all required rec features are present before creating the final DataFrame
    rec_df_input_data = {col: rec_input.get(col, None) for col in rec_feature_cols}
    rec_df = pd.DataFrame([rec_df_input_data])[rec_feature_cols]

    # Predict encoded recommendations
    predicted_recs_encoded = recommendation_model_pipeline.predict(rec_df)[0]
    
    # Decode recommendations back to human-readable strings
    predicted_recs = {}
    recommendation_targets = ['Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec', 'Other_Rec']
    
    for i, col in enumerate(recommendation_targets):
        le = recommendation_label_encoders[col]
        # Decode the predicted integer back to the string label
        predicted_recs[col] = le.inverse_transform([predicted_recs_encoded[i]])[0]
        
    # --- Refine Recommendations for better personalization ---
    final_recs = refine_recommendations(raw_record, predicted_recs, predicted_scores)

    # --- Compile Final Output ---
    
    final_output = {
        **raw_record, # Include original data
        'risk_level': predicted_level,
        **predicted_scores,
        **final_recs
    }
    
    return final_output

# --- 3. Example Usage (Generating and Testing 10 Dummy Data Records) ---

DUMMY_RECORDS = [
    # HIGH RISK (4 Records)
    {
        "user_id": "H-A1 (High)", "Sleep duration": 5.2, "Avg Cholesterol": 240, 
        "Avg Glucose": 145, "Avg Systolic BP": 155, "Avg Diastolic BP": 95,
        "Smoking status": "Current", "Alcohol consumption": "Heavy", 
        "Physical activity level": "Low", "Stress level": "High", 
        "Diet quality": "Poor", "Family history of chronic disease": "Yes",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 5.2
    },
    {
        "user_id": "H-B2 (High)", "Sleep duration": 6.0, "Avg Cholesterol": 225, 
        "Avg Glucose": 130, "Avg Systolic BP": 145, "Avg Diastolic BP": 90,
        "Smoking status": "Current", "Alcohol consumption": "Moderate", 
        "Physical activity level": "Low", "Stress level": "High", 
        "Diet quality": "Poor", "Family history of chronic disease": "Yes",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 6.0
    },
    {
        "user_id": "H-C3 (High)", "Sleep duration": 6.5, "Avg Cholesterol": 210, 
        "Avg Glucose": 120, "Avg Systolic BP": 140, "Avg Diastolic BP": 88,
        "Smoking status": "Former", "Alcohol consumption": "Moderate", 
        "Physical activity level": "Low", "Stress level": "Medium", 
        "Diet quality": "Average", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 6.5
    },
    {
        "user_id": "H-D4 (High)", "Sleep duration": 5.8, "Avg Cholesterol": 230, 
        "Avg Glucose": 150, "Avg Systolic BP": 160, "Avg Diastolic BP": 100,
        "Smoking status": "Current", "Alcohol consumption": "Heavy", 
        "Physical activity level": "Low", "Stress level": "High", 
        "Diet quality": "Poor", "Family history of chronic disease": "Yes",
        "Physical_inactivity_flag": 1, "Sleep_hours_per_day": 5.8
    },
    
    # MEDIUM RISK (3 Records) - ADJUSTED BIOMETRICS
    {
        "user_id": "M-E5 (Medium)", "Sleep duration": 7.0, "Avg Cholesterol": 180, 
        "Avg Glucose": 100, "Avg Systolic BP": 125, "Avg Diastolic BP": 85,
        "Smoking status": "Former", "Alcohol consumption": "Moderate", 
        "Physical activity level": "Moderate", "Stress level": "Medium", 
        "Diet quality": "Average", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 7.0
    },
    {
        "user_id": "M-F6 (Medium)", "Sleep duration": 7.5, "Avg Cholesterol": 165, 
        "Avg Glucose": 95, "Avg Systolic BP": 118, "Avg Diastolic BP": 80,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "Moderate", "Stress level": "Medium", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 7.5
    },
    {
        "user_id": "M-G7 (Medium)", "Sleep duration": 6.8, "Avg Cholesterol": 190, 
        "Avg Glucose": 110, "Avg Systolic BP": 130, "Avg Diastolic BP": 89,
        "Smoking status": "Former", "Alcohol consumption": "Heavy", 
        "Physical activity level": "Moderate", "Stress level": "High", 
        "Diet quality": "Average", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 6.8
    },
    
    # LOW RISK (3 Records) - ADJUSTED BIOMETRICS
    {
        "user_id": "L-H8 (Low)", "Sleep duration": 8.5, "Avg Cholesterol": 130, 
        "Avg Glucose": 80, "Avg Systolic BP": 110, "Avg Diastolic BP": 75,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "High", "Stress level": "Low", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 8.5
    },
    {
        "user_id": "L-I9 (Low)", "Sleep duration": 8.0, "Avg Cholesterol": 140, 
        "Avg Glucose": 85, "Avg Systolic BP": 112, "Avg Diastolic BP": 78,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "High", "Stress level": "Low", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 8.0
    },
    {
        "user_id": "L-J10 (Low)", "Sleep duration": 7.8, "Avg Cholesterol": 150, 
        "Avg Glucose": 90, "Avg Systolic BP": 115, "Avg Diastolic BP": 80,
        "Smoking status": "Never", "Alcohol consumption": "Light", 
        "Physical activity level": "Moderate", "Stress level": "Low", 
        "Diet quality": "Good", "Family history of chronic disease": "No",
        "Physical_inactivity_flag": 0, "Sleep_hours_per_day": 7.8
    }
]

# Run all predictions and collect results
all_results = []
print("\n" + "="*80)
print("--- Generating Predictions for 10 Dummy Patients (Using Fixed Model) ---")
print("="*80)

for record in DUMMY_RECORDS:
    try:
        result = predict_health_outcomes(record)
        all_results.append(result)
    except Exception as e:
        print(f"Prediction failed for user_id {record.get('user_id', 'Unknown')}: {e}")

# --- Format Results into a Table ---

if all_results:
    # Convert list of results (which are dicts) to a DataFrame
    results_df = pd.DataFrame(all_results)
    
    # Define the columns for the final display table
    display_columns = [
        'user_id', 'risk_level', 'overall_risk_score_pct',
        'heart_risk_score_pct', 'diabetes_risk_score_pct', 
        'Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec'
    ]

    # Select and rename columns for clarity in the output table
    final_table = results_df[display_columns].rename(columns={
        'user_id': 'User ID',
        'risk_level': 'Risk Level',
        'overall_risk_score_pct': 'Overall Score (%)',
        'heart_risk_score_pct': 'Heart Score (%)',
        'diabetes_risk_score_pct': 'Diabetes Score (%)',
        'Exercise_Rec': 'Exercise Rec',
        'Nutrition_Rec': 'Nutrition Rec',
        'Sleep_Rec': 'Sleep Rec',
        'Stress_Rec': 'Stress Rec'
    })
    
    # Set display options for full text printing
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.colheader_justify', 'left')
    pd.set_option('display.expand_frame_repr', False)
    
    print("\n" + "="*150)
    print("--- CONSOLIDATED PREDICTION RESULTS (10 PATIENTS) ---")
    print("="*150)
    # Print the DataFrame as a string for a well-formatted table in the console
    print(final_table.to_string())

else:
    print("No predictions were successfully generated.")


Loading trained models and artifacts...
Artifacts loaded successfully.

--- Generating Predictions for 10 Dummy Patients (Using Fixed Model) ---

--- DEBUG: Input Features for H-A1 (High) (pred_df) ---
   Sleep duration  Avg Cholesterol  Avg Glucose  Avg Systolic BP  Avg Diastolic BP Smoking status Alcohol consumption Physical activity level Stress level Diet quality Family history of chronic disease
0  5.2             240              145          155              95                Current        Heavy               Low                     High         Poor         Yes                             

--- DEBUG: Final Predicted Scores ---
{'overall_risk_score_pct': 87.34, 'heart_risk_score_pct': 92.23, 'hypertension_risk_score_pct': 89.09, 'diabetes_risk_score_pct': 94.18, 'respiratory_risk_score_pct': 73.97}

--- CONSOLIDATED PREDICTION RESULTS (10 PATIENTS) ---
  User ID        Risk Level  Overall Score (%)  Heart Score (%)  Diabetes Score (%) Exercise Rec                              

In [None]:
Final Version

In [58]:
import pandas as pd
import numpy as np
import joblib
import os
from typing import Dict, Any, List
from pathlib import Path

# --- Configuration ---
MODEL_DIR = "model_artifacts" # Must match the directory where training script saved the files

# CORRECTED PATH: Using the full S3 URL as specified by the user.
INPUT_FILE_PATH = "s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv" 
DEBUG_MODE = True # Set to True to print input features and raw model output for the first patient.

# Check for model files
if not os.path.exists(MODEL_DIR):
    print(f"Error: Model directory '{MODEL_DIR}' not found. Run training_script.py first.")
    exit() 

# --- 1. Load Artifacts ---
print("Loading trained models and artifacts...")
try:
    # Load Prediction Models (Multi-target Regression)
    prediction_scores_pipeline = joblib.load(os.path.join(MODEL_DIR, "prediction_scores_model_pipeline.joblib"))
    pred_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "pred_feature_cols.joblib"))

    # Load Recommendation Models (Multi-output Classification) and Encoders
    recommendation_model_pipeline = joblib.load(os.path.join(MODEL_DIR, "recommendation_model_pipeline.joblib"))
    recommendation_label_encoders: Dict[str, joblib.Parallel] = joblib.load(os.path.join(MODEL_DIR, "recommendation_label_encoders.joblib"))
    rec_feature_cols: List[str] = joblib.load(os.path.join(MODEL_DIR, "rec_feature_cols.joblib"))
    
except FileNotFoundError as e:
    print(f"Error loading artifact: {e}. Ensure all files were saved correctly by training_script.py")
    exit()

print("Artifacts loaded successfully.")

# --- 2. Prediction Function ---

def predict_health_outcomes(raw_record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Takes a single raw patient record and generates all risk scores, risk level, 
    and personalized recommendations using the trained ML models.

    Args:
        raw_record: A dictionary representing the patient's current health input.

    Returns:
        A dictionary containing the original data plus all predicted outcomes.
    """
    
    user_id = raw_record.get('user_id', 'Unknown')
    # Using the first user ID from the debug output for consistent debugging
    is_debug_patient = (user_id == 'athlete-20251012T013727680Z') and DEBUG_MODE 

    # --- 2.1 Predict Risk Scores (Multi-target Regression) ---

    # 1. Prepare input for Risk Score Prediction
    try:
        # Select and ORDER the columns strictly according to the features the model expects
        pred_df = pd.DataFrame([raw_record])[pred_feature_cols]
    except KeyError as e:
        print(f"Error: Missing required input feature for prediction model: {e}")
        return {"error": f"Missing required feature: {e}"}

    # Defensive Check: Handle NaN values (relying on pipeline imputer, removed explicit fillna(0))
    if pred_df.isnull().any().any():
        if is_debug_patient:
            print("\n!!! WARNING: NaN values detected in prediction input features. Relying on pipeline Imputer. !!!")

    
    if is_debug_patient:
        print("\n" + "="*80)
        print(f"--- DEBUG: Input Features for {user_id} (pred_df) ---")
        print("="*80)
        print(pred_df.to_string())
        
        # --- DEBUG STEP: Print dtypes check before pipeline.predict() ---
        print("\n--- DEBUG: Dtypes check before pipeline.predict() ---")
        print(pred_df.dtypes)
        print("="*80)
        # -----------------------------------------------------------------


    # Predict scores
    predicted_scores_array = prediction_scores_pipeline.predict(pred_df)
    
    score_names = ['overall_risk_score_pct', 'heart_risk_score_pct', 
                   'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 
                   'respiratory_risk_score_pct']
    
    # Format scores into a dictionary
    predicted_scores = {}
    for name, score in zip(score_names, predicted_scores_array[0]):
        # Clip score to be between 0 and 95 (based on training max)
        predicted_scores[name] = round(float(np.clip(score, 0.0, 95.0)), 2)
    
    if is_debug_patient:
        print("\n" + "="*80)
        print("--- DEBUG: Final Predicted Scores ---")
        print("="*80)
        print(predicted_scores)

    # --- Determine Risk Level ---
    overall_score = predicted_scores['overall_risk_score_pct']
    if overall_score <= 35.0:
        predicted_level = "Low"
    elif overall_score <= 70.0:
        predicted_level = "Medium"
    else:
        predicted_level = "High"

    # --- 2.2 Predict Recommendations (Multi-output Classification) ---
    
    # 1. Prepare input for Recommendation Model (features + newly predicted scores)
    rec_input = {k: v for k, v in raw_record.items() if k in rec_feature_cols}
    
    # 2. Add the newly predicted risk scores to the input for the recommendation model
    rec_input['hypertension_risk_score_pct'] = predicted_scores['hypertension_risk_score_pct']
    rec_input['diabetes_risk_score_pct'] = predicted_scores['diabetes_risk_score_pct']
    rec_input['respiratory_risk_score_pct'] = predicted_scores['respiratory_risk_score_pct'] 
    rec_input['heart_risk_score_pct'] = predicted_scores['heart_risk_score_pct']

    # Ensure all required rec features are present before creating the final DataFrame
    rec_df_input_data = {col: rec_input.get(col, None) for col in rec_feature_cols}
    rec_df = pd.DataFrame([rec_df_input_data])[rec_feature_cols]
    
    # Explicitly cast numerical columns for the recommendation model input too
    numeric_rec_cols = ['Avg Cholesterol', 'Avg Glucose', 'Avg Systolic BP', 'Avg Diastolic BP', 
                        'Sleep duration', 'Stress level', 'Physical activity level',
                        'hypertension_risk_score_pct', 'diabetes_risk_score_pct', 
                        'respiratory_risk_score_pct', 'heart_risk_score_pct']
    
    for col in numeric_rec_cols:
        if col in rec_df.columns:
            # Errors='coerce' converts non-numeric values to NaN, which the pipeline can handle.
            rec_df[col] = pd.to_numeric(rec_df[col], errors='coerce').astype(float)


    # Predict encoded recommendations
    predicted_recs_encoded = recommendation_model_pipeline.predict(rec_df)[0]
    
    # Decode recommendations back to human-readable strings
    predicted_recs = {}
    recommendation_targets = ['Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec', 'Other_Rec']
    
    for i, col in enumerate(recommendation_targets):
        le = recommendation_label_encoders[col]
        # Decode the predicted integer back to the string label
        predicted_recs[col] = le.inverse_transform([predicted_recs_encoded[i]])[0]
        
    # --- 2.3 Compile Final Output ---
    
    final_output = {
        **raw_record, # Include original data
        'risk_level': predicted_level,
        **predicted_scores,
        **predicted_recs # Use the pure ML-generated recommendations
    }
    
    return final_output


# --- 3. Data Loading and Execution ---

def load_data_from_csv(file_path: str) -> List[Dict[str, Any]]:
    """
    Loads the CSV, renames columns to match model expectations, and returns a list of records.
    Handles loading directly from an S3 path using the 's3fs' dependency.
    """
    
    try:
        # Pandas can read directly from S3 paths if 's3fs' is installed
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading CSV file from S3: {e}")
        if file_path.startswith("s3://"):
             print(f"Attempting to read from local file path as a fallback...")
             try:
                 local_file_name = file_path.split('/')[-1]
                 df = pd.read_csv(local_file_name)
             except Exception as local_e:
                 print(f"Error reading from local fallback path: {local_e}")
                 return []
        else:
             return []
        
    # Define mapping from CSV columns to model-expected features
    column_mapping = {
        'submissionId': 'user_id',
        'sleep_hours_per_day': 'Sleep duration', 
        'cholesterol_level': 'Avg Cholesterol',
        'glucose_level': 'Avg Glucose',
        'blood_pressure_sys': 'Avg Systolic BP',
        'blood_pressure_dia': 'Avg Diastolic BP',
        'smoking_status': 'Smoking status',
        'alcohol_consumption': 'Alcohol consumption',
        'physical_activity_level': 'Physical activity level',
        'stress_level': 'Stress level',
        'diet_quality': 'Diet quality',
        # Features that match exactly:
        'Physical_inactivity_flag': 'Physical_inactivity_flag',
    }

    # Rename columns in the DataFrame
    df.rename(columns=column_mapping, inplace=True)
    
    # --- FIX 1: Explicitly cast all expected numerical columns to float ---
    numeric_cols = [
        'Sleep duration', 'Avg Cholesterol', 'Avg Glucose', 'Avg Systolic BP', 
        'Avg Diastolic BP', 'Stress level', 'Physical activity level' 
    ]
    
    for col in numeric_cols:
        if col in df.columns:
            # Use errors='coerce' to turn any unexpected strings into NaN (which is a float type)
            df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)
            
    # --- FIX 2: Ensure all categorical columns are clean strings AND cast to 'category' ---
    categorical_cols = [
        'Smoking status', 'Alcohol consumption', 'Diet quality',
        'Family history of chronic disease' 
    ]
    for col in categorical_cols:
         if col in df.columns:
            # Clean string data
            df[col] = df[col].astype(str).str.strip().str.title()
            # Explicitly cast to 'category' type for Scikit-learn ColumnTransformer
            df[col] = df[col].astype('category') 
            
    # Map categorical values to match the expected labels (consistency check)
    df['Smoking status'] = df['Smoking status'].replace({'Never_Smoked': 'Never', 'Current_Smoker': 'Current', 'Former_Smoker': 'Former'})
    
    # Crucial imputation: 'Family history of chronic disease' is missing in the input file 
    if 'Family history of chronic disease' not in df.columns:
        df['Family history of chronic disease'] = 'No' 

    # Convert DataFrame rows to a list of dictionaries (records)
    print(f"Successfully loaded {len(df)} records from {file_path}.")
    return df.to_dict('records')


# Load the data from the input CSV
RECORDS_TO_TEST = load_data_from_csv(INPUT_FILE_PATH)

# Run all predictions and collect results
all_results = []
print("\n" + "="*80)
print(f"--- Generating Predictions for {len(RECORDS_TO_TEST)} Patients (from {INPUT_FILE_PATH}) ---")
print("="*80)

if RECORDS_TO_TEST:
    for record in RECORDS_TO_TEST:
        try:
            result = predict_health_outcomes(record)
            all_results.append(result)
        except Exception as e:
            user_id = record.get('user_id', 'Unknown')
            print(f"Prediction failed for user_id {user_id}: {e}")

# --- Format Results into a Table ---

if all_results:
    # Convert list of results (which are dicts) to a DataFrame
    results_df = pd.DataFrame(all_results)
    
    # Define the columns for the final display table
    display_columns = [
        'user_id', 'risk_level', 'overall_risk_score_pct',
        'heart_risk_score_pct', 'diabetes_risk_score_pct', 
        'Exercise_Rec', 'Nutrition_Rec', 'Sleep_Rec', 'Stress_Rec'
    ]

    # Select and rename columns for clarity in the output table
    final_table = results_df[display_columns].rename(columns={
        'user_id': 'User ID',
        'risk_level': 'Risk Level',
        'overall_risk_score_pct': 'Overall Score (%)',
        'heart_risk_score_pct': 'Heart Score (%)',
        'diabetes_risk_score_pct': 'Diabetes Score (%)',
        'Exercise_Rec': 'Exercise Rec',
        'Nutrition_Rec': 'Nutrition Rec',
        'Sleep_Rec': 'Sleep Rec',
        'Stress_Rec': 'Stress Rec'
    })
    
    # Set display options for full text printing
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.colheader_justify', 'left')
    pd.set_option('display.expand_frame_repr', False)
    
    print("\n" + "="*150)
    print(f"--- CONSOLIDATED PREDICTION RESULTS ({len(RECORDS_TO_TEST)} PATIENTS) ---")
    print("="*150)
    # Print the DataFrame as a string for a well-formatted table in the console
    print(final_table.to_string())

else:
    print("No predictions were successfully generated.")


Loading trained models and artifacts...
Artifacts loaded successfully.
Successfully loaded 12 records from s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv.

--- Generating Predictions for 12 Patients (from s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv) ---

--- DEBUG: Input Features for athlete-20251012T013727680Z (pred_df) ---
   Sleep duration  Avg Cholesterol  Avg Glucose  Avg Systolic BP  Avg Diastolic BP Smoking status Alcohol consumption  Physical activity level  Stress level Diet quality Family history of chronic disease
0  8.0             160.0            90.0         115.0            75.0              Never          Moderate            10.0                     3.0           Good         No                              

--- DEBUG: Dtypes check before pipeline.predict() ---
Sleep duration                       float64
Avg Cholesterol                      float64
Avg Glucose      

  df['Smoking status'] = df['Smoking status'].replace({'Never_Smoked': 'Never', 'Current_Smoker': 'Current', 'Former_Smoker': 'Former'})


Loading trained models and artifacts...
Artifacts loaded successfully.
Error: Input file not found at s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv

--- Generating Predictions for 0 Patients (from s3://wellifyyyy/processed_for_sagemaker/part-00000-9e2e89b9-68c7-4faa-8238-4967f75d26c5-c000.csv) ---
No predictions were successfully generated.
