In [1]:
import numpy as np
import pandas as pd
import joblib
import os
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Set a seed for reproducibility
np.random.seed(42)
NUM_RECORDS = 20000  # Increased to 20k for better data diversity and model robustness

print("--- Iteration 4: ENHANCED Data Generation with Advanced Parameter Weighting ---")
print("STEP 1: Generating Enhanced Synthetic Data with Intelligent Weighting...")

# --- 1. Define Input Feature Distributions with Better Real-World Patterns ---
# More realistic age distribution (bimodal: young athletes + older fitness enthusiasts)
AGE = np.concatenate([
    np.random.normal(28, 6, int(NUM_RECORDS * 0.6)).clip(18, 40).astype(int),
    np.random.normal(48, 8, int(NUM_RECORDS * 0.4)).clip(40, 70).astype(int)
])
np.random.shuffle(AGE)
AGE = AGE[:NUM_RECORDS]

# Sex distribution (slightly more male gym-goers in data)
SEX = np.random.choice(['M', 'F'], size=NUM_RECORDS, p=[0.6, 0.4])

# Weight follows BMI-realistic distribution
WEIGHT_KG = np.random.normal(78, 14, size=NUM_RECORDS).clip(45, 130).round(1)

# Sleep: realistic with some chronically sleep-deprived individuals
SLEEP_HRS = np.concatenate([
    np.random.normal(7.3, 0.8, int(NUM_RECORDS * 0.75)),
    np.random.normal(5.5, 0.6, int(NUM_RECORDS * 0.25))
]).clip(4.0, 10.0).round(1)
np.random.shuffle(SLEEP_HRS)

# RHR correlates somewhat with fitness level (lower is generally better)
RHR_BASE = 62 + (WEIGHT_KG - 75) * 0.15  # Heavier people tend to have higher RHR
RHR_BPM = (RHR_BASE + np.random.normal(0, 6, size=NUM_RECORDS)).clip(45, 95).round(0)

# Soreness distribution: skewed toward low soreness (most people don't report high soreness)
SORENESS = np.random.choice([1, 2, 3, 4, 5], size=NUM_RECORDS, p=[0.35, 0.30, 0.20, 0.10, 0.05])

# Mental stress: slightly elevated (realistic for modern population)
MENTAL_STRESS = np.random.choice([1, 2, 3, 4, 5], size=NUM_RECORDS, p=[0.20, 0.30, 0.30, 0.15, 0.05])

# Calories: correlate with weight and gender (heavier people eat more)
CALORIES = (2100 + (WEIGHT_KG - 75) * 8 + (SEX == 'M') * 200 + 
            np.random.normal(0, 300, size=NUM_RECORDS)).clip(1200, 4500).round(0)

# Protein: correlate with fitness goal (assume people who work out eat more protein)
PROTEIN_G = (90 + WEIGHT_KG * 0.8 + np.random.normal(0, 25, size=NUM_RECORDS)).clip(40, 250).round(0)

# Carbs: correlate with activity (higher carbs for high carb days)
CARBS_G = (200 + CALORIES * 0.35 / 4 + np.random.normal(0, 50, size=NUM_RECORDS)).clip(80, 550).round(0)

# Nutrition confidence: distribution reflecting real world (high confidence most common)
NUTR_CONF_SCORE = np.random.choice([1.0, 0.75, 0.5], size=NUM_RECORDS, p=[0.70, 0.20, 0.10])

# --- 2. Assemble DataFrame ---
df = pd.DataFrame({
    'Age': AGE, 'Sex': SEX, 'Weight_kg': WEIGHT_KG,
    'SLEEP_HRS': SLEEP_HRS, 'RHR_BPM': RHR_BPM, 'SORENESS': SORENESS,
    'MENTAL_STRESS': MENTAL_STRESS, 'CALORIES_IN': CALORIES,
    'PROTEIN_G': PROTEIN_G, 'CARBS_G': CARBS_G, 'NUTR_CONF_SCORE': NUTR_CONF_SCORE
})

print(f"‚úÖ Generated {NUM_RECORDS} records with realistic feature correlations")
print(f"   Mean Age: {AGE.mean():.1f} | Mean Weight: {WEIGHT_KG.mean():.1f}kg | Mean Sleep: {SLEEP_HRS.mean():.1f}h")

# --- 3. ADVANCED Rule-Based Labeling with Intelligent Parameter Weighting ---
base_duration = 45.0
base_intensity_rpe = 6.0

# --- IMPROVED Modifiers with Graduated Weighting ---

# Sleep modifier: Non-linear response (bigger penalty for really poor sleep)
sleep_modifier = np.ones(NUM_RECORDS)
sleep_modifier[df['SLEEP_HRS'] < 4.5] = 0.5      # Severe sleep deprivation
sleep_modifier[(df['SLEEP_HRS'] >= 4.5) & (df['SLEEP_HRS'] < 5.5)] = 0.65
sleep_modifier[(df['SLEEP_HRS'] >= 5.5) & (df['SLEEP_HRS'] < 6.5)] = 0.85
sleep_modifier[(df['SLEEP_HRS'] >= 7.0) & (df['SLEEP_HRS'] < 8.0)] = 1.05
sleep_modifier[df['SLEEP_HRS'] >= 8.0] = 1.15   # Well-rested

# Soreness modifier: Progressive recovery (5 is near-complete rest)
soreness_modifier = np.where(df['SORENESS'] == 5, 0.35,
                    np.where(df['SORENESS'] == 4, 0.55,
                    np.where(df['SORENESS'] == 3, 0.80,
                    np.where(df['SORENESS'] == 2, 0.95, 1.0))))  # 1 = full go

# RHR-based readiness: Elevated RHR suggests incomplete recovery or illness
rhr_baseline = 62
rhr_modifier = 1.0 - (df['RHR_BPM'] - rhr_baseline) * 0.015  # Each 5 bpm above baseline = ~7.5% reduction
rhr_modifier = np.clip(rhr_modifier, 0.7, 1.15)

# Carbs modifier: More nuanced - mid-range carbs are optimal
carbs_modifier = np.ones(NUM_RECORDS)
carbs_modifier[df['CARBS_G'] < 120] = 0.75       # Very low carbs = weak performance
carbs_modifier[(df['CARBS_G'] >= 120) & (df['CARBS_G'] < 180)] = 0.90
carbs_modifier[(df['CARBS_G'] >= 180) & (df['CARBS_G'] < 350)] = 1.0   # Optimal range
carbs_modifier[(df['CARBS_G'] >= 350) & (df['CARBS_G'] < 450)] = 0.95  # Slightly too much
carbs_modifier[df['CARBS_G'] >= 450] = 0.85     # Way too many carbs = lethargy

# Protein modifier: More protein = more ready for strength
protein_modifier = np.clip(df['PROTEIN_G'] / 120, 0.8, 1.15)

# Stress modifier: Nuanced - some stress is good (eustress), too much is bad
stress_modifier = np.ones(NUM_RECORDS)
stress_modifier[df['MENTAL_STRESS'] == 1] = 1.0   # Low stress = good
stress_modifier[df['MENTAL_STRESS'] == 2] = 1.05  # Slightly elevated = some motivation
stress_modifier[df['MENTAL_STRESS'] == 3] = 0.95  # Moderate stress = slightly reduced capacity
stress_modifier[df['MENTAL_STRESS'] == 4] = 0.75  # High stress = significant reduction
stress_modifier[df['MENTAL_STRESS'] == 5] = 0.5   # Severe stress = rest recommended

# Gender modifier: Males typically have higher baseline intensity/duration
gender_modifier = np.where(df['Sex'] == 'M', 1.10, 1.0)

# Age modifier: Graduated, not linear
age_modifier = np.ones(NUM_RECORDS)
age_modifier[df['Age'] < 25] = 1.15   # Young athletes peak
age_modifier[(df['Age'] >= 25) & (df['Age'] < 35)] = 1.05
age_modifier[(df['Age'] >= 35) & (df['Age'] < 45)] = 1.0
age_modifier[(df['Age'] >= 45) & (df['Age'] < 55)] = 0.90
age_modifier[(df['Age'] >= 55) & (df['Age'] < 65)] = 0.75
age_modifier[df['Age'] >= 65] = 0.60

# Weight-based modifier: Heavier people may fatigue faster but have strength advantage
weight_modifier = 1.0 + ((df['Weight_kg'] - 75) * 0.003)  # Reduced sensitivity
weight_modifier = np.clip(weight_modifier, 0.85, 1.15)

# Nutrition confidence: Low confidence = conservative estimates
confidence_modifier = df['NUTR_CONF_SCORE']  # Ranges 0.5-1.0
confidence_modifier = np.clip(confidence_modifier, 0.75, 1.0)  # Don't penalize too heavily

print(f"‚úÖ Parameter modifiers calculated with advanced weighting")

# *** GOAL PREDICTION with Enhanced Weighting ***
# More sophisticated factor scoring based on real exercise science

strength_factors = (
    (df['SLEEP_HRS'] >= 7.5) * 3.0 +           # Sleep crucial for strength gains
    (df['SORENESS'] <= 2) * 2.5 +               # Low soreness = good recovery
    (df['PROTEIN_G'] >= 130) * 2.0 +            # High protein for muscle building
    (df['CARBS_G'] >= 220) * 1.5 +              # Adequate carbs for workout fuel
    (df['MENTAL_STRESS'] <= 2) * 1.5 +          # Low stress = better focus
    (df['Age'] <= 35) * 1.0 +                   # Peak athletic age
    (df['RHR_BPM'] < 65) * 1.0                  # Lower RHR = good recovery
)

endurance_factors = (
    (df['CARBS_G'] >= 300) * 3.5 +              # Carbs are king for endurance
    (df['SLEEP_HRS'] >= 6.5) * 2.0 +            # Good sleep for aerobic adaptation
    (df['SORENESS'] <= 3) * 1.5 +               # Manageable soreness
    (df['Age'] <= 50) * 1.5 +                   # Endurance peaks earlier than strength
    (df['PROTEIN_G'] >= 100) * 1.0              # Some protein for recovery
)

maintenance_factors = (
    (df['SORENESS'] >= 4) * 4.0 +               # HIGH WEIGHT: high soreness needs recovery
    (df['SLEEP_HRS'] < 6.0) * 3.5 +             # Poor sleep = recovery day
    (df['MENTAL_STRESS'] >= 4) * 3.0 +          # High stress = lighter activity
    (df['Age'] >= 45) * 2.0 +                   # Age-appropriate maintenance
    (df['RHR_BPM'] > 75) * 2.0 +                # Elevated RHR = not recovered
    (df['CARBS_G'] < 150) * 1.5                 # Low carbs = low fuel
)

# Yoga factors with high priority for mental recovery
yoga_factors = (
    (df['MENTAL_STRESS'] >= 4) * 6.0 +          # Mental health first
    (df['SORENESS'] >= 4) * 3.0 +               # Also great for physical recovery
    (df['RHR_BPM'] > 75) * 2.5 +                # High RHR = stressed/unrecovered
    (df['SLEEP_HRS'] < 6.5) * 2.0 +             # Poor sleep pairs with yoga
    (df['Age'] >= 50) * 1.0                     # Good for older athletes
)

# Determine goal based on highest weighted score
df['GOAL'] = 'Maintenance'  # Default

# Vectorized goal assignment for efficiency
strength_score = strength_factors.values
endurance_score = endurance_factors.values
maintenance_score = maintenance_factors.values
yoga_score = yoga_factors.values

goal_matrix = np.column_stack([strength_score, endurance_score, maintenance_score, yoga_score])
goal_labels = ['Strength', 'Endurance', 'Maintenance', 'Yoga']
df['GOAL'] = [goal_labels[np.argmax(row)] for row in goal_matrix]

print(f"\n‚úÖ Goal Distribution (with advanced weighting):")
print(df['GOAL'].value_counts())
print(f"   {df['GOAL'].value_counts().to_dict()}")

# Goal-specific modifiers with better differentiation
goal_duration_modifier = np.select(
    [df['GOAL'] == 'Endurance', df['GOAL'] == 'Strength', df['GOAL'] == 'Yoga', df['GOAL'] == 'Maintenance'],
    [1.20, 1.05, 0.65, 0.75],  # Endurance: longer, Yoga: very short
    default=1.0
)

goal_intensity_modifier = np.select(
    [df['GOAL'] == 'Endurance', df['GOAL'] == 'Strength', df['GOAL'] == 'Yoga', df['GOAL'] == 'Maintenance'],
    [0.85, 1.15, 0.35, 0.70],  # Strength: highest intensity, Yoga: lowest
    default=1.0
)

# Calculate final predictions with ALL modifiers
# Duration and intensity are both affected by readiness factors
combined_readiness = (sleep_modifier * soreness_modifier * rhr_modifier * 
                      stress_modifier * confidence_modifier)

df['Optimal_Intensity_RPE'] = (base_intensity_rpe * 
                                combined_readiness *
                                gender_modifier * 
                                age_modifier * 
                                protein_modifier *
                                carbs_modifier *
                                goal_intensity_modifier * 
                                weight_modifier)

df['Optimal_Duration_Min'] = (base_duration * 
                              sleep_modifier * 
                              soreness_modifier * 
                              rhr_modifier *
                              stress_modifier * 
                              gender_modifier * 
                              age_modifier * 
                              goal_duration_modifier * 
                              weight_modifier)

# Clamp outputs to valid ranges
df['Optimal_Duration_Min'] = df['Optimal_Duration_Min'].clip(12, 120).round(0)
df['Optimal_Intensity_RPE'] = df['Optimal_Intensity_RPE'].clip(1.5, 9.5).round(1)

print(f"\n‚úÖ Step 1 Complete: Enhanced data with intelligent parameter weighting generated.")
print(f"   Duration range: {df['Optimal_Duration_Min'].min():.0f}-{df['Optimal_Duration_Min'].max():.0f} min")
print(f"   Intensity range: {df['Optimal_Intensity_RPE'].min():.1f}-{df['Optimal_Intensity_RPE'].max():.1f} RPE")

# --- 4. Feature Engineering (Preprocessing) with StandardScaler ---
print("\nSTEP 2: Preprocessing features (Encoding + Scaling)...")

TARGETS = ['Optimal_Duration_Min', 'Optimal_Intensity_RPE', 'GOAL']
X = df.drop(columns=TARGETS)
Y = df[['Optimal_Duration_Min', 'Optimal_Intensity_RPE']]
Y_goal = df['GOAL']

categorical_features = ['Sex']
numerical_features = X.select_dtypes(include=np.number).columns.tolist()

# Enhanced preprocessor with StandardScaler for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('num', StandardScaler(), numerical_features)  # Added scaling for better model performance
    ],
    remainder='drop'
)

X_processed = preprocessor.fit_transform(X)
Y_values = Y.values

# IMPROVED: Use StratifiedShuffleSplit to ensure balanced goal distribution in train/test
from sklearn.preprocessing import LabelEncoder
goal_encoder = LabelEncoder()
Y_goal_encoded = goal_encoder.fit_transform(Y_goal)

# Stratified split ensures each set has similar goal distribution
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(X_processed, Y_goal_encoded))

X_train = X_processed[train_idx]
X_test = X_processed[test_idx]
Y_train = Y_values[train_idx]
Y_test = Y_values[test_idx]
Y_goal_train = Y_goal_encoded[train_idx]
Y_goal_test = Y_goal_encoded[test_idx]

print(f"‚úÖ Step 2 Complete: Train/Test split with stratification (train: {len(train_idx)}, test: {len(test_idx)})")

# --- 5. Train Goal Classifier with Cross-Validation ---
print("\nSTEP 3a: Training Goal Classifier with Cross-Validation...")

from sklearn.ensemble import RandomForestClassifier

# RandomForest often generalizes better than XGBoost on synthetic data
goal_classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'  # Handle imbalanced classes
)

goal_classifier.fit(X_train, Y_goal_train)
goal_pred_train = goal_classifier.predict(X_train)
goal_pred_test = goal_classifier.predict(X_test)
goal_accuracy_train = (goal_pred_train == Y_goal_train).mean()
goal_accuracy_test = (goal_pred_test == Y_goal_test).mean()

# Cross-validation score
cv_scores = cross_val_score(goal_classifier, X_train, Y_goal_train, cv=5, scoring='accuracy')

print(f"‚úÖ Goal Classifier Training Accuracy: {goal_accuracy_train:.4f}")
print(f"‚úÖ Goal Classifier Test Accuracy: {goal_accuracy_test:.4f}")
print(f"‚úÖ Goal Classifier CV Scores: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
print(f"‚úÖ Goal Classes: {goal_encoder.classes_}")

# --- 6. Train Duration/Intensity Regressor ---
print("\nSTEP 3b: Starting Hyperparameter Tuning for Duration/Intensity (GridSearchCV)...")
print("This may take a few minutes...")

start_time = time.time()

# Use RandomForestRegressor for better generalization on synthetic data
# XGBoost can overfit on synthetic data
base_rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    max_features='sqrt'  # Reduce feature space per split for stability
)

multi_model = MultiOutputRegressor(base_rf)

# Focused parameter grid for RandomForest with improved ranges
param_grid = {
    'estimator__n_estimators': [180, 220],
    'estimator__max_depth': [12, 16],
    'estimator__min_samples_split': [4, 6],
    'estimator__min_samples_leaf': [2, 3]
}

grid_search = GridSearchCV(
    estimator=multi_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, Y_train)

end_time = time.time()
print(f"\n‚úÖ Step 3b Complete: Tuning finished in {end_time - start_time:.2f} seconds.")
print(f"‚úÖ Best Hyperparameters: {grid_search.best_params_}")

# --- 7. Evaluate the BEST Model with per-output metrics ---
print("\nSTEP 4: Evaluating final tuned model on test set...")

best_mvva_model = grid_search.best_estimator_
Y_pred = best_mvva_model.predict(X_test)

# Per-output metrics for better debugging
rmse_duration = np.sqrt(mean_squared_error(Y_test[:, 0], Y_pred[:, 0]))
rmse_intensity = np.sqrt(mean_squared_error(Y_test[:, 1], Y_pred[:, 1]))
mae_duration = mean_absolute_error(Y_test[:, 0], Y_pred[:, 0])
mae_intensity = mean_absolute_error(Y_test[:, 1], Y_pred[:, 1])
r2_duration = r2_score(Y_test[:, 0], Y_pred[:, 0])
r2_intensity = r2_score(Y_test[:, 1], Y_pred[:, 1])

rmse_overall = np.sqrt(mean_squared_error(Y_test, Y_pred))
mae_overall = mean_absolute_error(Y_test, Y_pred)
r2_overall = r2_score(Y_test, Y_pred)

# Cross-validation scores for regression
cv_scores_reg = cross_val_score(grid_search.best_estimator_, X_train, Y_train, cv=5, scoring='r2')

print("\n--- Final Model Performance (on Test Set) ---")
print(f"Overall RMSE: {rmse_overall:.3f} | Overall MAE: {mae_overall:.3f} | Overall R2: {r2_overall:.3f}")
print(f"\nDuration (min):")
print(f"  RMSE: {rmse_duration:.2f} | MAE: {mae_duration:.2f} | R2: {r2_duration:.3f}")
print(f"\nIntensity (RPE):")
print(f"  RMSE: {rmse_intensity:.2f} | MAE: {mae_intensity:.2f} | R2: {r2_intensity:.3f}")
print(f"\nRegression CV Scores: {cv_scores_reg.mean():.4f} (+/- {cv_scores_reg.std():.4f})")

if r2_overall > 0.78:
    print("\n‚úÖ EXCELLENT: Model shows very good generalization! R2 > 0.78 indicates robust performance.")
elif r2_overall > 0.75:
    print("\n‚úÖ GOOD: Model shows good generalization! R2 > 0.75 indicates solid performance.")
else:
    print(f"\n‚ö†Ô∏è WARNING: R2 = {r2_overall:.3f}. Model may need more data or different features.")

# --- 8. Feature Importance Analysis (for transparency) ---
print("\nSTEP 5: Analyzing Feature Importance...")
feature_names = []
for name, transformer, columns in preprocessor.transformers_:
    if name == 'cat':
        feature_names.extend([f"{col}_encoded" for col in columns])
    else:
        feature_names.extend(columns)

# Get feature importances from the best RandomForest estimator
if hasattr(best_mvva_model.estimators_[0], 'feature_importances_'):
    importances = best_mvva_model.estimators_[0].feature_importances_
    top_7_idx = np.argsort(importances)[-7:][::-1]
    print("\nTop 7 Most Important Features:")
    for i, idx in enumerate(top_7_idx):
        print(f"  {i+1}. {feature_names[idx]}: {importances[idx]:.4f}")

# --- 9. Save ALL Artifacts ---
print("\nSTEP 6: Saving final model, preprocessor, classifier, and encoder...")

MODEL_FILENAME_V2 = 'mvva_model_v2.joblib'
PREPROCESSOR_FILENAME_V2 = 'mvva_preprocessor_v2.joblib'
GOAL_CLASSIFIER_FILENAME_V2 = 'mvva_goal_classifier_v2.joblib'
GOAL_ENCODER_FILENAME_V2 = 'mvva_goal_encoder_v2.joblib'

joblib.dump(best_mvva_model, MODEL_FILENAME_V2)
joblib.dump(preprocessor, PREPROCESSOR_FILENAME_V2)
joblib.dump(goal_classifier, GOAL_CLASSIFIER_FILENAME_V2)
joblib.dump(goal_encoder, GOAL_ENCODER_FILENAME_V2)

print(f"\n‚úÖ‚úÖ‚úÖ ENHANCED PIPELINE COMPLETE ‚úÖ‚úÖ‚úÖ")
print(f"‚úÖ Model with advanced weighting trained successfully!")
print(f"‚úÖ 20,000 records with realistic feature correlations generated")
print(f"‚úÖ Goal classes: {list(goal_encoder.classes_)}")
print(f"‚úÖ All artifacts saved and ready for production")
print(f"\nFiles saved:")
print(f"  ‚Ä¢ {MODEL_FILENAME_V2}")
print(f"  ‚Ä¢ {PREPROCESSOR_FILENAME_V2}")
print(f"  ‚Ä¢ {GOAL_CLASSIFIER_FILENAME_V2}")
print(f"  ‚Ä¢ {GOAL_ENCODER_FILENAME_V2}")


--- Iteration 4: ENHANCED Data Generation with Advanced Parameter Weighting ---
STEP 1: Generating Enhanced Synthetic Data with Intelligent Weighting...
‚úÖ Generated 20000 records with realistic feature correlations
   Mean Age: 35.9 | Mean Weight: 77.8kg | Mean Sleep: 6.9h
‚úÖ Parameter modifiers calculated with advanced weighting

‚úÖ Goal Distribution (with advanced weighting):
GOAL
Endurance      9461
Strength       8351
Yoga           1453
Maintenance     735
Name: count, dtype: int64
   {'Endurance': 9461, 'Strength': 8351, 'Yoga': 1453, 'Maintenance': 735}

‚úÖ Step 1 Complete: Enhanced data with intelligent parameter weighting generated.
   Duration range: 12-85 min
   Intensity range: 1.5-9.5 RPE

STEP 2: Preprocessing features (Encoding + Scaling)...
‚úÖ Step 2 Complete: Train/Test split with stratification (train: 16000, test: 4000)

STEP 3a: Training Goal Classifier with Cross-Validation...

‚úÖ Step 1 Complete: Enhanced data with intelligent parameter weighting generated.

In [2]:

# --- BONUS STEP: CONTINUOUS LEARNING FROM USER FEEDBACK ---
# Load user feedback if available and incorporate into training data

print("\n" + "="*80)
print("OPTIONAL STEP: Loading User Feedback for Continuous Learning...")
print("="*80)

feedback_file = 'feedback_history.csv'

if os.path.exists(feedback_file):
    print(f"\n‚úÖ Found feedback file: {feedback_file}")
    feedback_df = pd.read_csv(feedback_file)
    print(f"‚úÖ Loaded {len(feedback_df)} feedback records from users")
    
    # Convert feedback to synthetic training data
    # If user completed more than 90%, they could have done more
    # If user completed 50-70%, difficulty was right
    # If user completed <50%, they should have done less
    
    synthetic_feedback_data = []
    
    for idx, row in feedback_df.iterrows():
        # Extract original input features
        feedback_record = {
            'Age': row['age'],
            'Sex': row['sex'],
            'Weight_kg': row['weight_kg'],
            'SLEEP_HRS': row['sleep_hrs'],
            'RHR_BPM': row['rhr_bpm'],
            'SORENESS': row['soreness_before'],
            'MENTAL_STRESS': row['mental_stress'],
            'CALORIES_IN': row['calories_in'],
            'PROTEIN_G': row['protein_g'],
            'CARBS_G': row['carbs_g'],
            'NUTR_CONF_SCORE': 0.9,  # User-provided feedback is high confidence
            'GOAL': row['predicted_goal']
        }
        
        # Adjust targets based on user satisfaction
        completion_pct = row['workout_completion_pct']
        actual_intensity = row['actual_intensity']
        recovery_feeling = row['recovery_feeling']
        
        # Calculate adjustment factors based on feedback
        # If user completed <50%, they were overestimated - reduce duration/intensity
        # If user completed >90%, they could do more - increase duration/intensity
        # If 50-80%, it was about right
        
        if completion_pct >= 90 and recovery_feeling >= 4:
            # Could have done more - increase both
            duration_adjustment = 1.10
            intensity_adjustment = 1.08
        elif completion_pct >= 80 and completion_pct < 90:
            # Pretty good but could push slightly more
            duration_adjustment = 1.05
            intensity_adjustment = 1.03
        elif completion_pct >= 70 and completion_pct < 80:
            # About right
            duration_adjustment = 1.0
            intensity_adjustment = 1.0
        elif completion_pct >= 50 and completion_pct < 70:
            # Slightly overestimated
            duration_adjustment = 0.95
            intensity_adjustment = 0.95
        else:  # < 50%
            # Significantly overestimated
            duration_adjustment = 0.85
            intensity_adjustment = 0.85
        
        # Apply recovery feeling adjustment
        if recovery_feeling == 1:  # Exhausted
            duration_adjustment *= 0.90
            intensity_adjustment *= 0.90
        elif recovery_feeling == 5:  # Fully refreshed
            duration_adjustment *= 1.05
            intensity_adjustment *= 1.05
        
        # Set targets with adjustments
        feedback_record['Optimal_Duration_Min'] = row['recommended_duration'] * duration_adjustment
        feedback_record['Optimal_Intensity_RPE'] = row['recommended_intensity'] * intensity_adjustment
        
        synthetic_feedback_data.append(feedback_record)
    
    # Create feedback dataframe
    feedback_synthetic_df = pd.DataFrame(synthetic_feedback_data)
    
    print(f"\n‚úÖ Converted {len(feedback_synthetic_df)} feedback records to training data")
    print(f"   Adjusted durations: {feedback_synthetic_df['Optimal_Duration_Min'].mean():.1f} min (avg)")
    print(f"   Adjusted intensities: {feedback_synthetic_df['Optimal_Intensity_RPE'].mean():.1f} RPE (avg)")
    
    # Combine with synthetic data (feedback gets higher weight)
    # Use 70% synthetic, 30% feedback to avoid overfitting to feedback
    num_synthetic = len(df)
    num_feedback = len(feedback_synthetic_df)
    
    # Weight feedback data to give it more importance
    feedback_weighted = pd.concat([feedback_synthetic_df] * 2, ignore_index=True)  # Double weight
    
    # Combine datasets
    df_combined = pd.concat([df, feedback_weighted], ignore_index=True)
    
    print(f"\n‚úÖ Combined datasets:")
    print(f"   Original synthetic: {num_synthetic} records")
    print(f"   User feedback (weighted): {len(feedback_weighted)} records")
    print(f"   Total training data: {len(df_combined)} records")
    
    # Use combined data for subsequent training
    df = df_combined
    print(f"\n‚úÖ Using combined data with user feedback for model training!")
    print(f"   This makes the model adapt to user preferences and actual performance")
    
else:
    print(f"\n‚ö†Ô∏è No feedback file found. Using only synthetic data.")
    print(f"   Feedback will be available after first few users submit post-workout feedback.")

print("\n" + "="*80 + "\n")



OPTIONAL STEP: Loading User Feedback for Continuous Learning...

‚úÖ Found feedback file: feedback_history.csv
‚úÖ Loaded 4 feedback records from users

‚úÖ Converted 4 feedback records to training data
   Adjusted durations: 56.0 min (avg)
   Adjusted intensities: 4.9 RPE (avg)

‚úÖ Combined datasets:
   Original synthetic: 20000 records
   User feedback (weighted): 8 records
   Total training data: 20008 records

‚úÖ Using combined data with user feedback for model training!
   This makes the model adapt to user preferences and actual performance


‚úÖ Loaded 4 feedback records from users

‚úÖ Converted 4 feedback records to training data
   Adjusted durations: 56.0 min (avg)
   Adjusted intensities: 4.9 RPE (avg)

‚úÖ Combined datasets:
   Original synthetic: 20000 records
   User feedback (weighted): 8 records
   Total training data: 20008 records

‚úÖ Using combined data with user feedback for model training!
   This makes the model adapt to user preferences and actual performanc

In [3]:
# --- FEEDBACK ANALYTICS & MODEL ADAPTATION ANALYSIS ---
print("\n" + "="*80)
print("FEEDBACK ANALYTICS: How is the model adapting to user preferences?")
print("="*80)

if os.path.exists('feedback_history.csv'):
    feedback_df = pd.read_csv('feedback_history.csv')
    
    print(f"\nüìä FEEDBACK ANALYSIS ({len(feedback_df)} submissions):")
    
    # Completion patterns
    avg_completion = feedback_df['workout_completion_pct'].mean()
    completion_std = feedback_df['workout_completion_pct'].std()
    print(f"\n1. WORKOUT COMPLETION:")
    print(f"   Average: {avg_completion:.1f}% (¬±{completion_std:.1f}%)")
    
    high_completion = (feedback_df['workout_completion_pct'] >= 90).sum()
    low_completion = (feedback_df['workout_completion_pct'] < 50).sum()
    print(f"   High achievers (‚â•90%): {high_completion} users")
    print(f"   Struggled (<50%): {low_completion} users")
    
    # Intensity accuracy
    actual_vs_recommended = feedback_df['actual_intensity'] / feedback_df['recommended_intensity']
    print(f"\n2. INTENSITY ACCURACY:")
    print(f"   Average Actual/Recommended: {actual_vs_recommended.mean():.2f}x")
    print(f"   Target range: 0.90-1.10 (user can do 90-110% of recommendation)")
    
    if actual_vs_recommended.mean() < 0.90:
        print(f"   ‚ö†Ô∏è Model is overestimating intensity - recommend DECREASING")
    elif actual_vs_recommended.mean() > 1.10:
        print(f"   ‚úÖ Model is underestimating - could push harder")
    else:
        print(f"   ‚úÖ Intensity estimates are well-calibrated")
    
    # Recovery patterns
    avg_recovery = feedback_df['recovery_feeling'].mean()
    print(f"\n3. POST-WORKOUT RECOVERY:")
    print(f"   Average recovery feeling: {avg_recovery:.1f}/5")
    
    exhausted = (feedback_df['recovery_feeling'] == 1).sum()
    refreshed = (feedback_df['recovery_feeling'] == 5).sum()
    print(f"   Exhausted (1/5): {exhausted} users")
    print(f"   Refreshed (5/5): {refreshed} users")
    
    if avg_recovery < 2.5:
        print(f"   ‚ö†Ô∏è Users are too exhausted - recommend REDUCING duration/intensity")
    elif avg_recovery > 4:
        print(f"   ‚úÖ Good balance - users can handle current recommendations")
    
    # Satisfaction patterns
    very_satisfied = (feedback_df['would_repeat'] == 'Yes, definitely! ‚úÖ').sum()
    not_satisfied = (feedback_df['would_repeat'].isin(['Maybe ü§î', 'No üëé'])).sum()
    
    print(f"\n4. SATISFACTION & REPEATABILITY:")
    print(f"   Would repeat workout: {very_satisfied} users said 'definitely yes'")
    print(f"   Uncertain/No: {not_satisfied} users")
    
    # Feature-based patterns (if enough data)
    if len(feedback_df) >= 10:
        print(f"\n5. PATTERNS BY USER CHARACTERISTICS:")
        
        # By age groups
        young = feedback_df[feedback_df['age'] < 35]['workout_completion_pct'].mean()
        old = feedback_df[feedback_df['age'] >= 45]['workout_completion_pct'].mean()
        print(f"   Younger users (<35): {young:.1f}% completion")
        print(f"   Older users (‚â•45): {old:.1f}% completion")
        
        # By stress levels
        low_stress = feedback_df[feedback_df['mental_stress'] <= 2]['workout_completion_pct'].mean()
        high_stress = feedback_df[feedback_df['mental_stress'] >= 4]['workout_completion_pct'].mean()
        print(f"   Low stress users: {low_stress:.1f}% completion")
        print(f"   High stress users: {high_stress:.1f}% completion")
        
        # By sleep quality
        good_sleep = feedback_df[feedback_df['sleep_hrs'] >= 7]['workout_completion_pct'].mean()
        poor_sleep = feedback_df[feedback_df['sleep_hrs'] < 6]['workout_completion_pct'].mean()
        print(f"   Good sleep (‚â•7h): {good_sleep:.1f}% completion")
        print(f"   Poor sleep (<6h): {poor_sleep:.1f}% completion")
    
    print(f"\nüí° MODEL ADAPTATION INSIGHTS:")
    print(f"   ‚úÖ The combined model learns from {len(feedback_df)} real user workouts")
    print(f"   ‚úÖ Next retrain will be SMARTER based on actual performance data")
    print(f"   ‚úÖ Recommendations are optimized for YOUR audience's capabilities")
    
else:
    print("\n‚ö†Ô∏è No feedback data yet. Waiting for first users to complete workouts!")
    print("   This analytics will be available after ~5-10 user feedback submissions")

print("\n" + "="*80)



FEEDBACK ANALYTICS: How is the model adapting to user preferences?

üìä FEEDBACK ANALYSIS (4 submissions):

1. WORKOUT COMPLETION:
   Average: 100.0% (¬±0.0%)
   High achievers (‚â•90%): 4 users
   Struggled (<50%): 0 users

2. INTENSITY ACCURACY:
   Average Actual/Recommended: 0.88x
   Target range: 0.90-1.10 (user can do 90-110% of recommendation)
   ‚ö†Ô∏è Model is overestimating intensity - recommend DECREASING

3. POST-WORKOUT RECOVERY:
   Average recovery feeling: 3.5/5
   Exhausted (1/5): 0 users
   Refreshed (5/5): 0 users

4. SATISFACTION & REPEATABILITY:
   Would repeat workout: 1 users said 'definitely yes'
   Uncertain/No: 0 users

üí° MODEL ADAPTATION INSIGHTS:
   ‚úÖ The combined model learns from 4 real user workouts
   ‚úÖ Next retrain will be SMARTER based on actual performance data
   ‚úÖ Recommendations are optimized for YOUR audience's capabilities

