# In [None]:
"""
Obesity Classification - Extra Trees Only (Optimized for 85%+ Accuracy)
Focused approach with best feature engineering
"""

# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure drawn later in the script
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# ============================================================================
# 2. LOAD DATA
# ============================================================================
# Read the labelled training set and report its size and class balance
train_df = pd.read_csv('train.csv')

print(f"\n{'='*70}")
print(f"Training data shape: {train_df.shape}")
print(f"Target distribution:\n{train_df['Weight_Category'].value_counts()}")
print(f"{'='*70}")

# ============================================================================
# 3. ADVANCED FEATURE ENGINEERING
# ============================================================================
def _bin_with_fallback(values, bins, fallback):
    """Bin a numeric Series into integer bin codes.

    Missing values are first replaced by the Series median. The first bin is
    closed on the left (``include_lowest=True``) so a value equal to the
    lowest edge (e.g. 0 screen hours) lands in bin 0 instead of matching no
    interval. Values that still match no bin receive ``fallback``.
    """
    filled = values.fillna(values.median())
    codes = pd.cut(filled, bins=bins, labels=False, include_lowest=True)
    return codes.fillna(fallback).astype(int)


def create_enhanced_features(df):
    """Create extensive feature set with polynomial and interaction features.

    Works on a copy; the input frame is not modified.

    Required input columns: ``Weight_Kg``, ``Height_cm``, ``Age_Years``,
    ``High_Calorie_Food``, ``Vegetable_Intake``, ``Water_Intake``,
    ``Screen_Time_Hours``, ``Meal_Frequency``, ``Physical_Activity_Level``,
    ``Family_History``, ``Smoking_Habit``, ``Alcohol_Consumption``,
    ``Snack_Frequency``, ``Gender`` and ``Commute_Mode``. The optional columns
    ``Activity_Level_Score`` and ``Family_Risk`` are used when present and
    derived otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.

    Notes
    -----
    Ratios are not guarded against zero denominators, so a zero height or
    weight yields ``inf`` in the derived columns.
    """
    df = df.copy()

    # Core BMI features
    df['BMI'] = df['Weight_Kg'] / ((df['Height_cm'] / 100) ** 2)
    df['BMI_squared'] = df['BMI'] ** 2
    df['BMI_cubed'] = df['BMI'] ** 3
    df['BMI_log'] = np.log1p(df['BMI'])
    df['BMI_sqrt'] = np.sqrt(df['BMI'])

    # Weight-Height ratios
    df['Weight_Height_Ratio'] = df['Weight_Kg'] / df['Height_cm']
    df['Weight_Height_Ratio_Sq'] = df['Weight_Height_Ratio'] ** 2
    df['Height_Weight_Ratio'] = df['Height_cm'] / df['Weight_Kg']

    # Age features
    df['Age_squared'] = df['Age_Years'] ** 2
    df['Age_log'] = np.log1p(df['Age_Years'])
    df['Age_BMI'] = df['Age_Years'] * df['BMI']
    df['Age_Weight'] = df['Age_Years'] * df['Weight_Kg']
    df['Age_Height'] = df['Age_Years'] * df['Height_cm']

    # Age categories as integer bin codes; unmatched ages fall back to bin 1
    df['Age_Group'] = _bin_with_fallback(
        df['Age_Years'], bins=[0, 25, 35, 45, 100], fallback=1
    )

    # Lifestyle binary features (comparisons with NaN evaluate to False -> 0)
    df['High_Cal'] = (df['High_Calorie_Food'] == 'yes').astype(int)
    df['Low_Veg'] = (df['Vegetable_Intake'] < 2).astype(int)
    df['Low_Water'] = (df['Water_Intake'] < 2).astype(int)
    df['High_Screen'] = (df['Screen_Time_Hours'] > 4).astype(int)

    # Enhanced diet score with polynomial terms
    df['Diet_Score'] = (
        df['High_Cal'] * 4 +
        (3 - df['Vegetable_Intake']).clip(0, 3) * 2.5 +
        (3 - df['Water_Intake']).clip(0, 3) * 2 +
        df['Meal_Frequency'] * 0.8
    )
    df['Diet_Score_Sq'] = df['Diet_Score'] ** 2

    # Activity features; labels outside the map (or missing) count as 0
    activity_map = {'low': 0, 'medium': 1, 'high': 2}
    df['Activity_Num'] = df['Physical_Activity_Level'].map(activity_map).fillna(0)

    if 'Activity_Level_Score' not in df.columns:
        df['Activity_Level_Score'] = df['Activity_Num'].astype(float)

    df['Activity_Score_Sq'] = df['Activity_Level_Score'] ** 2
    # 0, 1 or 2: one point for heavy screen use, one for an activity score < 1
    df['Sedentary'] = (df['High_Screen'] + (df['Activity_Level_Score'] < 1).astype(int))

    # Risk factors with better encoding
    df['Family_Hist'] = (df['Family_History'] == 'yes').astype(int)
    df['Smoking'] = (df['Smoking_Habit'] == 'yes').astype(int)

    alcohol_map = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    df['Alcohol'] = df['Alcohol_Consumption'].map(alcohol_map).fillna(0)

    # Unmapped snack labels default to 1 ('Sometimes'), unlike alcohol (0)
    snack_map = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    df['Snack'] = df['Snack_Frequency'].map(snack_map).fillna(1)

    if 'Family_Risk' not in df.columns:
        df['Family_Risk'] = df['Family_Hist']

    # Enhanced risk score
    df['Total_Risk'] = (
        df['Family_Hist'] * 5 +
        df['Family_Risk'] * 4 +
        df['Smoking'] * 3.5 +
        df['Alcohol'] * 2.5 +
        df['Snack'] * 2 +
        df['Diet_Score'] * 0.5 +
        df['Sedentary'] * 3 -
        df['Activity_Num'] * 2.5 -
        df['Activity_Level_Score'] * 1.5
    )
    df['Total_Risk_Sq'] = df['Total_Risk'] ** 2

    # Gender
    df['Gender_Male'] = (df['Gender'] == 'Male').astype(int)

    # Transport
    df['Active_Transport'] = df['Commute_Mode'].isin(['Walking', 'Bike']).astype(int)

    # Critical interactions
    df['BMI_Risk'] = df['BMI'] * df['Total_Risk']
    df['BMI_Age'] = df['BMI'] * df['Age_Years']
    df['BMI_Diet'] = df['BMI'] * df['Diet_Score']
    df['BMI_Activity'] = df['BMI'] * df['Activity_Level_Score']
    df['Diet_Activity_Gap'] = df['Diet_Score'] - df['Activity_Level_Score']
    # +1 keeps the denominator non-zero for an activity score of 0
    df['Risk_Activity_Ratio'] = df['Total_Risk'] / (df['Activity_Level_Score'] + 1)

    # Advanced interactions
    df['Age_Risk'] = df['Age_Years'] * df['Total_Risk']
    df['Gender_BMI'] = df['Gender_Male'] * df['BMI']
    df['Family_BMI'] = df['Family_Hist'] * df['BMI']
    df['Screen_Activity'] = df['Screen_Time_Hours'] * (3 - df['Activity_Level_Score'])

    # Lifestyle balance
    df['Health_Score'] = (
        df['Vegetable_Intake'] * 2 +
        df['Water_Intake'] * 1.5 +
        df['Activity_Level_Score'] * 3 -
        df['High_Cal'] * 3 -
        df['Screen_Time_Hours'] * 0.5 -
        df['Snack'] * 1.5
    )
    df['Lifestyle_Balance'] = df['Health_Score'] - df['Diet_Score']

    # Binning important continuous features
    df['BMI_Category'] = _bin_with_fallback(
        df['BMI'], bins=[0, 18.5, 25, 30, 35, 100], fallback=2
    )
    df['Screen_Category'] = _bin_with_fallback(
        df['Screen_Time_Hours'], bins=[0, 2, 4, 6, 24], fallback=1
    )

    return df

# Swap the raw training frame for its feature-engineered version
train_df = create_enhanced_features(df=train_df)
print(f"✓ Enhanced features created: {train_df.shape[1]} columns")

# ============================================================================
# 4. PREPROCESSING
# ============================================================================
# Category orderings and fill medians learned by the most recent training
# call. Inference frames reuse them so train and test share one encoding.
_PREPARE_STATE = {}


def prepare_data(df, is_train=True, state=None):
    """Split a feature frame into model inputs ``X`` and target ``y``.

    Text/categorical columns are turned into integer codes (categories sorted
    alphabetically after ``astype(str)``), missing values are filled with
    column medians and infinities are replaced by 0.

    When ``is_train`` is true the category orderings and medians are learned
    from ``df`` and remembered. When ``is_train`` is false the remembered
    values are reused, so the same label always maps to the same code and
    gaps are filled with training medians. A category not seen during
    training is encoded as ``-1``. If nothing has been remembered yet, the
    frame's own categories and medians are used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PersonID``; training frames must also contain
        ``Weight_Category``.
    is_train : bool, default True
        Whether ``df`` is the labelled training frame.
    state : dict, optional
        Container for the learned categories/medians. Defaults to a
        module-level dict shared across calls; pass your own dict to keep
        several independent preprocessing states.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the ``Weight_Category`` column for training
        frames and ``None`` otherwise.
    """
    if state is None:
        state = _PREPARE_STATE

    df = df.copy()
    if is_train:
        X = df.drop(['PersonID', 'Weight_Category'], axis=1)
        y = df['Weight_Category']
        # A new training frame invalidates whatever was learned before
        state.clear()
    else:
        X = df.drop(['PersonID'], axis=1)
        y = None

    learned_categories = state.setdefault('categories', {})

    # Encode categoricals
    for col in X.select_dtypes(include=['object', 'category']).columns:
        values = X[col].astype(str)
        if is_train or col not in learned_categories:
            categories = sorted(values.unique())
            if is_train:
                learned_categories[col] = categories
        else:
            categories = learned_categories[col]
        # Labels missing from `categories` get code -1
        codes = pd.Categorical(values, categories=categories).codes
        X[col] = codes.astype('int64')

    own_medians = X.median(numeric_only=True)
    if is_train:
        state['medians'] = own_medians
    medians = state.get('medians', own_medians)

    # Columns unknown to the learned medians fall back to their own median
    X = X.fillna(medians).fillna(own_medians).replace([np.inf, -np.inf], 0)
    return X, y

X, y = prepare_data(train_df, is_train=True)

# Stratified split FIRST, so the scaler below never sees validation rows.
# Fitting it on all rows before splitting would leak validation statistics
# into the features the models are trained and scored on.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Scale features: centre/scale statistics come from the training fold only
scaler = RobustScaler()
scaler.fit(X_train_raw)


def _scale(frame):
    """Return ``frame`` transformed by the fitted scaler, labels preserved."""
    return pd.DataFrame(
        scaler.transform(frame), columns=frame.columns, index=frame.index
    )


X_train = _scale(X_train_raw)
X_val = _scale(X_val_raw)
# Every training row, scaled with the same fitted scaler; used for the final
# refit so that it matches how the test set is transformed later.
X_scaled = _scale(X)

print(f"\nTraining: {len(X_train)}, Validation: {len(X_val)}")

# ============================================================================
# 5. TRAIN EXTRA TREES WITH MULTIPLE CONFIGURATIONS
# ============================================================================
print(f"\n{'='*70}")
print("TRAINING EXTRA TREES WITH DIFFERENT CONFIGURATIONS")
print(f"{'='*70}\n")

# Hyper-parameter names, in the order used by each row of the table below
_param_names = (
    'n_estimators', 'max_depth', 'min_samples_split',
    'min_samples_leaf', 'max_features', 'random_state',
)

# One row per configuration: display name followed by the values above
_config_table = [
    ('Extra Trees - Config 1 (Balanced)', 500, 30, 5, 2, 'sqrt', 42),
    ('Extra Trees - Config 2 (Deep)', 600, 35, 3, 1, 'sqrt', 123),
    ('Extra Trees - Config 3 (Wide)', 700, 28, 4, 2, 'log2', 456),
    ('Extra Trees - Config 4 (Conservative)', 800, 25, 8, 3, 'sqrt', 789),
]

# Expand the table into name/params dicts; every model uses all CPU cores
configs = [
    {'name': label, 'params': {**dict(zip(_param_names, values)), 'n_jobs': -1}}
    for label, *values in _config_table
]

models = []    # (name, fitted estimator) pairs
results = []   # one {'Model', 'Accuracy'} record per configuration

for config in configs:
    print(f"Training: {config['name']}...")

    # Fit on the training fold and score on the held-out validation fold
    et = ExtraTreesClassifier(**config['params'])
    et.fit(X_train, y_train)
    acc = accuracy_score(y_val, et.predict(X_val))

    models.append((config['name'], et))
    results.append({'Model': config['name'], 'Accuracy': acc})

    print(f"  Accuracy: {acc:.4f} ({acc*100:.2f}%)")

# ============================================================================
# 6. DISPLAY RESULTS
# ============================================================================
print(f"\n{'='*70}")
print("EXTRA TREES CONFIGURATIONS COMPARISON")
print(f"{'='*70}")

# Tabulate the validation scores, best configuration first
results_df = pd.DataFrame(results)
results_df['Accuracy %'] = results_df['Accuracy'].apply(lambda x: f"{x*100:.2f}%")
results_df = results_df.sort_values('Accuracy', ascending=False)

print(results_df.to_string(index=False))


def _bar_color(score):
    """Pick the bar colour for an accuracy: >=0.85, >=0.80, or lower."""
    if score >= 0.85:
        return '#27ae60'
    if score >= 0.80:
        return '#2ecc71'
    return '#3498db'


# Visualize
plt.figure(figsize=(12, 6))
colors = [_bar_color(acc) for acc in results_df['Accuracy']]
bars = plt.barh(results_df['Model'], results_df['Accuracy'], color=colors)
plt.xlabel('Accuracy', fontsize=12, fontweight='bold')
plt.title('Extra Trees Configurations - Accuracy Comparison', fontsize=14, fontweight='bold')
plt.xlim(0.70, 1.0)

# Write each score just to the right of the end of its bar
for bar, acc in zip(bars, results_df['Accuracy']):
    plt.text(
        acc + 0.005,
        bar.get_y() + bar.get_height()/2,
        f'{acc:.4f} ({acc*100:.2f}%)',
        va='center',
        fontweight='bold',
    )

# Dashed reference lines for the two accuracy targets
for threshold, line_color, line_label in (
    (0.80, 'orange', '80% Target'),
    (0.85, 'green', '85% Target'),
):
    plt.axvline(x=threshold, color=line_color, linestyle='--', linewidth=2, label=line_label)

plt.legend()
plt.tight_layout()
plt.show()

# ============================================================================
# 7. ENSEMBLE OF EXTRA TREES (Voting)
# ============================================================================
print(f"\n{'='*70}")
print("CREATING ENSEMBLE OF ALL EXTRA TREES MODELS")
print(f"{'='*70}")

# Collect all predictions: one row per model, one column per validation sample
all_preds = [model.predict(X_val) for _, model in models]

# Majority voting: take the most frequent label in each column. When several
# labels tie, mode() lists them sorted and the first one is kept.
ensemble_preds_array = np.vstack(all_preds)
final_ensemble_pred = pd.DataFrame(ensemble_preds_array).mode(axis=0).iloc[0].to_numpy()
ensemble_acc = accuracy_score(y_val, final_ensemble_pred)

print(f"\n✓ Extra Trees Ensemble Accuracy: {ensemble_acc:.4f} ({ensemble_acc*100:.2f}%)")

# Display confusion matrix for best model (results_df is sorted best-first)
best_model_name = results_df.iloc[0]['Model']
best_model = next(m for n, m in models if n == best_model_name)
best_pred = best_model.predict(X_val)

print(f"\n{'='*70}")
print(f"CONFUSION MATRIX - {best_model_name}")
print(f"{'='*70}")

# Pin the matrix rows/columns to the classes the model was trained on and
# reuse the same list for the tick labels. Inferring one from y_val and the
# other from y_val plus predictions can give axes that do not line up.
class_labels = list(best_model.classes_)
cm = confusion_matrix(y_val, best_pred, labels=class_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

print("\nClassification Report:")
print(classification_report(y_val, best_pred))

# ============================================================================
# 8. TRAIN BEST MODELS ON FULL DATA
# ============================================================================
print(f"\n{'='*70}")
print("TRAINING ALL MODELS ON FULL DATA")
print(f"{'='*70}")

# Refit every configuration on all labelled rows; these are the models that
# produce the test-set predictions.
final_models_full = []

for config in configs:
    print(f"Training {config['name']} on full data...")
    full_model = ExtraTreesClassifier(**config['params'])
    final_models_full.append((config['name'], full_model.fit(X_scaled, y)))

print("✓ Training complete!")

# ============================================================================
# 9. GENERATE TEST PREDICTIONS
# ============================================================================
print(f"\n{'='*70}")
print("GENERATING TEST PREDICTIONS")
print(f"{'='*70}")

test_df_raw = pd.read_csv('test.csv')
print(f"Test data shape: {test_df_raw.shape}")

# Same feature engineering and preprocessing path as the training data
test_df = create_enhanced_features(test_df_raw.copy())
X_test, _ = prepare_data(test_df, is_train=False)

# Align columns: add any training column the test frame lacks (as zeros),
# then select the training columns in training order.
for col in X.columns:
    if col not in X_test.columns:
        X_test[col] = 0
X_test = X_test[X.columns]

# Reuse the scaler fitted earlier; it is only applied here, never refitted
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Predict with all models: one row of predictions per model
all_test_preds = [model.predict(X_test_scaled) for _, model in final_models_full]

# Ensemble prediction (majority voting): most frequent label per test row
test_preds_array = np.vstack(all_test_preds)
vote_table = pd.DataFrame(test_preds_array).mode(axis=0)
final_predictions = vote_table.iloc[0].to_numpy()

print(f"\nPrediction distribution:")
print(pd.Series(final_predictions).value_counts().sort_index())

# ============================================================================
# 10. CREATE SUBMISSION
# ============================================================================
# Pair each test PersonID with its ensemble prediction and write the CSV
submission = pd.DataFrame({'PersonID': test_df_raw['PersonID']})
submission['Weight_Category'] = final_predictions
submission.to_csv('submission_extratrees.csv', index=False)

print(f"\n{'='*70}")
print("SUBMISSION FILE CREATED")
print(f"{'='*70}")
print(f"File: submission_extratrees.csv")
print(f"Predictions: {len(submission)}")
print(f"\nFirst 10:")
print(submission.head(10))

# Recap of the validation results gathered earlier in the script
print(f"\n{'='*70}")
print("FINAL SUMMARY")
print(f"{'='*70}")
print(f"Best Single Config: {best_model_name} ({results_df.iloc[0]['Accuracy']*100:.2f}%)")
print(f"Ensemble Accuracy (val): {ensemble_acc*100:.2f}%")
print(f"Number of Models: {len(final_models_full)}")
print(f"{'='*70}")
print("\n✅ Extra Trees submission ready!")