In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

print("Loading data...")
# Read the train and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv('/kaggle/input/train.csv'),
    pd.read_csv('/kaggle/input/test.csv'),
)

# Report (rows, columns) for each split.
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Class balance of the raw target. dropna=False keeps rows with a missing
# label visible as their own bucket instead of silently omitting them.
print(f"\nTarget distribution:")
print(train_data['age_group'].value_counts(dropna=False))


In [None]:
def clean_data(df, is_train=True):
    """
    Return a cleaned copy of ``df``; the caller's frame is left untouched.

    When ``is_train`` is true, rows whose ``age_group`` target is missing are
    removed and the row count is printed before and after the removal.
    The ``SEQN`` identifier column is dropped whenever it is present.
    """
    frame = df.copy()

    if is_train:
        # Rows without a label cannot be used for supervised training.
        print(f"Rows before cleaning: {len(frame)}")
        frame = frame[frame['age_group'].notna()]
        print(f"Rows after cleaning: {len(frame)}")

    # errors='ignore' makes the drop a no-op when the identifier is absent.
    return frame.drop(columns=['SEQN'], errors='ignore')

# Clean both splits. Only the training split has a target column, so only it
# is filtered for missing labels.
train_clean, test_clean = (
    clean_data(train_data, is_train=True),
    clean_data(test_data, is_train=False),
)

# Class balance after unlabeled rows have been removed.
print(f"Final target distribution:")
print(train_clean['age_group'].value_counts())


In [None]:
def simple_feature_engineering(df, fill_values=None):
    """
    Impute missing values and add simple binary health indicators.

    Works on a copy of ``df``; the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. ``BMXBMI``, ``LBXGLU``, ``DIQ010``, ``RIAGENDR`` and
        ``PAQ605`` must be present because the derived features index them
        directly (a missing one raises ``KeyError``). ``LBXGLT`` and
        ``LBXIN`` are imputed only when present.
    fill_values : dict, optional
        Mapping ``column -> value`` used to fill missing entries in place of
        statistics computed from ``df`` itself. Pass the values learned on
        the training split when transforming validation or test data so that
        every split is imputed identically. Columns absent from the mapping
        fall back to ``df``'s own median (numerical) or mode (categorical).
        With the default ``None`` every column uses ``df``'s own statistics.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns imputed and the extra columns
        ``BMI_high``, ``Glucose_high``, ``Diabetes_yes``, ``Male``,
        ``Active`` and ``Metabolic_risk`` appended.
    """
    df = df.copy()
    # Copy so the caller's mapping is never mutated or aliased.
    overrides = dict(fill_values) if fill_values else {}

    numerical_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
    categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']

    # Numerical columns: caller-supplied value if given, else this frame's median.
    for col in numerical_cols:
        if col not in df.columns:
            continue
        fill = overrides[col] if col in overrides else df[col].median()
        df[col] = df[col].fillna(fill)

    # Categorical columns: caller-supplied value if given, else the most
    # frequent value in this frame.
    for col in categorical_cols:
        if col not in df.columns:
            continue
        if col in overrides:
            fill = overrides[col]
        else:
            modes = df[col].mode()  # empty when the column holds no values at all
            # NOTE(review): 2.0 is the pre-existing fallback code for an
            # all-missing column; confirm its meaning against the codebook.
            fill = modes.iloc[0] if len(modes) > 0 else 2.0
        df[col] = df[col].fillna(fill)

    # Simple binary features that actually matter for age prediction
    df['BMI_high'] = (df['BMXBMI'] > 30).astype(int)  # Obesity threshold
    df['Glucose_high'] = (df['LBXGLU'] > 100).astype(int)  # Pre-diabetes threshold
    df['Diabetes_yes'] = (df['DIQ010'] == 1).astype(int)  # Has diabetes
    df['Male'] = (df['RIAGENDR'] == 1).astype(int)  # Gender encoding
    df['Active'] = (df['PAQ605'] == 1).astype(int)  # Physical activity

    # Count of metabolic risk flags raised, in the range 0..3.
    df['Metabolic_risk'] = df['BMI_high'] + df['Glucose_high'] + df['Diabetes_yes']

    return df

# Build the engineered feature set for each split.
train_processed, test_processed = (
    simple_feature_engineering(train_clean),
    simple_feature_engineering(test_clean),
)

print("Feature engineering completed!")
print(f"Final training shape: {train_processed.shape}")


In [None]:
# Prepare target variable - CRITICAL STEP
print("Preparing target variable...")

# Encode the string labels as integers: Adult=0, Senior=1.
# Any label outside this mapping becomes NaN.
y = train_processed['age_group'].map({'Adult': 0, 'Senior': 1})
X = train_processed.drop('age_group', axis=1)

print(f"Target mapping check:")
print(f"Missing targets after mapping: {y.isnull().sum()}")
print(f"Target distribution: {y.value_counts()}")

# Fail fast with a clear message instead of letting NaN targets reach the
# stratified split / model fit, where the resulting error is hard to trace.
unmapped_mask = y.isnull()
if unmapped_mask.any():
    unexpected_labels = train_processed.loc[unmapped_mask, 'age_group'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} training rows have an age_group outside "
        f"the expected labels 'Adult'/'Senior': {unexpected_labels}"
    )

# Hold out 20% for validation; stratify keeps the class ratio equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Validation shapes: X={X_val.shape}, y={y_val.shape}")

# Use simpler, more robust model
print("Training Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,          # Limited depth to prevent overfitting
    min_samples_split=10,  # Higher to prevent overfitting
    min_samples_leaf=5,    # Higher to prevent overfitting
    random_state=42,       # Fixed seed for reproducible results
    class_weight='balanced'  # Handle class imbalance
)

# Fit model
rf_model.fit(X_train, y_train)

# Validate: hard labels for the report, class-1 (Senior) probabilities for ROC-AUC.
val_pred = rf_model.predict(X_val)
val_pred_proba = rf_model.predict_proba(X_val)[:, 1]

val_auc = roc_auc_score(y_val, val_pred_proba)
print(f"Validation ROC-AUC: {val_auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, val_pred, target_names=['Adult', 'Senior']))


In [None]:
# Rank features by the importances reported by the first forest. The names
# come from X_train, the frame rf_model was fitted on, so names and
# importances are guaranteed to line up.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Feature Importances:")
print(feature_importance.head(10))

# Use only top features for final model
top_features = feature_importance.head(8)['feature'].tolist()
print(f"\nUsing top features: {top_features}")

# Restrict both splits to the selected columns.
X_train_top = X_train[top_features]
X_val_top = X_val[top_features]

# Build an unfitted forest with exactly the hyperparameters of rf_model.
# Reading them from the estimator, rather than repeating the literal values,
# keeps the two configurations from drifting apart when one is tuned.
rf_final = RandomForestClassifier(**rf_model.get_params())

rf_final.fit(X_train_top, y_train)

# Final validation on the same hold-out rows, using the reduced feature set.
val_pred_final = rf_final.predict(X_val_top)
val_pred_proba_final = rf_final.predict_proba(X_val_top)[:, 1]
val_auc_final = roc_auc_score(y_val, val_pred_proba_final)

print(f"Final Validation ROC-AUC: {val_auc_final:.4f}")


In [None]:
print("Generating final predictions...")

# Keep only the columns the final model was fitted on, in the same order.
test_final = test_processed.loc[:, top_features]

print(f"Test data shape: {test_final.shape}")
print(f"Missing values in test: {test_final.isnull().sum().sum()}")

# Predicted class codes from the final model.
final_predictions = rf_final.predict(test_final)

print(f"Prediction distribution:")
print(pd.Series(final_predictions).value_counts())

# Single-column submission, written without the index.
submission = pd.DataFrame({'age_group': final_predictions})
submission.to_csv('submission.csv', index=False)

print("\nSubmission saved!")
print(f"Submission shape: {submission.shape}")
print("First 10 predictions:")
print(submission.head(10))
