In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m115.9/115.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m296.8/296.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.28.9 xgboost-3.1.2


In [None]:
# 0. Install missing library immediately
!pip install --upgrade xgboost

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# 1. Load Data
print("Loading data...")
# Only train is filtered: rows without a target value cannot be used for fitting.
# test.csv is read as-is so that every test row receives a prediction.
train = pd.read_csv('train.csv').dropna(subset=['diagnosed_diabetes'])
test = pd.read_csv('test.csv')

print(f"Test set loaded with {len(test)} rows. Preparing for submission...")

# Store the (now NaN-free) target as integer class labels
train = train.astype({'diagnosed_diabetes': int})

# 2. Advanced Feature Engineering
def apply_fe(df):
    """Append ratio, difference and count features to ``df`` and return it.

    The frame is modified in place (new columns are added to the object that
    was passed in) and that same object is returned.

    Columns read: triglycerides, hdl_cholesterol, ldl_cholesterol,
    cholesterol_total, systolic_bp, diastolic_bp, bmi, age,
    family_history_diabetes, hypertension_history, cardiovascular_history.

    Columns written: tg_hdl_ratio, remnant_cholesterol, pulse_pressure,
    bmi_age (all float32) and total_comorbidities (int32).
    """
    # Continuous features are stored as float32 to keep the frame small
    to_f32 = lambda series: series.astype('float32')
    hdl = df['hdl_cholesterol']

    # 1e-6 in the denominator guards against division by zero
    df['tg_hdl_ratio'] = to_f32(df['triglycerides'].div(hdl.add(1e-6)))
    df['remnant_cholesterol'] = to_f32(
        df['cholesterol_total'].sub(hdl.add(df['ldl_cholesterol']))
    )
    df['pulse_pressure'] = to_f32(df['systolic_bp'].sub(df['diastolic_bp']))
    df['bmi_age'] = to_f32(df['bmi'].mul(df['age']))

    # Row-wise sum of the three history flags
    history_flags = df[
        ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
    ]
    df['total_comorbidities'] = history_flags.sum(axis=1).astype('int32')
    return df

train, test = apply_fe(train), apply_fe(test)

# 3. Handle Categorical Columns
# Every object-typed column of train gets two encodings:
#   * <col>_freq : relative frequency of the value in TRAIN (0 if never seen there)
#   * <col>      : integer label from an encoder fitted on train + test values
cat_cols = train.select_dtypes(include=['object']).columns
for col in cat_cols:
    # Frequency encoding (the frequencies always come from train)
    freq = train[col].value_counts(normalize=True)
    for frame in (train, test):
        frame[col + '_freq'] = frame[col].map(freq).fillna(0).astype('float32')

    # Label encoding; values are stringified first, so NaN is encoded as 'nan'
    combined = pd.concat([train[col], test[col]]).astype(str)
    le = LabelEncoder().fit(combined)
    for frame in (train, test):
        frame[col] = le.transform(frame[col].astype(str))

# 4. Final Data Prep
# Features exclude the row id and the target; test ids are kept for the submission
y = train['diagnosed_diabetes']
X = train.drop(columns=['id', 'diagnosed_diabetes'])
test_ids = test['id']
X_test = test.drop(columns=['id'])

# 5. Competition-Grade Cross-Validation
# Stratified K-fold: each fold trains one XGBoost model, stores its predictions
# for the held-out training rows (out-of-fold) and adds an equal share of its
# test predictions to the running average.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = skf.get_n_splits()         # single source of truth for the averaging divisor
test_preds = np.zeros(len(X_test))   # mean test probability across folds
oof_preds = np.zeros(len(X))         # out-of-fold probability for every training row

print(f"Training on {len(X)} rows, Predicting on {len(X_test)} rows...")

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    xt, xv = X.iloc[t_idx], X.iloc[v_idx]
    yt, yv = y.iloc[t_idx], y.iloc[v_idx]

    # NOTE: the target needs no cleaning here. Rows with a missing target were
    # dropped and the column was cast to int when the data was loaded.
    model = xgb.XGBClassifier(
        tree_method='hist',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.6,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=100,   # early stopping is configured here, not in fit()
    )

    # The held-out fold is the evaluation set that drives early stopping
    model.fit(xt, yt, eval_set=[(xv, yv)], verbose=100)

    # Accumulate predictions
    oof_preds[v_idx] = model.predict_proba(xv)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds
    print(f"Fold {fold+1} complete.")

# Score the out-of-fold predictions. Each fold was also used for early stopping
# of the model that predicts it, so treat this figure as an optimistic estimate.
print(f"OOF ROC AUC: {roc_auc_score(y, oof_preds):.5f}")

# 6. Final Submission
# One row per test id, paired positionally with the averaged test probability
submission = test_ids.to_frame('id').assign(diagnosed_diabetes=test_preds)

# Final check: report the row count before writing the CSV (without the index)
print(f"Final submission row count: {len(submission)}")
submission.to_csv('submission_final.csv', index=False)
print("File 'submission_final.csv' is ready!")

Loading data...
Test set loaded with 300000 rows. Preparing for submission...
Training on 700000 rows, Predicting on 300000 rows...
[0]	validation_0-auc:0.64289
[100]	validation_0-auc:0.71892
[200]	validation_0-auc:0.72311
[300]	validation_0-auc:0.72449
[400]	validation_0-auc:0.72532
[500]	validation_0-auc:0.72562
[600]	validation_0-auc:0.72585
[700]	validation_0-auc:0.72583
[717]	validation_0-auc:0.72589
Fold 1 complete.
[0]	validation_0-auc:0.64204
[100]	validation_0-auc:0.71661
[200]	validation_0-auc:0.72077
[300]	validation_0-auc:0.72217
[400]	validation_0-auc:0.72326
[500]	validation_0-auc:0.72378
[600]	validation_0-auc:0.72431
[700]	validation_0-auc:0.72447
[800]	validation_0-auc:0.72458
[900]	validation_0-auc:0.72463
[994]	validation_0-auc:0.72452
Fold 2 complete.
[0]	validation_0-auc:0.64168
[100]	validation_0-auc:0.71737
[200]	validation_0-auc:0.72147
[300]	validation_0-auc:0.72293
[400]	validation_0-auc:0.72394
[500]	validation_0-auc:0.72452
[600]	validation_0-auc:0.72484
[70

In [None]:
# 1. Fix the environment errors first
!pip install --upgrade xgboost

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

# --- 1. Load Data ---
# Unlabeled training rows are dropped; the test file is read untouched.
train = (
    pd.read_csv('train.csv')
    .dropna(subset=['diagnosed_diabetes'])
)
test = pd.read_csv('test.csv')
# Target stored as integer class labels
train = train.assign(diagnosed_diabetes=train['diagnosed_diabetes'].astype(int))

# --- 2. Advanced Feature Engineering (The Signal) ---
def advanced_fe(df):
    """Add derived metabolic and blood-pressure features to ``df`` in place.

    Columns read: age, bmi, cholesterol_total, hdl_cholesterol, triglycerides,
    systolic_bp, diastolic_bp, waist_to_hip_ratio.

    Columns written (all float32):
      * age_bmi  - age * bmi
      * non_hdl  - cholesterol_total - hdl_cholesterol
      * aip      - log10(triglycerides / hdl_cholesterol)
      * map      - (systolic_bp + 2 * diastolic_bp) / 3
      * bmi_wthr - bmi * waist_to_hip_ratio

    Returns the same DataFrame object that was passed in.
    """
    # Metabolic & Arterial signals
    df['age_bmi'] = (df['age'] * df['bmi']).astype('float32')
    df['non_hdl'] = (df['cholesterol_total'] - df['hdl_cholesterol']).astype('float32')

    # 1e-6 keeps the division finite when hdl_cholesterol is 0
    tg_hdl = df['triglycerides'] / (df['hdl_cholesterol'] + 1e-6)
    # log10 is only defined for positive ratios. A ratio of 0 would yield -inf,
    # so non-positive ratios are masked to NaN (a missing value) before the log.
    df['aip'] = np.log10(tg_hdl.where(tg_hdl > 0)).astype('float32')
    df['map'] = ((df['systolic_bp'] + 2 * df['diastolic_bp']) / 3).astype('float32')

    # Ratios
    df['bmi_wthr'] = (df['bmi'] * df['waist_to_hip_ratio']).astype('float32')
    return df

train, test = advanced_fe(train), advanced_fe(test)

# --- 3. Encoding ---
# Object columns are stringified and ordinal-encoded. The encoder is fitted on
# train only; a test value that was not seen in train is encoded as -1.
cat_cols = train.select_dtypes(include=['object']).columns
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
oe.fit(train[cat_cols].astype(str))
for frame in (train, test):
    frame[cat_cols] = oe.transform(frame[cat_cols].astype(str))

# Feature matrices exclude the row id (and, for train, the target)
y = train['diagnosed_diabetes']
X = train.drop(columns=['id', 'diagnosed_diabetes'])
X_test = test.drop(columns=['id'])

# --- 4. The Blending Strategy ---
# Stratified K-fold with two models per fold. Each fold's 50/50 blend of the two
# test predictions contributes an equal share to the final averaged prediction.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = skf.get_n_splits()         # divisor follows n_splits instead of a literal
final_preds = np.zeros(len(X_test))  # one averaged probability per test row

print("üöÄ Starting Blending: Model 1 (XGBoost) + Model 2 (HistGBM)...")

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    xt, xv = X.iloc[t_idx], X.iloc[v_idx]
    yt, yv = y.iloc[t_idx], y.iloc[v_idx]

    # Model 1: XGBoost. Early stopping is configured on the constructor and is
    # driven by the held-out fold passed as eval_set.
    m1 = xgb.XGBClassifier(
        tree_method='hist',
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=7,
        early_stopping_rounds=50,
        eval_metric='auc',
        objective='binary:logistic',
        random_state=42 + fold
    )
    m1.fit(xt, yt, eval_set=[(xv, yv)], verbose=False)

    # Model 2: HistGradientBoosting (ships with scikit-learn). It is fitted on
    # the training part only; the held-out fold (xv, yv) is not passed to it.
    m2 = HistGradientBoostingClassifier(
        max_iter=1000,
        learning_rate=0.02,
        max_depth=8,
        l2_regularization=2.0,
        early_stopping=True,
        random_state=42 + fold
    )
    m2.fit(xt, yt)

    # Predict Probabilities (positive class)
    p1 = m1.predict_proba(X_test)[:, 1]
    p2 = m2.predict_proba(X_test)[:, 1]

    # Blend: 50% XGBoost + 50% HistGBM, then add this fold's share of the mean
    fold_preds = (p1 + p2) / 2
    final_preds += fold_preds / n_folds

    print(f"‚úÖ Fold {fold+1} complete.")

# --- 5. Final Submission (Guaranteed 300,000 rows) ---
# Start from the test ids and attach the averaged blend as the prediction column
submission = test[['id']].assign(diagnosed_diabetes=final_preds)

print(f"Success! Exporting {len(submission)} rows.")
submission.to_csv('submission_blend.csv', index=False)

üöÄ Starting Blending: Model 1 (XGBoost) + Model 2 (HistGBM)...
‚úÖ Fold 1 complete.
‚úÖ Fold 2 complete.
‚úÖ Fold 3 complete.
‚úÖ Fold 4 complete.
‚úÖ Fold 5 complete.
Success! Exporting 300000 rows.


In [None]:
# 1. Environment Setup
!pip install --upgrade xgboost

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

# --- 2. Load and Clean ---
# Keep only labeled training rows; the test file is read without filtering.
train = pd.read_csv('train.csv')
train = train.dropna(subset=['diagnosed_diabetes'])
test = pd.read_csv('test.csv')
# Target stored as integer class labels
train = train.astype({'diagnosed_diabetes': int})

# --- 3. Clinical Feature Engineering (The Final Boost) ---
def final_fe(df):
    """Add clinical interaction features to ``df`` in place and return it.

    Columns read: triglycerides, bmi, hdl_cholesterol, cholesterol_total,
    ldl_cholesterol, systolic_bp, diastolic_bp, age, waist_to_hip_ratio,
    family_history_diabetes, hypertension_history, cardiovascular_history.

    Columns written: metabolic_index, hdl_to_total_ratio, remnant_chol,
    bp_product, age_waist_interaction (float32) and comorbidity_count
    (dtype left as produced by the row-wise sum).
    """
    f32 = 'float32'

    # Metabolic proxy used in place of a glucose-based index:
    # ln(1 + triglycerides * bmi)
    tg_times_bmi = df['triglycerides'].mul(df['bmi'])
    df['metabolic_index'] = np.log1p(tg_times_bmi).astype(f32)

    # Heart/arterial risk; 1e-6 guards the division against a zero denominator
    df['hdl_to_total_ratio'] = (
        df['hdl_cholesterol'].div(df['cholesterol_total'].add(1e-6)).astype(f32)
    )
    df['remnant_chol'] = (
        df['cholesterol_total']
        .sub(df['hdl_cholesterol'])
        .sub(df['ldl_cholesterol'])
        .astype(f32)
    )

    # Blood pressure intensity
    df['bp_product'] = df['systolic_bp'].mul(df['diastolic_bp']).astype(f32)

    # Body composition signal
    df['age_waist_interaction'] = df['age'].mul(df['waist_to_hip_ratio']).astype(f32)

    # Row-wise sum of the three history flags
    history_cols = ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history']
    df['comorbidity_count'] = df[history_cols].sum(axis=1)

    return df

train, test = final_fe(train), final_fe(test)

# --- 4. Robust Encoding ---
# Ordinal-encode the stringified object columns. The mapping is learned from
# train; test values that train never contained become -1.
cat_cols = train.select_dtypes(include=['object']).columns
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
oe.fit(train[cat_cols].astype(str))
for frame in (train, test):
    frame[cat_cols] = oe.transform(frame[cat_cols].astype(str))

# Split off the target; the row id is not a feature
y = train['diagnosed_diabetes']
X = train.drop(columns=['id', 'diagnosed_diabetes'])
X_test = test.drop(columns=['id'])

# --- 5. 10-Fold CV & Weighted Ensemble ---
# Each fold trains XGBoost and HistGBM, blends their probabilities with fixed
# weights, adds an equal share of the blended test prediction to the running
# mean, and records the blended ROC AUC on the held-out fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
n_folds = skf.get_n_splits()   # divisor follows n_splits instead of a literal
w_xgb, w_hgb = 0.6, 0.4        # blend weights, shared by the test and validation blends
final_test_preds = np.zeros(len(X_test))
oof_scores = []                # one blended validation AUC per fold

print("üöÄ Running 10-Fold Ultra-Ensemble...")

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    xt, xv = X.iloc[t_idx], X.iloc[v_idx]
    yt, yv = y.iloc[t_idx], y.iloc[v_idx]

    # Model 1: XGBoost, early-stopped on the held-out fold
    m1 = xgb.XGBClassifier(
        tree_method='hist',
        n_estimators=1500,
        learning_rate=0.015, # Slower learning for more precision
        max_depth=8,
        subsample=0.85,
        colsample_bytree=0.5,
        early_stopping_rounds=100,
        eval_metric='auc',
        objective='binary:logistic',
        random_state=fold
    )
    m1.fit(xt, yt, eval_set=[(xv, yv)], verbose=False)

    # Model 2: HistGBM (Stronger Regularization); fitted on the training part only
    m2 = HistGradientBoostingClassifier(
        max_iter=1500,
        learning_rate=0.015,
        max_depth=10,
        l2_regularization=5.0, # High regularization to force generalization
        early_stopping=True,
        random_state=fold
    )
    m2.fit(xt, yt)

    # Positive-class probabilities on the test set
    p1 = m1.predict_proba(X_test)[:, 1]
    p2 = m2.predict_proba(X_test)[:, 1]

    # Weighted blend, then this fold's share of the cross-fold mean
    blend = (p1 * w_xgb) + (p2 * w_hgb)
    final_test_preds += blend / n_folds

    # Track performance with the same weights on the held-out fold.
    # NOTE: this fold also drove m1's early stopping, so the AUC is optimistic.
    val_p1 = m1.predict_proba(xv)[:, 1]
    val_p2 = m2.predict_proba(xv)[:, 1]
    fold_auc = roc_auc_score(yv, (val_p1 * w_xgb) + (val_p2 * w_hgb))
    oof_scores.append(fold_auc)
    print(f"Fold {fold+1} ROC AUC: {fold_auc:.5f}")

print(f"\nüèÜ Estimated Leaderboard Score: {np.mean(oof_scores):.5f}")

# --- 6. Output Final 300,000 Rows ---
# Test ids paired positionally with the averaged blended probability
submission = test[['id']].assign(diagnosed_diabetes=final_test_preds)
submission.to_csv('submission_final_push.csv', index=False)
print("File 'submission_final_push.csv' created successfully.")

üöÄ Running 10-Fold Ultra-Ensemble...
Fold 1 ROC AUC: 0.72561
Fold 2 ROC AUC: 0.72777
Fold 3 ROC AUC: 0.72471
Fold 4 ROC AUC: 0.72464
Fold 5 ROC AUC: 0.72567
Fold 6 ROC AUC: 0.72570
Fold 7 ROC AUC: 0.72456
Fold 8 ROC AUC: 0.72871
Fold 9 ROC AUC: 0.72817
Fold 10 ROC AUC: 0.72478

üèÜ Estimated Leaderboard Score: 0.72603
File 'submission_final_push.csv' created successfully.


In [None]:
# 1. Install necessary libraries
!pip install --upgrade xgboost catboost

import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

# --- 2. Load Data ---
# Unlabeled training rows are dropped; the test file is read without filtering.
train = pd.read_csv('train.csv').dropna(subset=['diagnosed_diabetes'])
test = pd.read_csv('test.csv')

# SAFETY LOCK 1: capture the test ids as a plain array before any processing,
# together with the row count the submission is expected to have
original_test_ids = test['id'].to_numpy()
expected_rows = 300_000

# Target stored as integer class labels
train = train.astype({'diagnosed_diabetes': int})

# --- 3. Feature Engineering ---
def ultimate_fe(df):
    """Impute three lab columns and add five derived features, in place.

    Missing values of triglycerides, hdl_cholesterol and bmi are replaced by
    the median of that column *in the frame passed in*, so calling this
    separately on train and test imputes each with its own median. The
    remaining inputs (waist_to_hip_ratio, systolic_bp, diastolic_bp,
    heart_rate) are treated as 0 where missing, only inside the derived
    features; those source columns are left unchanged. No rows are dropped.

    Columns written (float32): lipid_index, atherogenic_index, bmi_waist_age,
    bp_severity, pulse_pressure. Returns the same DataFrame object.
    """
    # Use fillna here to ensure NO rows are dropped from test
    df['triglycerides'] = df['triglycerides'].fillna(df['triglycerides'].median())
    df['hdl_cholesterol'] = df['hdl_cholesterol'].fillna(df['hdl_cholesterol'].median())
    df['bmi'] = df['bmi'].fillna(df['bmi'].median())

    df['lipid_index'] = (df['triglycerides'] * df['bmi'] / (df['hdl_cholesterol'] + 1e-6)).astype('float32')

    # log10 is only defined for positive ratios. A ratio of 0 would yield -inf,
    # so non-positive ratios are masked to NaN (a missing value) before the log.
    tg_hdl = df['triglycerides'] / (df['hdl_cholesterol'] + 1e-6)
    df['atherogenic_index'] = np.log10(tg_hdl.where(tg_hdl > 0)).astype('float32')

    df['bmi_waist_age'] = (df['bmi'] * df['waist_to_hip_ratio'].fillna(0) * df['age']).astype('float32')
    df['bp_severity'] = (df['systolic_bp'].fillna(0) * df['heart_rate'].fillna(0)).astype('float32')
    df['pulse_pressure'] = (df['systolic_bp'].fillna(0) - df['diastolic_bp'].fillna(0)).astype('float32')
    return df

# Impute the lab columns with TRAIN medians in both frames before feature
# engineering. ultimate_fe fills gaps with the median of whichever frame it is
# given, which would impute train and test with different values; after this
# step those columns contain no NaN, so its own median fill changes nothing.
impute_cols = ['triglycerides', 'hdl_cholesterol', 'bmi']
train_medians = train[impute_cols].median()
train[impute_cols] = train[impute_cols].fillna(train_medians)
test[impute_cols] = test[impute_cols].fillna(train_medians)

train = ultimate_fe(train)
test = ultimate_fe(test)

# --- 4. Encoding ---
# Ordinal-encode the listed categorical columns (stringified first). The encoder
# is fitted on train; test values not seen in train are encoded as -1.
cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[cat_cols] = oe.fit_transform(train[cat_cols].astype(str))
test[cat_cols] = oe.transform(test[cat_cols].astype(str))

X = train.drop(columns=['id', 'diagnosed_diabetes'])
y = train['diagnosed_diabetes']
X_test = test.drop(columns=['id'])

# SAFETY LOCK 2: verify the test row count BEFORE any model is trained. A
# mismatch would otherwise only surface as a shape error when the first fold's
# predictions are added to the buffer.
if len(X_test) != expected_rows:
    raise ValueError(
        f"test.csv has {len(X_test)} rows but {expected_rows} were expected"
    )
final_test_preds = np.zeros(len(X_test))

# --- 5. The Triple-Threat Ensemble ---
# Each fold trains XGBoost, HistGBM and CatBoost, blends their test probabilities
# with fixed weights and adds an equal share of the blend to final_test_preds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
n_folds = skf.get_n_splits()              # divisor follows n_splits instead of a literal
w_xgb, w_hgb, w_cat = 0.40, 0.30, 0.30    # blend weights

print(f"üöÄ Training on {len(X)} rows. Predicting on {expected_rows} rows...")

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    xt, xv = X.iloc[t_idx], X.iloc[v_idx]
    yt, yv = y.iloc[t_idx], y.iloc[v_idx]

    # Model 1: XGBoost, early-stopped on the held-out fold
    m1 = xgb.XGBClassifier(
        tree_method='hist', n_estimators=1000, learning_rate=0.02,
        max_depth=7, subsample=0.8, colsample_bytree=0.6,
        early_stopping_rounds=50, eval_metric='auc', random_state=fold
    )
    m1.fit(xt, yt, eval_set=[(xv, yv)], verbose=False)

    # Model 2: HistGBM, fitted on the training part only
    m2 = HistGradientBoostingClassifier(
        max_iter=1000, learning_rate=0.02, max_depth=10,
        l2_regularization=5.0, early_stopping=True, random_state=fold
    )
    m2.fit(xt, yt)

    # Model 3: CatBoost, early-stopped on the held-out fold
    m3 = CatBoostClassifier(
        iterations=1000, learning_rate=0.03, depth=6,
        l2_leaf_reg=3, eval_metric='AUC', random_state=fold,
        verbose=False, early_stopping_rounds=50
    )
    m3.fit(xt, yt, eval_set=(xv, yv))

    # Positive-class probabilities for every test row
    p1 = m1.predict_proba(X_test)[:, 1]
    p2 = m2.predict_proba(X_test)[:, 1]
    p3 = m3.predict_proba(X_test)[:, 1]

    # Weighted blend, then this fold's share of the cross-fold mean
    blend = (p1 * w_xgb) + (p2 * w_hgb) + (p3 * w_cat)
    final_test_preds += blend / n_folds

    print(f"‚úÖ Fold {fold+1} complete.")

# --- 6. SAFETY LOCK 3: Forced Alignment ---
# The ids come from the array captured right after test.csv was loaded
submission = pd.DataFrame(
    dict(id=original_test_ids, diagnosed_diabetes=final_test_preds)
)

# Final row count check: the CSV is written only when the count matches
print(f"\nFinal row count: {len(submission)}")
if len(submission) != expected_rows:
    print(f"‚ùå Error: Submission has {len(submission)} rows. Check test data loading.")
else:
    submission.to_csv('submission_ultra_ensemble.csv', index=False)
    print("üèÜ Submission saved with exactly 300,000 rows!")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
üöÄ Training on 300124 rows. Predicting on 300000 rows...
‚úÖ Fold 1 complete.
‚úÖ Fold 2 complete.
‚úÖ Fold 3 complete.
‚úÖ Fold 4 complete.
‚úÖ Fold 5 complete.
‚úÖ Fold 6 complete.
‚úÖ Fold 7 complete.
‚úÖ Fold 8 complete.
‚úÖ Fold 9 complete.
‚úÖ Fold 10 complete.

Final row count: 300000
üèÜ Submission saved with exactly 300,000 rows!


In [None]:
# Rank blending for the final 0.70+ push: models are combined through the RANK
# of each prediction instead of its raw probability value.

from scipy.stats import rankdata

# p1, p2, p3 are assumed to hold the probabilities from XGB, HistGBM and CatBoost.
# NOTE(review): this cell does not define them; it relies on whatever an earlier
# cell left in the kernel - confirm that is the intended set of predictions.
# Plain averaging would be: blend = (p1 + p2 + p3) / 3
p1_rank, p2_rank, p3_rank = (rankdata(p) for p in (p1, p2, p3))

# Divide the summed ranks by (number of models * number of rows) to scale them
# into the 0..1 range
final_blend = (p1_rank + p2_rank + p3_rank) / (3 * len(p1))

# Cell output: number of blended predictions
len(final_blend)

300000

In [1]:
!pip install --upgrade xgboost catboost
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scipy.stats import rankdata

# --- 1. Load Data ---
# Keep only labeled training rows; the test file is read without filtering.
train = pd.read_csv('train.csv')
train = train.dropna(subset=['diagnosed_diabetes'])
test = pd.read_csv('test.csv')
# Target stored as integer class labels
train = train.assign(diagnosed_diabetes=train['diagnosed_diabetes'].astype(int))

# --- 2. Advanced Clinical Feature Engineering ---
def expert_fe(df):
    """Add threshold flags, clinical ratios and interaction terms to ``df``.

    The frame is modified in place and the same object is returned.

    Columns read: systolic_bp, diastolic_bp, bmi, triglycerides,
    hdl_cholesterol, cholesterol_total, ldl_cholesterol, age.

    Columns written: high_bp, high_bmi, high_tg, metabolic_score,
    tg_hdl_ratio, remnant_chol, age_bmi_interaction, bp_age_interaction.
    """
    # Metabolic syndrome proxy: three 0/1 flags and their sum.
    # high_bp is set when systolic > 130 OR diastolic > 80.
    elevated_bp = df['systolic_bp'].gt(130) | df['diastolic_bp'].gt(80)
    df['high_bp'] = elevated_bp.astype(int)
    df['high_bmi'] = df['bmi'].gt(30).astype(int)
    df['high_tg'] = df['triglycerides'].gt(150).astype(int)
    df['metabolic_score'] = df['high_bp'].add(df['high_bmi']).add(df['high_tg'])

    # Clinical ratios; 1e-6 guards the division against a zero denominator
    df['tg_hdl_ratio'] = df['triglycerides'].div(df['hdl_cholesterol'].add(1e-6))
    df['remnant_chol'] = (
        df['cholesterol_total'].sub(df['hdl_cholesterol']).sub(df['ldl_cholesterol'])
    )

    # Pairwise products with age, divided by 100
    df['age_bmi_interaction'] = df['age'].mul(df['bmi']).div(100)
    df['bp_age_interaction'] = df['systolic_bp'].mul(df['age']).div(100)

    return df

train = expert_fe(train)
test = expert_fe(test)

# --- 3. Encoding ---
# Integer-encode every object column with ONE category list per column, taken
# from train and applied to both frames. Deriving the categories separately for
# each frame would number them independently, so the same value could receive
# different codes in train and test whenever the two sets of values differ.
# Missing values, and test values that never occur in train, get code -1.
cat_cols = train.select_dtypes(include=['object']).columns
for col in cat_cols:
    shared_dtype = pd.CategoricalDtype(train[col].astype('category').cat.categories)
    train[col] = train[col].astype(shared_dtype).cat.codes
    test[col] = test[col].astype(shared_dtype).cat.codes

# Feature matrices exclude the row id (and, for train, the target)
X = train.drop(columns=['id', 'diagnosed_diabetes'])
y = train['diagnosed_diabetes']
X_test = test.drop(columns=['id'])

# --- 4. 10-Fold Rank-Based Ensemble ---
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-model accumulators. Each fold adds the RANK of its test predictions (not
# the probability itself), because AUC depends only on the ordering.
xgb_preds, cat_preds, hgb_preds = (
    np.zeros(len(X_test)),
    np.zeros(len(X_test)),
    np.zeros(len(X_test)),
)

# Fold-independent hyper-parameters; the fold number is supplied as the seed
xgb_params = dict(tree_method='hist', n_estimators=1500, learning_rate=0.01,
                  max_depth=8, subsample=0.8, colsample_bytree=0.6,
                  early_stopping_rounds=50, eval_metric='auc')
cat_params = dict(iterations=1500, learning_rate=0.02, depth=7,
                  verbose=False, early_stopping_rounds=50, eval_metric='AUC')
hgb_params = dict(max_iter=1500, learning_rate=0.015, max_depth=10,
                  l2_regularization=5.0)

print("üöÄ Starting 10-Fold Rank Ensemble...")

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    xt, xv = X.iloc[t_idx], X.iloc[v_idx]
    yt, yv = y.iloc[t_idx], y.iloc[v_idx]

    # XGBoost, early-stopped on the held-out fold
    m1 = xgb.XGBClassifier(random_state=fold, **xgb_params)
    m1.fit(xt, yt, eval_set=[(xv, yv)], verbose=False)
    xgb_test_proba = m1.predict_proba(X_test)[:, 1]
    xgb_preds += rankdata(xgb_test_proba)

    # CatBoost, early-stopped on the held-out fold
    m2 = CatBoostClassifier(random_state=fold, **cat_params)
    m2.fit(xt, yt, eval_set=(xv, yv))
    cat_test_proba = m2.predict_proba(X_test)[:, 1]
    cat_preds += rankdata(cat_test_proba)

    # HistGBM, fitted on the training part only
    m3 = HistGradientBoostingClassifier(random_state=fold, **hgb_params)
    m3.fit(xt, yt)
    hgb_test_proba = m3.predict_proba(X_test)[:, 1]
    hgb_preds += rankdata(hgb_test_proba)

    print(f"‚úÖ Fold {fold+1} complete.")

# --- 5. Final Blending & Scaling ---
# Weighted sum of the accumulated rank totals (40% XGBoost, 30% CatBoost,
# 30% HistGBM), then min-max scaled so the smallest value maps to 0 and the
# largest to 1.
final_ranks = 0.4 * xgb_preds + 0.3 * cat_preds + 0.3 * hgb_preds
final_probs = (final_ranks - final_ranks.min()) / np.ptp(final_ranks)

# --- 6. The 300,000 Row Check ---
# Pair the test ids with the scaled scores, report the row count and write the CSV
submission = test[['id']].assign(diagnosed_diabetes=final_probs)
print(f"Final Count: {len(submission)}")
submission.to_csv('submission_rank_ensemble.csv', index=False)

[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [2]:
# Comparison table: one row per model with its validation AUC.
# NOTE(review): auc_logreg, auc_rf and auc_xgb are not assigned in any visible
# cell, and this cell also needs pandas imported as `pd` in the running kernel.
results = pd.DataFrame(
    [
        ("Logistic Regression", auc_logreg),
        ("Random Forest", auc_rf),
        ("XGBoost", auc_xgb),
    ],
    columns=["Model", "Validation AUC"],
)

# Cell output: best model first
results.sort_values(by="Validation AUC", ascending=False)


NameError: name 'pd' is not defined