In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier  # <--- The library Andrew Ng explicitly recommends
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
# Labelled training table and unlabelled test table, both expected in the
# current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the test ids aside: they are dropped from the feature matrix below
# but are needed again as the 'id' column of the submission file.
test_ids = test['id']

# Convert to plain NumPy arrays. 'target' is the label; every remaining
# column other than 'id' is treated as a feature.
y_raw = train['target'].values
X_raw = train.drop(['id', 'target'], axis=1).values
X_test_values = test.drop(['id'], axis=1).values

# ---------------------------------------------------------
# 2. FEATURE SCALING (Optional for Trees, but good practice)
# ---------------------------------------------------------
# The scaler is fitted on the training features only; the very same fitted
# transform is then applied to both sets, so the test data never
# influences the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_raw)
X_train_scaled = scaler.transform(X_raw)
X_test_scaled = scaler.transform(X_test_values)

# ---------------------------------------------------------
# 3. TRAIN XGBOOST (The Course Recommendation)
# ---------------------------------------------------------
print("Training XGBoost...")

# Class imbalance is handled with 'scale_pos_weight' rather than by
# physically duplicating the positive rows (e.g. via pd.concat): each
# "1" (Blue Pill) example is weighted as if it appeared 6 times, which
# avoids building a larger training table.
#
# Hyper-parameters:
#   max_depth=3          shallow trees, to limit overfitting
#   scale_pos_weight=6   stands in for the earlier "x6" row-copy strategy
#   eval_metric          set explicitly; per the original note this
#                        silences a warning message
#
# The estimator is built and fitted in one expression; this relies on
# fit() returning the estimator itself (scikit-learn convention).
model = XGBClassifier(
    max_depth=3,
    n_estimators=100,
    learning_rate=0.1,
    scale_pos_weight=6,
    eval_metric='logloss',
    random_state=42,
).fit(X_train_scaled, y_raw)

# ---------------------------------------------------------
# 4. PREDICT & COUNT ONES
# ---------------------------------------------------------
predictions = model.predict(X_test_scaled)

# --- SNIPPET TO COUNT ONES ---
# Summing the predicted labels counts the positives (this assumes the
# predictions are 0/1 class labels).
num_ones = np.sum(predictions)

# One print call, one argument per output line.
print(
    "----------------------------------------",
    f"Total test examples: {len(predictions)}",
    f"Total '1's predicted: {num_ones}",
    f"Percentage of '1's: {(num_ones / len(predictions)) * 100:.2f}%",
    "----------------------------------------",
    sep="\n",
)

# A submission file is only written when at least one positive was
# predicted; an all-zero prediction just produces a warning.
if num_ones != 0:
    print("✅ SUCCESS: The model found positive cases!")
    submission = pd.DataFrame({'id': test_ids, 'target': predictions})
    submission.to_csv('submission_xgboost.csv', index=False)
    print("Saved 'submission_xgboost.csv'.")
else:
    print("⚠️ WARNING: Model predicted all Zeros.")

Training XGBoost...
----------------------------------------
Total test examples: 2000
Total '1's predicted: 226
Percentage of '1's: 11.30%
----------------------------------------
✅ SUCCESS: The model found positive cases!
Saved 'submission_xgboost.csv'.
