In [35]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [36]:
train = pd.read_csv("/kaggle/input/playground-series-s6e2/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s6e2/test.csv")

In [37]:
X = train.drop(columns=["id", "heart_disease"])
y = train["heart_disease"]

X_test = test.drop(columns=["id"])

KeyError: "['heart_disease'] not found in axis"

In [None]:
print(train.columns)

In [None]:
print("TRAIN COLUMNS:", train.columns.tolist())
print("TEST COLUMNS :", test.columns.tolist())
print("TRAIN SHAPE  :", train.shape)
print("TEST SHAPE   :", test.shape)

In [None]:
# find possible id column
id_candidates = [c for c in train.columns if c.lower() == "id"]
print("ID candidates:", id_candidates)

# common target name guesses
possible_targets = [c for c in train.columns if c.lower() in ["target", "label", "heart_disease", "heartdisease", "class"]]
print("Possible target columns:", possible_targets)

In [None]:
print(train.columns.tolist())

In [None]:
print("Last 10 columns:", train.columns.tolist()[-10:])

In [38]:
train.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [39]:
train_cols = set(train.columns)
test_cols = set(test.columns)

diff = sorted(list(train_cols - test_cols))
print("Columns in train but NOT in test (likely target):", diff)

Columns in train but NOT in test (likely target): ['Heart Disease']


In [40]:
# Correct target and id
target_col = "Heart Disease"
id_col = "id"

X = train.drop(columns=[id_col, target_col])
y = train[target_col]

X_test = test.drop(columns=[id_col])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_test shape:", X_test.shape)

X shape: (630000, 13)
y shape: (630000,)
X_test shape: (270000, 13)


In [42]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import numpy as np

skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

In [43]:
from lightgbm import LGBMClassifier

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / skf.n_splits
    
    print(f"Fold {fold+1} done")

[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
Fold 1 done
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[L

In [48]:
score = log_loss(y, oof_preds)
print("LightGBM CV Log Loss:", score)

LightGBM CV Log Loss: 0.2691498702539154


In [50]:
# Create submission DataFrame
submission = pd.DataFrame({
    "id": test["id"],
    "Heart Disease": test_preds
})

# Save to CSV
submission.to_csv("submission.csv", index=False)

# Preview
submission.head()

Unnamed: 0,id,Heart Disease
0,630000,0.947124
1,630001,0.009249
2,630002,0.984883
3,630003,0.006553
4,630004,0.181009
