In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/WiDSWorldWide_GlobalDathon26/sample_submission.csv
/kaggle/input/WiDSWorldWide_GlobalDathon26/train.csv
/kaggle/input/WiDSWorldWide_GlobalDathon26/metaData.csv
/kaggle/input/WiDSWorldWide_GlobalDathon26/test.csv


In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from scipy.stats import norm

In [3]:
train = pd.read_csv('/kaggle/input/WiDSWorldWide_GlobalDathon26/train.csv')
test = pd.read_csv('/kaggle/input/WiDSWorldWide_GlobalDathon26/test.csv')

In [4]:
ID_COL = "event_id"
TIME_COL = "time_to_hit_hours"
EVENT_COL = "event"

features = [c for c in train.columns 
            if c not in [ID_COL, TIME_COL, EVENT_COL]]

X = train[features]
X_test = test[features]

# =====================================================
# PART 1 — SURVIVAL MODEL
# =====================================================

aft_params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 2.1,  # slightly wider for calibration
    "learning_rate": 0.03,
    "max_depth": 4,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "min_child_weight": 4,
    "seed": 42
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

surv_preds = []

for tr_idx, val_idx in kf.split(train):

    X_tr = train.iloc[tr_idx][features]
    y_time = train.iloc[tr_idx][TIME_COL].values
    y_event = train.iloc[tr_idx][EVENT_COL].values

    lower = y_time
    upper = y_time.copy()
    upper[y_event == 0] = np.inf

    dtrain = xgb.DMatrix(X_tr)
    dtrain.set_float_info("label_lower_bound", lower)
    dtrain.set_float_info("label_upper_bound", upper)

    dtest = xgb.DMatrix(X_test)

    model = xgb.train(aft_params, dtrain, num_boost_round=700)
    surv_preds.append(model.predict(dtest))

mu = np.mean(surv_preds, axis=0)
sigma = aft_params["aft_loss_distribution_scale"]

def surv_prob(h):
    return norm.cdf((np.log(h) - mu) / sigma)

surv_12 = surv_prob(12)
surv_24 = surv_prob(24)
surv_48 = surv_prob(48)
surv_72 = surv_prob(72)

# =====================================================
# PART 2 — BINARY MODELS
# =====================================================

binary_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "seed": 42
}

def create_binary_target(h):
    return ((train[TIME_COL] <= h) & (train[EVENT_COL] == 1)).astype(int)

def train_binary(h):

    y_bin = create_binary_target(h)
    preds = []

    for tr_idx, val_idx in kf.split(train):

        dtrain = xgb.DMatrix(train.iloc[tr_idx][features],
                             label=y_bin.iloc[tr_idx])
        dtest = xgb.DMatrix(X_test)

        model = xgb.train(binary_params, dtrain, num_boost_round=400)
        preds.append(model.predict(dtest))

    return np.mean(preds, axis=0)

bin_12 = train_binary(12)
bin_24 = train_binary(24)
bin_48 = train_binary(48)
bin_72 = train_binary(72)

# =====================================================
# PART 3 — ADAPTIVE BLENDING (NEW UPGRADE)
# =====================================================

# Stronger survival influence at early horizons
# Stronger binary influence at 48h (metric focus)

prob_12 = 0.5 * bin_12 + 0.5 * surv_12
prob_24 = 0.55 * bin_24 + 0.45 * surv_24
prob_48 = 0.70 * bin_48 + 0.30 * surv_48  # emphasize calibration
prob_72 = 0.65 * bin_72 + 0.35 * surv_72

# =====================================================
# PART 4 — CONFIDENCE SMOOTHING (NEW)
# =====================================================

global_rate = train[EVENT_COL].mean()

def smooth(p):
    return 0.92*p + 0.08*global_rate

prob_12 = smooth(prob_12)
prob_24 = smooth(prob_24)
prob_48 = smooth(prob_48)
prob_72 = smooth(prob_72)

# enforce monotonicity
probs = np.vstack([prob_12, prob_24, prob_48, prob_72]).T
probs = np.maximum.accumulate(probs, axis=1)

prob_12, prob_24, prob_48, prob_72 = probs.T

# =====================================================
# SUBMISSION
# =====================================================

submission = pd.DataFrame({
    "event_id": test[ID_COL],
    "prob_12h": np.clip(prob_12,0,1),
    "prob_24h": np.clip(prob_24,0,1),
    "prob_48h": np.clip(prob_48,0,1),
    "prob_72h": np.clip(prob_72,0,1),
})

submission.to_csv("submission_v4.csv", index=False)
submission.head()

Unnamed: 0,event_id,prob_12h,prob_24h,prob_48h,prob_72h
0,10662602,0.026754,0.028252,0.028252,0.030407
1,13353600,0.27182,0.570926,0.714652,0.714652
2,13942327,0.026234,0.027332,0.027332,0.030366
3,16112781,0.362567,0.495237,0.668647,0.668647
4,17132808,0.030178,0.030178,0.031684,0.031684


In [5]:
print(len(submission))

95
