In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/WiDSWorldWide_GlobalDathon26/sample_submission.csv
/kaggle/input/WiDSWorldWide_GlobalDathon26/train.csv
/kaggle/input/WiDSWorldWide_GlobalDathon26/metaData.csv
/kaggle/input/WiDSWorldWide_GlobalDathon26/test.csv


In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy.stats import norm

In [3]:
train = pd.read_csv('/kaggle/input/WiDSWorldWide_GlobalDathon26/train.csv')
test = pd.read_csv('/kaggle/input/WiDSWorldWide_GlobalDathon26/test.csv')

In [4]:
ID_COL = "event_id"
TIME_COL = "time_to_hit_hours"
EVENT_COL = "event"

features = [c for c in train.columns 
            if c not in [ID_COL, TIME_COL, EVENT_COL]]

X_test = test[features]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
# =====================================================
# PART 1 — SURVIVAL MODEL
# =====================================================

aft_params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 2.1,
    "learning_rate": 0.03,
    "max_depth": 4,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "min_child_weight": 4,
    "seed": 42
}

surv_preds = []

for tr_idx, _ in kf.split(train):
    X_tr = train.iloc[tr_idx][features]
    y_time = train.iloc[tr_idx][TIME_COL].values
    y_event = train.iloc[tr_idx][EVENT_COL].values

    lower = y_time
    upper = y_time.copy()
    upper[y_event == 0] = np.inf

    dtrain = xgb.DMatrix(X_tr)
    dtrain.set_float_info("label_lower_bound", lower)
    dtrain.set_float_info("label_upper_bound", upper)

    dtest = xgb.DMatrix(X_test)

    model = xgb.train(aft_params, dtrain, num_boost_round=700)
    surv_preds.append(model.predict(dtest))

mu = np.mean(surv_preds, axis=0)
sigma = aft_params["aft_loss_distribution_scale"]

def surv_prob(h):
    return norm.cdf((np.log(h) - mu) / sigma)

# =====================================================
# PART 2 — BINARY MODELS
# =====================================================

binary_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "seed": 42
}

def binary_target(h):
    return ((train[TIME_COL] <= h) & (train[EVENT_COL] == 1)).astype(int)

def train_binary(h):
    preds = []
    y = binary_target(h)

    for tr_idx, _ in kf.split(train):
        dtrain = xgb.DMatrix(train.iloc[tr_idx][features], label=y.iloc[tr_idx])
        dtest = xgb.DMatrix(X_test)
        model = xgb.train(binary_params, dtrain, num_boost_round=400)
        preds.append(model.predict(dtest))

    return np.mean(preds, axis=0)

# Hybrid base (Model-5 behaviour)
p12 = 0.5 * train_binary(12) + 0.5 * surv_prob(12)
p24 = 0.55 * train_binary(24) + 0.45 * surv_prob(24)
p48 = 0.70 * train_binary(48) + 0.30 * surv_prob(48)
p72 = 0.65 * train_binary(72) + 0.35 * surv_prob(72)

# =====================================================
# PART 3 — QUANTILE RECALIBRATION (Model-6 upgrade)
# =====================================================

def quantile_lift(p, strength_low=0.02, strength_mid=0.05):
    p = np.clip(p, 1e-6, 1-1e-6)
    ranks = p.argsort().argsort() / (len(p)-1)
    lift = (
        strength_low * (ranks > 0.20) +
        strength_mid * (ranks > 0.40) * (ranks < 0.85)
    )
    return p + lift * p * (1 - p)

p12 = quantile_lift(p12, 0.015, 0.035)
p24 = quantile_lift(p24, 0.020, 0.050)
p48 = quantile_lift(p48, 0.020, 0.060)
p72 = quantile_lift(p72, 0.015, 0.045)

# =====================================================
# PART 4 — TEMPERATURE SOFTENING
# =====================================================

def temp_soft(p, T=1.05):
    eps = 1e-6
    p = np.clip(p, eps, 1-eps)
    logit = np.log(p/(1-p))
    return 1/(1+np.exp(-logit/T))

p12 = temp_soft(p12, 1.08)
p24 = temp_soft(p24, 1.06)
p48 = temp_soft(p48, 1.04)
p72 = temp_soft(p72, 1.05)

# =====================================================
# PART 5 — SMOOTHING + MONOTONICITY
# =====================================================

global_rate = train[EVENT_COL].mean()

def smooth(p):
    return 0.97*p + 0.03*global_rate

p12 = smooth(p12)
p24 = smooth(p24)
p48 = smooth(p48)
p72 = smooth(p72)

probs = np.vstack([p12, p24, p48, p72]).T
probs = np.maximum.accumulate(probs, axis=1)

p12, p24, p48, p72 = probs.T

# =====================================================
# SUBMISSION
# =====================================================

submission = pd.DataFrame({
    "event_id": test[ID_COL],
    "prob_12h": np.clip(p12,0,1),
    "prob_24h": np.clip(p24,0,1),
    "prob_48h": np.clip(p48,0,1),
    "prob_72h": np.clip(p72,0,1),
})

submission.to_csv("submission_v6.csv", index=False)
submission.head()

Unnamed: 0,event_id,prob_12h,prob_24h,prob_48h,prob_72h
0,10662602,0.012381,0.014421,0.014421,0.017075
1,13353600,0.293118,0.584401,0.73234,0.73234
2,13942327,0.011554,0.012909,0.012909,0.01702
3,16112781,0.377615,0.520581,0.685318,0.685318
4,17132808,0.017755,0.017755,0.018546,0.018546


In [6]:
print(len(submission))

95
