In [2]:
import os
import pandas as pd
import torch
import sys
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pickle
import numpy as np
import tqdm
sys.path.append('/kaggle/input/utilit')
from utils                      import read_parquet_dataset_from_local 
from dataset_preprocessing_utils import features, transform_credits_to_sequences, pad_sequence, create_padded_buckets
from data_generators            import batches_generator

In [3]:
# задаем пути файлов (ручная загрузка)
TRAIN_DATA_PATH = "/kaggle/input/data-for-credit-score/data_for_competition/train_data"
TEST_DATA_PATH = "/kaggle/input/data-for-credit-score/data_for_competition/test_data"
TRAIN_TARGET_PATH = "/kaggle/input/data-for-credit-score/data_for_competition/train_target.csv"
TEST_TARGET_PATH = "/kaggle/input/data-for-credit-score/data_for_competition/test_target.csv"

# задаем пути для предобработанных данных
TRAIN_BUCKETS_PATH = "/kaggle/working/train_buckets_rnn"
VAL_BUCKETS_PATH = "/kaggle/working/val_buckets_rnn"
TEST_BUCKETS_PATH = "/kaggle/working/test_buckets_rnn"
for p in [TRAIN_BUCKETS_PATH, VAL_BUCKETS_PATH, TEST_BUCKETS_PATH]:
    os.makedirs(p, exist_ok=True)

In [4]:
bucket_info = dict(zip(range(1, 59),
                       list(range(1, 41)) + [45]*5 + [50]*5 + [58]*8))

In [5]:
train_target_df = pd.read_csv(TRAIN_TARGET_PATH)
train_ids, val_ids = train_test_split(train_target_df,
                                      test_size=0.10,
                                      random_state=42)
print(f"Train id: {len(train_ids)}, Val id: {len(val_ids)}")

Train id: 2700000, Val id: 300000


In [6]:
def create_buckets_from_credits(path_to_dataset, bucket_info, save_to_path, frame_with_ids=None,
                                num_parts_to_preprocess_at_once: int = 1,
                                num_parts_total=50, has_target=False):
    block = 0
    for step in tqdm.notebook.tqdm(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                     desc="Preparing credit data"):
        credits_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once, verbose=True)
        credits_frame.loc[:, features] += 1  
        seq = transform_credits_to_sequences(credits_frame)  

        print("Transforming credits to sequences is done.")
        
        if frame_with_ids is not None:
            seq = seq.merge(frame_with_ids, on="id")

        block_as_str = str(block).zfill(3) 
        file_name = os.path.join(save_to_path, f"processed_chunk_{block_as_str}.pkl")
        
        processed_fragment = create_padded_buckets(seq, bucket_info=bucket_info, has_target=has_target, 
                                                   save_to_file_path=file_name)
        block += 1

In [7]:
create_buckets_from_credits(TRAIN_DATA_PATH, bucket_info,
                            TRAIN_BUCKETS_PATH, frame_with_ids=train_ids,
                            num_parts_to_preprocess_at_once=4, num_parts_total=12, has_target=True)

create_buckets_from_credits(TRAIN_DATA_PATH, bucket_info,
                            VAL_BUCKETS_PATH, frame_with_ids=val_ids,
                            num_parts_to_preprocess_at_once=4, num_parts_total=12, has_target=True)

create_buckets_from_credits(TEST_DATA_PATH, bucket_info,
                            TEST_BUCKETS_PATH, frame_with_ids=None,
                            num_parts_to_preprocess_at_once=2, num_parts_total=2, has_target=False)


Preparing credit data:   0%|          | 0/3 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_0.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_1.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_2.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_3.pq


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/43 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_4.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_5.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_6.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_7.pq


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/43 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_8.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_9.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_10.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_11.pq


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/42 [00:00<?, ?it/s]

Preparing credit data:   0%|          | 0/3 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_0.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_1.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_2.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_3.pq


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/41 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_4.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_5.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_6.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_7.pq


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/41 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_8.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_9.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_10.pq
/kaggle/input/data-for-credit-score/data_for_competition/train_data/train_data_11.pq


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/43 [00:00<?, ?it/s]

Preparing credit data:   0%|          | 0/1 [00:00<?, ?it/s]

Reading chunks:
/kaggle/input/data-for-credit-score/data_for_competition/test_data/test_data_0.pq
/kaggle/input/data-for-credit-score/data_for_competition/test_data/test_data_1.pq


Reading dataset with pandas:   0%|          | 0/2 [00:00<?, ?it/s]

Transforming credits to sequences is done.


Extracting buckets:   0%|          | 0/43 [00:00<?, ?it/s]

In [8]:
class EarlyStopping:
    def __init__(self, patience=7, mode='min', verbose=False, delta=0, save_path='checkpoint.hdf5', metric_name=None, save_format='torch'):
        if mode not in ["min", "max"]:
            raise ValueError(f"Unrecognized mode: {mode}! Please choose one of the following modes: \"min\", \"max\"")

        if save_format not in ["torch", "tf"]:
            raise ValueError(f"Unrecognized format: {save_format}! Please choose one of the following formats: \"torch\", \"tf\"")

        self.patience = patience
        self.mode = mode
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_prev_score = np.Inf if mode == "min" else -np.Inf
        self.delta = delta
        self.save_path = save_path
        self.metric_name = "metric" if not metric_name else metric_name
        self.save_format = save_format

    def __call__(self, metric_value, model):

        score = -metric_value if self.mode == "min" else metric_value

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(metric_value, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(
                f"No imporvement in validation {self.metric_name}. Current: {score:.6f}. Current best: {self.best_score:.6f}")
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(metric_value, model)
            self.counter = 0

    def save_checkpoint(self, metric_value: float, model: torch.nn.Module or tensorflow.keras.Model):
        if self.verbose:
            print(
                f"Validation {self.metric_name} improved ({self.best_prev_score:.6f} --> {metric_value:.6f}).  Saving model...")
        if self.save_format == "tf":
            model.save_weights(self.save_path)
        else:
            torch.save(model.state_dict(), self.save_path)

        self.best_prev_score = metric_value

In [9]:
train_paths = sorted([os.path.join(TRAIN_BUCKETS_PATH, f) for f in os.listdir(TRAIN_BUCKETS_PATH)])
val_paths   = sorted([os.path.join(VAL_BUCKETS_PATH,   f) for f in os.listdir(VAL_BUCKETS_PATH)])
test_paths  = sorted([os.path.join(TEST_BUCKETS_PATH,  f) for f in os.listdir(TEST_BUCKETS_PATH)])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_gen = batches_generator(train_paths, batch_size=128, shuffle=True,
                              is_train=True,  output_format="torch", device=device)
val_gen   = batches_generator(val_paths,   batch_size=128, shuffle=False,
                              is_train=True,  output_format="torch", device=device)
test_gen  = batches_generator(test_paths,  batch_size=128, shuffle=False,
                              is_train=False, output_format="torch", device=device)


In [10]:
def compute_embed_dim(n_cat):        
    return min(600, round(1.6 * (n_cat ** 0.56)))

card = {f: 0 for f in features}
for start in range(0, 12, 4):
    part = read_parquet_dataset_from_local(TRAIN_DATA_PATH, start, 4, columns=features)
    for f in features:
        card[f] = max(card[f], part[f].max()+1)    # +1 после сдвига
    del part
embed_proj = {f: (card[f]+1, compute_embed_dim(card[f]+1)) for f in features}

class CreditsRNN(nn.Module):
    def __init__(self, features, embedding_proj, rnn_units=128, mlp_units=32):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num, dim, padding_idx=0)
                                         for num, dim in embedding_proj.values()])
        self.rnn = nn.GRU(sum(dim for _, dim in embedding_proj.values()),
                          rnn_units, batch_first=True)
        self.head = nn.Sequential(
            nn.Linear(rnn_units, mlp_units),
            nn.ReLU(),
            nn.Linear(mlp_units, 1)
        )
    def forward(self, feats):                         
        x = torch.cat([emb(feats[i]) for i, emb in enumerate(self.embeddings)], dim=-1)
        _, h = self.rnn(x)                            
        out = self.head(h.squeeze(0))
        return out


Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

Reading dataset with pandas:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
model = CreditsRNN(features, embed_proj).to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
bce   = nn.BCEWithLogitsLoss()

def train_epoch(model, gen, batches=1000):
    model.train(); loss_sum = 0
    for i, batch in enumerate(gen):
        if i >= batches: break
        optim.zero_grad()
        logits = model(batch["features"])
        loss = bce(logits.squeeze(1).float(), batch["label"].float())
        loss.backward(); optim.step()
        loss_sum += loss.item()
    return loss_sum / (i+1)

@torch.no_grad()
def eval_auc(model, gen, batches=300):
    model.eval(); preds, labels = [], []
    for i, batch in enumerate(gen):
        if i >= batches: break
        logits = model(batch["features"]).squeeze(1).float().cpu().numpy()
        preds.append(logits)
        labels.append(batch["label"].cpu().numpy())
    return roc_auc_score(np.concatenate(labels), np.concatenate(preds))

es = EarlyStopping(patience=3, mode="max", verbose=True,
                   save_path="/kaggle/working/best.pt", metric_name="AUC")

for epoch in range(10):
    tr_loss = train_epoch(model, train_gen)
    val_auc = eval_auc(model, val_gen)
    print(f"Epoch {epoch+1}: loss={tr_loss:.4f}, val_auc={val_auc:.5f}")
    es(val_auc, model)
    if es.early_stop:
        print("Early stopping.")
        break


Epoch 1: loss=0.1398, val_auc=0.72438
Validation AUC improved (-inf --> 0.724376).  Saving model...
Epoch 2: loss=0.1280, val_auc=0.77492
Validation AUC improved (0.724376 --> 0.774922).  Saving model...
Epoch 3: loss=0.1308, val_auc=0.73599
No imporvement in validation AUC. Current: 0.735993. Current best: 0.774922
EarlyStopping counter: 1 out of 3
Epoch 4: loss=0.1360, val_auc=0.74812
No imporvement in validation AUC. Current: 0.748117. Current best: 0.774922
EarlyStopping counter: 2 out of 3
Epoch 5: loss=0.1402, val_auc=0.76572
No imporvement in validation AUC. Current: 0.765722. Current best: 0.774922
EarlyStopping counter: 3 out of 3
Early stopping.


In [12]:
import torch, torch.nn as nn, torch.nn.functional as F

class AttnPool(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.q = nn.Linear(d_model, d_model, bias=False)
        self.scale = d_model ** -0.5

    def forward(self, h, mask=None):
        # h: (B, T, H)
        att = (self.q(h) * self.scale) @ h.transpose(1, 2)      
        if mask is not None:
            att.masked_fill_(mask[:, None, :]==0, -1e9)
        w = F.softmax(att, dim=-1)                             
        ctx = w @ h                                             
        return ctx.mean(1)                                      

class CreditsRNN_Attn(nn.Module):
    def __init__(self, features, emb_proj, hid=160, mlp=64,
                 emb_dp=0.1, rnn_dp=0.2):
        super().__init__()
        self.embs = nn.ModuleList(
            [nn.Embedding(n, d, padding_idx=0) for n, d in emb_proj.values()])
        self.emb_dp = nn.Dropout(emb_dp)

        d_in = sum(d for _, d in emb_proj.values())
        self.norm_in = nn.LayerNorm(d_in)

        self.gru = nn.GRU(d_in, hid, num_layers=2,
                          batch_first=True, bidirectional=True,
                          dropout=rnn_dp)

        self.pool = AttnPool(hid*2)
        self.head = nn.Sequential(
            nn.Linear(hid*2, mlp),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(mlp, 1)
        )

    def forward(self, feats):           
        x = torch.cat([e(f) for e, f in zip(self.embs, feats)], dim=-1)
        x = self.emb_dp(x)
        x = self.norm_in(x)
        h, _ = self.gru(x)
        out = self.pool(h)
        return self.head(out).squeeze(1)


In [19]:
def make_val_loader():
    return batches_generator(
        val_paths, batch_size=128, shuffle=False,
        is_train=True, output_format="torch", device=device
    )


In [20]:
from torch.amp import autocast, GradScaler
import random, numpy as np, torch.optim as optim

def set_seed(s):
    random.seed(s); np.random.seed(s); torch.manual_seed(s)
    torch.cuda.manual_seed_all(s)

def train_one_seed(seed):
    set_seed(seed)
    model = CreditsRNN_Attn(features, embed_proj).to(device)

    opt = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='max', factor=0.5,
                                                     patience=1)
    scaler = GradScaler('cuda')

    best_auc, patience = 0., 0
    for epoch in range(8):
        model.train(); running = 0.
        train_loader = batches_generator(   
            train_paths, batch_size=128, shuffle=True,
            is_train=True, output_format="torch", device=device
        )
        for i, batch in enumerate(train_loader):
            opt.zero_grad(set_to_none=True)
            with autocast('cuda'): 
                logits = model(batch['features'])
                loss = F.binary_cross_entropy_with_logits(
                    logits.float(), batch['label'].float())
            scaler.scale(loss).backward()
            scaler.step(opt); scaler.update()
            running += loss.item()

        model.eval(); preds, y = [], []
        val_loader = make_val_loader()
        with torch.no_grad(), autocast('cuda'):
            for batch in val_loader:
                logits = model(batch['features']).float()
                preds.append(logits.cpu())
                y.append(batch['label'].cpu())

        auc = roc_auc_score(torch.cat(y).numpy(),
                            torch.cat(preds).numpy())
        scheduler.step(auc)

        if auc > best_auc + 1e-4:
            best_auc, patience = auc, 0
            torch.save(model.state_dict(), f"/kaggle/working/best_{seed}.pt")
        else:
            patience += 1
            if patience == 3:
                break
        print(f"seed {seed}  epoch {epoch+1}  loss {running/(i+1):.4f}  val AUC {auc:.5f}")

    return best_auc

In [21]:
aucs = [train_one_seed(s) for s in (42, 777, 2025)]
print("single-seed AUCs:", aucs)

seed 42  epoch 1  loss 0.1389  val AUC 0.77216
seed 42  epoch 2  loss 0.1357  val AUC 0.77856
seed 42  epoch 3  loss 0.1349  val AUC 0.77783
seed 42  epoch 4  loss 0.1347  val AUC 0.78048
seed 42  epoch 5  loss 0.1345  val AUC 0.77956
seed 42  epoch 6  loss 0.1339  val AUC 0.77673
seed 777  epoch 1  loss 0.1390  val AUC 0.77052
seed 777  epoch 2  loss 0.1359  val AUC 0.77715
seed 777  epoch 3  loss 0.1349  val AUC 0.77909
seed 777  epoch 4  loss 0.1344  val AUC 0.78012
seed 777  epoch 6  loss 0.1342  val AUC 0.77903
seed 777  epoch 7  loss 0.1335  val AUC 0.78197
seed 777  epoch 8  loss 0.1330  val AUC 0.78223
seed 2025  epoch 1  loss 0.1389  val AUC 0.77529
seed 2025  epoch 2  loss 0.1354  val AUC 0.77500
seed 2025  epoch 3  loss 0.1348  val AUC 0.77688
seed 2025  epoch 4  loss 0.1345  val AUC 0.77901
seed 2025  epoch 5  loss 0.1343  val AUC 0.77784
seed 2025  epoch 6  loss 0.1343  val AUC 0.77729
seed 2025  epoch 7  loss 0.1334  val AUC 0.78164
seed 2025  epoch 8  loss 0.1331  val AU