In [1]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Global run configuration read by the training, inference and submission cells.
CFG = dict(
    BATCH_SIZE=1 << 11,  # 2048 rows per mini-batch
    EPOCHS=100,
    LEARNING_RATE=1e-3,
    SEED=42,
    MODEL_NAME='gdcn_ecn_fm_wll_full.pth',  # checkpoint file name
)

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (CPU and CUDA), and configures cuDNN for deterministic execution with
    benchmark mode switched off.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and PyTorch generators all receive the same seed.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything(CFG['SEED']) # Fix the seed for reproducibility

In [4]:
# Load the data.
# The test frame's 'ID' column is dropped right after reading, so `test` holds no 'ID' column.
all_train = pd.read_parquet("./train.parquet", engine="pyarrow")
test = pd.read_parquet("./test.parquet", engine="pyarrow").drop(columns=['ID'])

print("Train shape:", all_train.shape)
print("Test shape:", test.shape)

Train shape: (10704179, 119)
Test shape: (1527298, 118)


In [5]:
# Split the features into categorical and numerical groups.
#   cat_cols: {column name -> largest observed code}
#   num_cols: [column name, ...] for everything treated as a continuous value
cat_cols = {}
num_cols = []
for col in all_train.columns:
    if col in ('clicked', 'seq'):  # target and raw sequence are handled separately
        continue

    # Convert once and reuse: the frame is large, so repeated casts are costly.
    col_as_float = all_train[col].astype(float)

    # Integer-valued check: every value (missing counted as 0) has no fractional part.
    is_integral = bool((col_as_float.fillna(0) % 1 == 0).all())

    # Integer-valued columns with fewer than 100 distinct non-missing values are categorical.
    if is_integral and all_train[col].nunique(dropna=True) < 100:
        max_code = col_as_float.max()
        # An all-missing column has NaN as its max; int(NaN) would raise, so store 0.
        cat_cols[col] = 0 if pd.isna(max_code) else int(max_code)
    else:
        num_cols.append(col)

In [6]:
# Positive rows (clicked == 1): all of them are kept.
clicked_1 = all_train[all_train['clicked'] == 1]

# Down-sample the negatives (clicked == 0) to NEG_PER_POS rows per positive row.
NEG_PER_POS = 9
is_negative = all_train['clicked'] == 0
# Never request more rows than exist; sample() without replacement would raise otherwise.
n_negatives = min(int(is_negative.sum()), len(clicked_1) * NEG_PER_POS)
clicked_0 = all_train[is_negative].sample(n=n_negatives, random_state=CFG['SEED'])

# Concatenate both classes, shuffle the rows, and rebuild a clean 0..N-1 index.
train = (
    pd.concat([clicked_1, clicked_0], axis=0)
    .sample(frac=1, random_state=CFG['SEED'])
    .reset_index(drop=True)
)

In [7]:
# Report the overall shape and the per-class row counts of the down-sampled frame.
labels = train['clicked']
print("Train shape:", train.shape)
for value in (0, 1):
    print(f"Train clicked:{value}:", train[labels == value].shape)

Train shape: (2041790, 119)
Train clicked:0: (1837611, 119)
Train clicked:1: (204179, 119)


# Data Column Setting

In [8]:
# Target / sequence column names
target_col, seq_col = "clicked", "seq"

# Features used for training: every column except ID / seq / target
FEATURE_EXCLUDE = {target_col, seq_col, "ID"}
feature_cols = list(filter(lambda name: name not in FEATURE_EXCLUDE, train.columns))

# Summary of the column setup
for label, value in (
    ("Num features:", len(feature_cols)),
    ("Num categorical features:", len(cat_cols)),
    ("Num numerical features:", len(num_cols)),
    ("Sequence:", seq_col),
    ("Target:", target_col),
):
    print(label, value)

Num features: 117
Num categorical features: 23
Num numerical features: 94
Sequence: seq
Target: clicked


# Define Custom Dataset

In [9]:
class ClickDataset(Dataset):
    """Dataset yielding categorical codes, numerical features and a raw sequence per row.

    Args:
        df: Source frame; its index is reset internally.
        cat_cols: Mapping ``{categorical column -> largest valid code}``.
        num_cols: List of numerical column names.
        seq_col: Name of the column holding a comma-separated numeric string.
        target_col: Name of the label column (only read when ``has_target``).
        has_target: Whether ``__getitem__`` should also return the label.

    Each item is ``(cats, nums, seq[, y])`` where ``cats`` is an int tensor,
    ``nums`` a float tensor, ``seq`` a 1-D float32 tensor with at least one
    element, and ``y`` a float scalar tensor.
    """

    def __init__(self, df, cat_cols, num_cols, seq_col, target_col=None, has_target=True):
        self.df = df.reset_index(drop=True)
        self.cat_cols = list(cat_cols.keys())
        self.num_cols = num_cols
        self.seq_col = seq_col
        self.target_col = target_col
        self.has_target = has_target

        # Non-sequence features: categorical -> int codes (missing -> 0).
        cats = self.df[self.cat_cols].astype(float).fillna(0).astype(int).values
        # Codes outside [0, max] (negative, or larger than the declared maximum,
        # e.g. unseen at training time) are mapped to 0, the same code used for
        # missing values, so they can never index past an embedding table sized
        # from cat_cols.
        max_codes = np.asarray(list(cat_cols.values()), dtype=cats.dtype)
        out_of_range = (cats < 0) | (cats > max_codes)
        self.cats = np.where(out_of_range, 0, cats)

        # Numerical features (missing -> 0), stored as float32: that is the dtype
        # handed to torch anyway, and it halves the memory of the cached matrix.
        self.nums = self.df[self.num_cols].astype(float).fillna(0).values.astype(np.float32)

        # Sequences are kept as strings and parsed lazily in __getitem__.
        # Missing values are replaced by "" explicitly; otherwise astype(str)
        # would turn them into text (e.g. 'nan') that the parser would read.
        seq_series = self.df[self.seq_col]
        seq_strings = np.array(seq_series.astype(str), dtype=object)
        seq_strings[seq_series.isna().to_numpy()] = ""
        self.seq_strings = seq_strings

        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        cats = torch.tensor(self.cats[idx], dtype=torch.int)
        nums = torch.tensor(self.nums[idx], dtype=torch.float)

        # Use the full sequence; only empty / missing sequences need a fallback.
        s = self.seq_strings[idx]
        if s:
            arr = np.fromstring(s, sep=",", dtype=np.float32)
        else:
            arr = np.empty(0, dtype=np.float32)

        if arr.size == 0:
            # Guarantee at least one time step so the sequence can be packed later.
            arr = np.zeros(1, dtype=np.float32)

        seq = torch.from_numpy(arr)  # shape (seq_len,)

        if self.has_target:
            y = torch.tensor(self.y[idx], dtype=torch.float)
            return cats, nums, seq, y
        return cats, nums, seq

In [10]:
def _pad_sequences(seqs):
    """Pad variable-length 1-D sequences into one batch tensor.

    Empty sequences are replaced by a single zero step so that the padded
    tensor always has at least one time step and every reported length is >= 1.

    Returns:
        (padded, lengths): ``padded`` has shape (batch, max_len) with zeros as
        padding; ``lengths`` is a long tensor of the per-sample lengths.
    """
    seqs = [s if s.numel() > 0 else s.new_zeros(1) for s in seqs]
    padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0.0)
    lengths = torch.tensor([len(s) for s in seqs], dtype=torch.long)
    return padded, lengths


def collate_fn_train(batch):
    """Collate ``(cats, nums, seq, y)`` samples into batched tensors.

    Returns:
        (cats, nums, seqs_padded, seq_lengths, ys)
    """
    cats, nums, seqs, ys = zip(*batch)
    seqs_padded, seq_lengths = _pad_sequences(seqs)
    return torch.stack(cats), torch.stack(nums), seqs_padded, seq_lengths, torch.stack(ys)


def collate_fn_infer(batch):
    """Collate label-free ``(cats, nums, seq)`` samples into batched tensors.

    Returns:
        (cats, nums, seqs_padded, seq_lengths)
    """
    cats, nums, seqs = zip(*batch)
    seqs_padded, seq_lengths = _pad_sequences(seqs)
    return torch.stack(cats), torch.stack(nums), seqs_padded, seq_lengths

# Define Model Architecture


In [11]:
class ExponentialCrossNetwork(nn.Module):
    """Cross network whose interaction order doubles with every layer.

    Layer ``l`` (1-based) computes a sigmoid gate from the previous output via
    ``Linear -> Dropout(0.2)`` and multiplies it with ``x ** (2 ** (l - 1))``,
    where ``x`` is the original input.

    Args:
        input_dim: Width of the input (and of the output).
        num_layers: Number of gated cross layers.
    """

    def __init__(self, input_dim, num_layers=2):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(nn.Sequential(nn.Linear(input_dim, input_dim), nn.Dropout(0.2)))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        out = x
        exponent = 1  # 1, 2, 4, ... for successive layers
        for layer in self.layers:
            gate = torch.sigmoid(layer(out))  # self-mask computed from the running output
            out = x.pow(exponent) * gate
            exponent *= 2
        return out
        
class DNNLyaer(nn.Module):
    """Plain MLP tower: ``Linear -> BatchNorm1d -> ReLU -> Dropout(0.2)`` per hidden width.

    The class name is a misspelling of "DNNLayer"; it is kept so existing
    references keep working, and a correctly spelled alias ``DNNLayer`` is
    defined right after the class.

    Args:
        input_dim: Width of the input features.
        hidden_dims: Iterable of hidden widths, one block per entry; the last
            entry is the output width.
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        layers = []
        in_features = input_dim
        for h in hidden_dims:
            layers.extend([
                nn.Linear(in_features, h),
                nn.BatchNorm1d(h),
                nn.ReLU(),
                nn.Dropout(0.2),
            ])
            in_features = h  # next block consumes this block's output
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)


# Correctly spelled alias for the class above.
DNNLayer = DNNLyaer
        
class FMLayer(nn.Module):
    """Second-order factorization-machine style interaction over dim 1.

    For each row computes ``0.5 * ((sum_i x_i)^2 - sum_i x_i^2)`` and returns
    it as a column of shape [batch_size, 1].
    """

    def forward(self, x):
        row_total = x.sum(dim=1)
        squared_total = row_total.pow(2)
        total_of_squares = torch.mul(x, x).sum(dim=1)
        pairwise = 0.5 * (squared_total - total_of_squares)
        return pairwise[:, None]  # [batch_size, 1]
        
class GatedCrossLayerBlock(nn.Module):
    """Stack of gated cross layers.

    Each layer updates the running state ``c_l`` as

        c_(l+1) = c_0 * (Wc_l(c_l) + bias_l) * gate(Wg_l(c_l)) + c_l

    where ``Wc_l`` and ``Wg_l`` are ``Linear -> Dropout(0.2)`` branches and
    ``gate`` is ``gate_function``.

    Args:
        input_dim: Width of the input (and of the output).
        num_layers: Number of gated cross layers.
        gate_function: Module applied to the gate branch output.
    """

    def __init__(self, input_dim : int, num_layers : int, gate_function = nn.Sigmoid()):
        super().__init__()
        self.num_layers = num_layers
        self.gate_function = gate_function

        self.wc = nn.ModuleList()       # cross-branch weights
        self.wg = nn.ModuleList()       # gate-branch weights
        self.bias = nn.ParameterList()  # per-layer bias added to the cross branch
        for _ in range(num_layers):
            cross_branch = nn.Sequential(nn.Linear(input_dim, input_dim), nn.Dropout(0.2))
            gate_branch = nn.Sequential(nn.Linear(input_dim, input_dim), nn.Dropout(0.2))
            self.wc.append(cross_branch)
            self.wg.append(gate_branch)
            self.bias.append(nn.Parameter(torch.zeros(input_dim)))

        # Only the gate-branch linear weights are re-initialised with Xavier uniform.
        for gate_branch in self.wg:
            for sub_module in gate_branch:
                if isinstance(sub_module, nn.Linear):
                    torch.nn.init.xavier_uniform_(sub_module.weight)

    def forward(self, x):
        """Apply all gated cross layers; returns a tensor of shape [batch_size, input_dim]."""
        x0 = x
        for cross_branch, gate_branch, bias in zip(self.wc, self.wg, self.bias):
            xc = cross_branch(x)
            xg = self.gate_function(gate_branch(x))
            x = x0 * (xc + bias) * xg + x
        return x

        
class TabularSeqModel(nn.Module):
    """Click model combining tabular features with an LSTM-encoded sequence.

    Categorical codes are embedded, numerical features are batch-normalised and
    the sequence is summarised by the last LSTM hidden state. The concatenated
    vector feeds four parallel branches (FM, gated cross, exponential cross,
    DNN) whose outputs are concatenated and passed to a final MLP that ends in
    a Sigmoid, so the model returns probabilities rather than raw logits.

    Args:
        cat_cols: Mapping ``{categorical column -> largest code}``; each
            embedding table gets ``largest code + 1`` rows with index 0 as padding.
        num_cols: List of numerical column names (only its length is used).
        seq_emb_dim: Hidden size of the LSTM.
        feature_emb_dim: Embedding width per categorical feature.
        num_layers: Depth of the gated cross and DNN branches; the exponential
            cross branch uses ``min(3, num_layers)``.
        mlp_hidden_units: Hidden widths of the final MLP.
        dropout: Dropout probability inside the final MLP.
    """

    def __init__(self, cat_cols, num_cols, seq_emb_dim=32, feature_emb_dim = 8, num_layers = 4, mlp_hidden_units=[1024, 512, 256, 128], dropout=0.2):
        super().__init__()
        n_cats, n_cols = len(cat_cols), len(num_cols)

        # Categorical part: one embedding table per categorical column.
        self.cat_embs = nn.ModuleList()
        for num_categories in cat_cols.values():
            self.cat_embs.append(nn.Embedding(num_categories+1, feature_emb_dim, padding_idx=0))

        # Numerical part: batch normalisation only, no projection.
        self.num_proj = nn.BatchNorm1d(n_cols)

        # Sequence part: scalar sequence -> LSTM.
        self.lstm = nn.LSTM(input_size=1, hidden_size=seq_emb_dim, batch_first=True)

        # All four branches consume the same concatenated representation.
        joint_dim = n_cats*feature_emb_dim + n_cols + seq_emb_dim

        # FM branch - low-order interactions, scalar output.
        fm_input_dim = joint_dim
        fm_output_dim = 1
        self.fm = FMLayer()
        print(f'mdel fm_dim : {fm_input_dim}, output_dim : {fm_output_dim}')

        # Gated cross branch - low-order, selectively learned interactions.
        gcn_input_dim = gcn_output_dim = joint_dim
        self.gcn = GatedCrossLayerBlock(input_dim=gcn_input_dim, num_layers=num_layers)
        print('mdel gcn_dim : ', gcn_input_dim)

        # Exponential cross branch - high-order; capped at 3 layers because
        # larger depths blow up.
        ecn_input_dim = ecn_output_dim = joint_dim
        self.ecn = ExponentialCrossNetwork(input_dim = ecn_input_dim, num_layers=min(3,num_layers))
        print('mdel ecn_dim : ', ecn_input_dim)

        # DNN branch - high-order, constant width.
        dnn_input_dim = dnn_output_dim = joint_dim
        self.dnn = DNNLyaer(dnn_input_dim, [dnn_input_dim] * num_layers)
        print('mdel dnn_dim : ', dnn_input_dim)

        # Final MLP over the concatenated branch outputs.
        final_input_dim = fm_output_dim + gcn_output_dim + ecn_output_dim + dnn_output_dim
        print('mdel final_input_dim : ', final_input_dim)

        stack = []
        in_features = final_input_dim
        for width in mlp_hidden_units:
            dense = nn.Linear(in_features, width)
            nn.init.kaiming_normal_(dense.weight, nonlinearity='relu')
            nn.init.zeros_(dense.bias)
            stack.extend((dense, nn.BatchNorm1d(width), nn.ReLU(), nn.Dropout(dropout)))
            in_features = width

        head = nn.Linear(in_features, 1)
        nn.init.xavier_normal_(head.weight)
        nn.init.zeros_(head.bias)
        stack.extend((head, nn.Sigmoid()))

        self.mlp = nn.Sequential(*stack)

    def forward(self, x_cats, x_nums, x_seq, seq_lengths):
        """Return one probability per row, shape (B,).

        Args:
            x_cats: Integer codes, one column per categorical feature, shape (B, n_cats).
            x_nums: Numerical features, shape (B, n_nums).
            x_seq: Zero-padded sequences, shape (B, L).
            seq_lengths: True length of each sequence, shape (B,).
        """
        # Categorical part: embed every column and concatenate -> (B, n_cats*emb_dim)
        cat_parts = []
        for i, emb in enumerate(self.cat_embs):
            cat_parts.append(emb(x_cats[:, i]))
        embedded_cats = torch.cat(cat_parts, dim=1)

        # Numerical part: batch-normalised -> (B, n_nums)
        embedded_nums = self.num_proj(x_nums)

        # Sequence part: pack to skip padding, keep the last hidden state -> (B, seq_emb_dim)
        packed = nn.utils.rnn.pack_padded_sequence(
            x_seq.unsqueeze(-1), seq_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        seq_summary = h_n[-1]

        # Joint representation shared by all branches.
        z = torch.cat([embedded_cats, embedded_nums, seq_summary], dim=1)

        # Branch order: FM, gated cross, exponential cross, DNN.
        branch_outputs = [self.fm(z), self.gcn(z), self.ecn(z), self.dnn(z)]
        fused = torch.cat(branch_outputs, dim=1)

        return self.mlp(fused).squeeze(1)  # probabilities (the MLP ends with Sigmoid)

# Train / Validation


In [12]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

from sklearn.metrics import average_precision_score


class WeightedBCELoss(nn.Module):
    """Class-balanced binary cross-entropy on probabilities.

    The two class weights are normalised to sum to 1. Within a batch, each
    sample's weight is its class weight divided by the number of samples of
    that class, and the weighted per-sample losses are summed.

    Args:
        weight: ``[w_negative, w_positive]`` relative class weights.
    """

    def __init__(self, weight = [1,1]):
        super().__init__()
        total = sum(weight)
        self.w0 = weight[0]/total
        self.w1 = weight[1]/total

    def forward(self, y_pred, y_true):
        y_true = y_true.float()
        y_pred = y_pred.float()

        is_negative = y_true == 0
        n_negative = is_negative.sum()
        n_positive = (y_true == 1).sum()

        # Per-sample weight: class weight spread evenly over that class's samples.
        per_sample_weight = torch.where(is_negative, self.w0 / n_negative, self.w1 / n_positive)
        per_sample_loss = F.binary_cross_entropy(y_pred, y_true, reduction='none')

        return (per_sample_loss * per_sample_weight).sum()

class Trainer:
    """Minimal training loop with validation scoring, checkpointing and early stopping.

    Args:
        model: The ``nn.Module`` to train.
        loss_func: Callable ``loss_func(preds, labels)`` used for the training loss.
    """

    def __init__(self
                 , model = None
                 , loss_func = nn.BCELoss()
                ):
        self.model = model
        self.loss_func = loss_func

    def fit(self
            , train_df
            , feature_cols : list
            , seq_col : str
            , target_col : str
            , batch_size : int = 512
            , epochs : int = 3
            , learning_rate : float = 1e-3
            , device : str = None
            , model_name : str = 'model.pth'
            , patience : int = 3
           ):
        """Train ``self.model`` and checkpoint it whenever the validation score improves.

        Note: the datasets are built from the notebook-level ``cat_cols`` and
        ``num_cols`` variables; ``feature_cols`` is accepted but not used.

        Args:
            train_df: Frame holding features, sequence and target; 20% is held out
                for validation.
            feature_cols: Unused (kept for interface compatibility).
            seq_col: Name of the sequence column.
            target_col: Name of the label column.
            batch_size: Mini-batch size for both loaders.
            epochs: Maximum number of epochs.
            learning_rate: Initial Adam learning rate.
            device: Target device; defaults to CUDA when available, else CPU.
            model_name: Path the best ``state_dict`` is saved to.
            patience: Number of consecutive non-improving epochs tolerated before
                training stops.

        Returns:
            ``self.model`` in its state after the last executed epoch (the best
            weights are the ones stored at ``model_name``).
        """
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        self.model.to(self.device)
        print(f'training with {self.device}')

        # 1) Train / validation split
        tr_df, va_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)

        # 2) Dataset / Loader
        train_dataset = ClickDataset(tr_df, cat_cols, num_cols, seq_col, target_col, has_target=True)
        eval_dataset = ClickDataset(va_df, cat_cols, num_cols, seq_col, target_col, has_target=True)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_train)
        eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_train)

        # 2-1) Optimizer and learning-rate schedule (stepped once per epoch)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer, T_0=10, T_mult=1)

        # 3) Loop. `last_loss` holds the best validation score so far (higher is better).
        self.last_loss = 0
        self._stale_epochs = 0
        for epoch in range(1, epochs+1):
            train_loss = self._batch_process(train_loader, is_train = True)
            print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f}")

            with torch.no_grad():
                eval_loss = self._batch_process(eval_loader, is_train = False)

            current_lr = self.scheduler.get_last_lr()
            self.scheduler.step()

            print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {eval_loss:.4f} | Learning Rate : {current_lr}")
            if self._register_score(eval_loss):
                # Save the trainer's own model rather than a notebook-level global.
                torch.save(self.model.state_dict(), model_name)
            elif self._stale_epochs >= patience:
                break

        print(f'last score : {self.last_loss}')

        return self.model

    def _register_score(self, score) -> bool:
        """Record a validation score; return True when it beats the best so far.

        An improvement resets the stale-epoch counter, so early stopping counts
        consecutive non-improving epochs instead of their running total.
        """
        if self.last_loss < score:
            self.last_loss = score
            self._stale_epochs = 0
            return True
        self._stale_epochs += 1
        return False

    def _batch_process(self, loader, is_train : bool = True):
        """Run one pass over ``loader``.

        Returns:
            In training mode, the sample-weighted mean of the batch losses.
            In evaluation mode, a score (higher is better):
            ``0.5 * average_precision + 0.5 / (1 + weighted BCE)``.
        """
        if is_train:
            self.model.train()
            mode = 'Train'
        else:
            self.model.eval()
            mode = 'Eval'

        losses = []  # train: batch losses / eval: batch predictions
        nums = []    # train: batch sizes  / eval: batch labels
        for xcats, xnums, seqs, seq_lens, ys in tqdm(loader, desc=mode):
            xcats, xnums, seqs, seq_lens, ys = xcats.to(self.device), xnums.to(self.device), seqs.to(self.device), seq_lens.to(self.device), ys.to(self.device)
            preds = self.model(xcats, xnums, seqs, seq_lens)
            if is_train:
                loss, num = self._get_loss(preds, ys, is_train)
                losses.append(loss)
                nums.append(num)
            else:
                losses.append(preds.to('cpu').detach())
                nums.append(ys.to('cpu').detach())

        if is_train:
            total_loss = sum([x*y for x,y in zip(losses, nums)])
            total_loss = total_loss/sum(nums)
        else:
            all_preds = torch.cat(losses).view(-1)
            all_trues = torch.cat(nums).view(-1)

            total_loss = 0.5 * average_precision_score(all_trues.numpy(), all_preds.numpy()) + 0.5 * 1/(1+WeightedBCELoss()(all_preds, all_trues).item())

        return total_loss

    def _get_loss(self, preds, labels, is_train : bool):
        """Compute the loss and, in training mode, take one optimizer step.

        Returns:
            (loss value as a Python float, number of samples in the batch)
        """
        loss = self.loss_func(preds,labels)
        if is_train:
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        return loss.item(), preds.shape[0]

# Run!!


In [None]:
# 3) Model
# Hyper-parameters forwarded to TabularSeqModel; the same dict is reused at
# inference time to rebuild an identical architecture before loading weights.
model_args = {'cat_cols' : cat_cols
              , 'num_cols' : num_cols
              , 'seq_emb_dim' : 64
              , 'feature_emb_dim' : 4
              , 'num_layers' : 5
              , 'mlp_hidden_units' : [512,128]
              , 'dropout' : 0.2}

model = TabularSeqModel(**model_args)

# Training loss: class-balanced BCE with the default equal class weights.
criterion = WeightedBCELoss()

trainer = Trainer(model, loss_func = criterion)


print(CFG)
# Trains the model; the best checkpoint is written to CFG['MODEL_NAME'].
trainer.fit(
    train_df=train,
    feature_cols=feature_cols,
    seq_col=seq_col,
    target_col=target_col,
    batch_size=CFG['BATCH_SIZE'],
    epochs=CFG['EPOCHS'],
    learning_rate=CFG['LEARNING_RATE'],
    model_name = CFG['MODEL_NAME']
)
# Release cached GPU memory held by the allocator after training.
torch.cuda.empty_cache()

mdel fm_dim : 250, output_dim : 1
mdel gcn_dim :  250
mdel ecn_dim :  250
mdel dnn_dim :  250
mdel final_input_dim :  751
{'BATCH_SIZE': 2048, 'EPOCHS': 100, 'LEARNING_RATE': 0.001, 'SEED': 42, 'MODEL_NAME': 'gdcn_ecn_fm_wll_full.pth'}
training with cuda


Train:   5%|▍         | 36/798 [00:15<05:39,  2.25it/s]

# Inference

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Dataset/Loader (no target column at inference time)
test_ds = ClickDataset(test, cat_cols, num_cols, seq_col, target_col, has_target=False)
test_ld = DataLoader(test_ds, batch_size=CFG['BATCH_SIZE'], shuffle=False, collate_fn=collate_fn_infer)

# 2) Predict
# Rebuild the architecture and load the saved checkpoint. map_location makes a
# checkpoint saved from a GPU run loadable on a CPU-only machine as well.
model = TabularSeqModel(**model_args)
model.load_state_dict(torch.load(CFG['MODEL_NAME'], map_location=device))
model.to(device)
model.eval()
outs = []
with torch.no_grad():
    for xcats, xnums, seqs, seq_lens in tqdm(test_ld, desc="Inference"):
        xcats, xnums, seqs, seq_lens = xcats.to(device), xnums.to(device), seqs.to(device), seq_lens.to(device)
        outs.append(model(xcats, xnums, seqs, seq_lens).cpu())

# One predicted click probability per test row, in the original row order.
test_preds = torch.cat(outs).numpy()

# Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit['clicked'] = test_preds

# Strip only the final extension: splitting on the first '.' would truncate
# names that contain extra dots (e.g. 'model_v1.2.pth') or start with './'.
file_name = os.path.splitext(CFG['MODEL_NAME'])[0]
submit.to_csv(f'./{file_name}.csv', index=False)

In [None]:
# Re-read the written submission file and plot its 'clicked' column as a quick sanity check.
pd.read_csv(f'./{file_name}.csv')['clicked'].plot()