In [1]:
# Gerekli kütüphanelerin kurulumu
!pip install -q transformers

In [2]:
# Google Drive bağlantısı
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os, gc, math, time, warnings
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
warnings.filterwarnings('ignore')

# ------------------- CONFIG -------------------
CFG = {
    # Genel
    'SEED': 42,
    'DEBUG': False,

    # Veri Yolları (Colab)
    'DATA_DIR': '/content/drive/MyDrive/data',           # CSV dosyaları
    'EMBEDDINGS_DIR': '/content/drive/MyDrive/embeddings', # Embedding dosyaları

    # Model Boyutları
    'H1': 2048,
    'H2': 1024,
    'DROPOUT1': 0.2,
    'DROPOUT2': 0.1,

    # Training
    'VAL_SIZE': 0.1,
    'BATCH_TRAIN': 64,     # Artırılmış batch size
    'BATCH_PRED': 128,
    'EPOCHS': 20,          # Daha uzun eğitim
    'LR': 5e-4,           # Başlangıç learning rate
    'MIN_LR': 1e-6,       # Minimum learning rate
    'WARMUP_EPOCHS': 2,    # Warmup dönemi
    'WD': 0.01,
    'LABEL_SMOOTH': 0.15,  # Artırılmış label smoothing
    'PATIENCE': 5,         # Daha uzun patience
    'CLIP_NORM': 1.0,
    'FINETUNE_STEPS': 3    # Daha fazla fine-tuning
}

# Seed
np.random.seed(CFG['SEED'])
torch.manual_seed(CFG['SEED'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CFG['SEED'])

# Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)
if device == 'cuda':
    print('GPU:', torch.cuda.get_device_name(0))


Device: cuda
GPU: Tesla T4


In [4]:
# ------------------- LOAD DATA & EMBEDDINGS -------------------
# CSV dosyaları
train = pd.read_csv(f"{CFG['DATA_DIR']}/train.csv")
test = pd.read_csv(f"{CFG['DATA_DIR']}/test.csv")
sample = pd.read_csv(f"{CFG['DATA_DIR']}/sample_submission.csv")

if CFG['DEBUG']:
    print('DEBUG MODE: Az veri kullanılıyor!')
    train = train.sample(1000, random_state=CFG['SEED']).reset_index(drop=True)
    test = test.sample(1000, random_state=CFG['SEED']).reset_index(drop=True)

# Label encoding
le = LabelEncoder()
y_all = le.fit_transform(train['label'].astype(str).values)

# Embeddings
print('Loading embeddings...')
X_train_embed = np.load(f"{CFG['EMBEDDINGS_DIR']}/X_train_multilingual_e5_large.npy", mmap_mode='r')
X_test_embed = np.load(f"{CFG['EMBEDDINGS_DIR']}/X_test_multilingual_e5_large.npy", mmap_mode='r')

print('Train shape:', train.shape)
print('Test shape:', test.shape)
print('Embedding shapes:', X_train_embed.shape, X_test_embed.shape)


Loading embeddings...
Train shape: (848237, 2)
Test shape: (217241, 2)
Embedding shapes: (848237, 1024) (217241, 1024)


In [9]:
# ------------------- MODEL CLASSES -------------------
class CosineHead(torch.nn.Module):
    def __init__(self, in_dim, num_classes, s=30.0):
        super().__init__()
        self.W = torch.nn.Parameter(torch.randn(in_dim, num_classes))
        self.s = s
        torch.nn.init.xavier_normal_(self.W)

    def forward(self, x):
        x = torch.nn.functional.normalize(x, dim=1)
        W = torch.nn.functional.normalize(self.W, dim=0)
        return self.s * (x @ W)

class ImprovedMLP(torch.nn.Module):
    def __init__(self, in_dim, num_classes, h1=2048, h2=1024, h3=512, d1=0.3, d2=0.2, d3=0.1):
        super().__init__()

        # First block
        self.block1 = torch.nn.Sequential(
            torch.nn.Linear(in_dim, h1),
            torch.nn.BatchNorm1d(h1),
            torch.nn.GELU(),
            torch.nn.Dropout(d1)
        )

        # Second block with residual
        self.block2 = torch.nn.Sequential(
            torch.nn.Linear(h1, h2),
            torch.nn.BatchNorm1d(h2),
            torch.nn.GELU(),
            torch.nn.Dropout(d2)
        )
        self.residual2 = torch.nn.Linear(h1, h2) if h1 != h2 else torch.nn.Identity()

        # Third block with residual
        self.block3 = torch.nn.Sequential(
            torch.nn.Linear(h2, h3),
            torch.nn.BatchNorm1d(h3),
            torch.nn.GELU(),
            torch.nn.Dropout(d3)
        )
        self.residual3 = torch.nn.Linear(h2, h3) if h2 != h3 else torch.nn.Identity()

        self.head = CosineHead(h3, num_classes)

    def forward(self, x):
        # Forward pass with residual connections
        x1 = self.block1(x)
        x2 = self.block2(x1) + self.residual2(x1)
        x3 = self.block3(x2) + self.residual3(x2)
        return self.head(x3)


In [10]:
# ------------------- TRAINING FUNCTIONS -------------------
def iterate_idx(idxs, batch=1024, shuffle=True, seed=CFG['SEED']):
    idx = np.array(idxs)
    if shuffle:
        rng = np.random.default_rng(seed)
        rng.shuffle(idx)
    for i in range(0, len(idx), batch):
        j = idx[i:i+batch]
        xb = torch.as_tensor(X_train_embed[j], dtype=torch.float32, device=device)
        yb = torch.as_tensor(y_all[j], dtype=torch.long, device=device)
        yield xb, yb

@torch.no_grad()
def eval_f1(idxs, batch=2048):
    model_clf.eval()
    preds = []
    for i in range(0, len(idxs), batch):
        j = idxs[i:i+batch]
        xb = torch.as_tensor(X_train_embed[j], dtype=torch.float32, device=device)
        logits = model_clf(xb)
        preds.append(torch.argmax(logits, dim=1).cpu().numpy())
    return f1_score(y_all[idxs], np.concatenate(preds), average='macro')


In [14]:
# ------------------- TRAINING -------------------
# Train-Val split
X_tr, X_val, y_tr, y_val = train_test_split(
    np.arange(X_train_embed.shape[0]),
    y_all,
    test_size=CFG['VAL_SIZE'],
    random_state=CFG['SEED'],
    stratify=y_all
)

# Model init
in_dim = X_train_embed.shape[1]
num_classes = len(le.classes_)
model_clf = ImprovedMLP(
    in_dim=in_dim,
    num_classes=num_classes,
    h1=CFG['H1'],
    h2=CFG['H2'],
    h3=512,  # Üçüncü katman boyutu
    d1=CFG['DROPOUT1'],
    d2=CFG['DROPOUT2'],
    d3=0.1  # Üçüncü katman dropout
).to(device)

# Class weights
cls_w_np = compute_class_weight('balanced', classes=np.arange(num_classes), y=y_all)
cls_w = torch.tensor(cls_w_np, dtype=torch.float32, device=device)

# Optimizer
opt = torch.optim.AdamW(model_clf.parameters(), lr=CFG['LR'], weight_decay=CFG['WD'])
scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))

# Cosine annealing with warmup scheduler
def get_lr_multiplier(epoch, warmup_epochs, total_epochs):
    if epoch < warmup_epochs:
        return epoch / warmup_epochs
    else:
        progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
        return 0.5 * (1 + math.cos(math.pi * progress))

# Training loop with improved scheduling and monitoring
best_f1, best_state, bad_epochs = -1.0, None, 0
print('Training ImprovedMLP...')

# Training statistics
train_losses = []
val_f1_scores = []

for ep in range(1, CFG['EPOCHS']+1):
    model_clf.train()
    t0 = time.time()
    epoch_losses = []

    # Learning rate scheduling
    lr_mult = get_lr_multiplier(ep-1, CFG['WARMUP_EPOCHS'], CFG['EPOCHS'])
    current_lr = CFG['LR'] * lr_mult
    current_lr = max(current_lr, CFG['MIN_LR'])
    for param_group in opt.param_groups:
        param_group['lr'] = current_lr

    it_train = iterate_idx(X_tr, batch=CFG['BATCH_TRAIN'], shuffle=True, seed=CFG['SEED']+ep)
    for xb, yb in it_train:
        opt.zero_grad(set_to_none=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
            # Mixup ile eğitim
            if np.random.random() < 0.5:  # %50 olasılıkla mixup uygula
                lam = np.random.beta(0.2, 0.2)
                index = torch.randperm(xb.size(0)).to(xb.device)
                mixed_x = lam * xb + (1 - lam) * xb[index]
                logits = model_clf(mixed_x)
                loss = lam * torch.nn.functional.cross_entropy(logits, yb, label_smoothing=CFG['LABEL_SMOOTH'], weight=cls_w) + \
                       (1 - lam) * torch.nn.functional.cross_entropy(logits, yb[index], label_smoothing=CFG['LABEL_SMOOTH'], weight=cls_w)
            else:
                # Normal eğitim
                logits = model_clf(xb)
                loss = torch.nn.functional.cross_entropy(logits, yb, label_smoothing=CFG['LABEL_SMOOTH'], weight=cls_w)

        epoch_losses.append(loss.item())

        scaler.scale(loss).backward()
        if CFG['CLIP_NORM'] is not None:
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model_clf.parameters(), CFG['CLIP_NORM'])
        scaler.step(opt)
        scaler.update()

    val_f1 = eval_f1(X_val, batch=CFG['BATCH_PRED'])
    dt = time.time() - t0
    print(f'[{ep:02d}/{CFG["EPOCHS"]}] Val F1: {val_f1:.4f} | {dt:.1f}s')

    if val_f1 > best_f1 + 1e-4:
        best_f1 = val_f1
        best_state = {k: v.detach().cpu().clone() for k, v in model_clf.state_dict().items()}
        bad_epochs = 0
    else:
        bad_epochs += 1
        if bad_epochs >= CFG['PATIENCE']:
            print(f'Early stopping at epoch {ep} (no improvement in {CFG["PATIENCE"]} epochs).')
            break

# En iyi modeli geri yükle
if best_state is not None:
    model_clf.load_state_dict(best_state)
print(f'Best Val F1: {best_f1:.4f}')


Training ImprovedMLP...
[01/20] Val F1: 0.0005 | 145.5s
[02/20] Val F1: 0.4102 | 142.6s
[03/20] Val F1: 0.4435 | 141.8s
[04/20] Val F1: 0.5020 | 141.3s
[05/20] Val F1: 0.5299 | 141.6s
[06/20] Val F1: 0.5525 | 141.6s
[07/20] Val F1: 0.5715 | 141.4s
[08/20] Val F1: 0.5879 | 141.5s
[09/20] Val F1: 0.5985 | 141.4s
[10/20] Val F1: 0.6099 | 141.4s
[11/20] Val F1: 0.6196 | 141.7s
[12/20] Val F1: 0.6312 | 142.1s
[13/20] Val F1: 0.6386 | 142.0s
[14/20] Val F1: 0.6474 | 142.0s
[15/20] Val F1: 0.6550 | 141.7s
[16/20] Val F1: 0.6594 | 141.9s
[17/20] Val F1: 0.6650 | 141.9s
[18/20] Val F1: 0.6681 | 140.9s
[19/20] Val F1: 0.6705 | 141.3s
[20/20] Val F1: 0.6709 | 141.5s
Best Val F1: 0.6709


In [16]:
# ------------------- FINAL FINETUNE & PREDICT -------------------
print('Short finetune on full train...')
all_idx = np.arange(X_train_embed.shape[0])
for step in range(CFG['FINETUNE_STEPS']):
    model_clf.train()
    for xb, yb in iterate_idx(all_idx, batch=CFG['BATCH_TRAIN'], shuffle=True, seed=CFG['SEED']+100+step):
        opt.zero_grad(set_to_none=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
            logits = model_clf(xb)
            loss = torch.nn.functional.cross_entropy(logits, yb, label_smoothing=CFG['LABEL_SMOOTH'], weight=cls_w)
        scaler.scale(loss).backward()
        if CFG['CLIP_NORM'] is not None:
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model_clf.parameters(), CFG['CLIP_NORM'])
        scaler.step(opt)
        scaler.update()

print('Predicting test...')
preds = []
model_clf.eval()
with torch.no_grad():
    for i in range(0, X_test_embed.shape[0], CFG['BATCH_PRED']):
        xb = torch.as_tensor(X_test_embed[i:i+CFG['BATCH_PRED']], dtype=torch.float32, device=device)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
            logits = model_clf(xb)
        preds.append(torch.argmax(logits, dim=1).cpu().numpy())

y_test_pred = np.concatenate(preds)
labels_pred = le.inverse_transform(y_test_pred)

# Submission dosyası oluştur
sub = sample.copy()
sub['label'] = labels_pred
sub_path = os.path.join(CFG['EMBEDDINGS_DIR'], 'submission.csv')
sub.to_csv(sub_path, index=False)
print(f'Submission dosyası kaydedildi: {sub_path}')
print('Bitti! 🎉')


Short finetune on full train...
Predicting test...
Submission dosyası kaydedildi: /content/drive/MyDrive/embeddings/submission.csv
Bitti! 🎉
