In [1]:
!nvidia-smi
import os
for r,_,fs in os.walk("/kaggle/input"):
    for f in fs:
        print(os.path.join(r,f))

Sat Aug 23 10:31:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [2]:
# Gerekli kütüphanelerin kurulumu
!pip install -q transformers sentencepiece

In [3]:
# ================= HEPSİBURADA KAGGLE PIPELINE (OPTIMIZED v4 - SINGLE GPU STABLE) =================
import os, re, unicodedata, gc, math, time, warnings, json
import numpy as np, pandas as pd, torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModel,
    get_cosine_schedule_with_warmup
)
from contextlib import contextmanager
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

# ------------------- CONFIG -------------------
CFG = dict(
    # General
    REQUIRE_GPU=True,
    SEED=42,
    DEBUG=False,  # use a small data sample for quick debugging

    # Data paths (Kaggle)
    DATA_DIR='/kaggle/input/hepsiburadadata',
    OUTPUT_DIR='/kaggle/working',
    TEXT_FALLBACK_COLS=["clean_address", "address", "text"],
    LABEL_COL="label",

    # Model & training
    MODELS=[
        "intfloat/multilingual-e5-large",  # only the main model for now
    ],
    NUM_FOLDS=5,        # cross-validation
    VAL_SIZE=0.10,      # per fold

    # Embedding (tuned)
    EMB_BATCH=16,       # smaller batch
    EMB_MAXLEN=96,      # shorter sequence
    SAVE_FP16=True,     # saves disk space

    # MLP (tuned)
    H1=1024, H2=512, H3=256,  # smaller model
    DROPOUT1=0.35, DROPOUT2=0.25, DROPOUT3=0.15,
    LR=2e-4,
    WARMUP_RATIO=0.05,  # warmup
    WD=0.01,            # L2 regularisation
    EPOCHS=15,          # fewer epochs
    BATCH_TRAIN=8,      # smaller batch
    LABEL_SMOOTH=0.1,   # regularisation
    PATIENCE=3,         # early stopping
    CLIP_NORM=1.0,      # gradient clipping

    # Ensemble & prediction
    FOLD_WEIGHTS=None,  # auto-weighted
    FINETUNE_STEPS=2,   # final fine-tune
    BATCH_PRED=32,      # prediction batch

    # Logging / progress
    SHOW_PROGRESS=True,
)

In [4]:
# ------------------- ENV & DEVICE -------------------
# Default the numeric/tokenizer thread pools to a single thread, leaving any
# value that is already present in the environment untouched.
os.environ.update({
    name: value
    for name, value in {
        "OMP_NUM_THREADS": "1",
        "OPENBLAS_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }.items()
    if name not in os.environ
})
torch.set_num_threads(1)
torch.backends.cudnn.benchmark = True

# Seed NumPy and PyTorch (CPU, plus all CUDA devices when available).
np.random.seed(CFG["SEED"])
torch.manual_seed(CFG["SEED"])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CFG["SEED"])

# Pick the compute device and, when the config demands it, insist on a GPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
if CFG["REQUIRE_GPU"]:
    assert device == "cuda", "GPU görünmüyor. Lütfen GPU runtime aç ve tekrar çalıştır."
print("Device:", device)
if device == "cuda":
    print("GPU Count:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}:", torch.cuda.get_device_name(i))

# ------------------- LOAD DATA -------------------
# Input files are resolved relative to the configured data directory.
TRAIN_CSV = os.path.join(CFG["DATA_DIR"], "train.csv")
TEST_CSV  = os.path.join(CFG["DATA_DIR"], "test.csv")
SAMPLE_SUB = os.path.join(CFG["DATA_DIR"], "sample_submission.csv")

train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)
sample = pd.read_csv(SAMPLE_SUB)

if CFG["DEBUG"]:
    print("DEBUG MODE: Az veri kullanılıyor!")
    # Cap the sample size at the frame length: DataFrame.sample raises a
    # ValueError when asked for more rows than exist (without replacement).
    train = train.sample(min(1000, len(train)), random_state=CFG["SEED"]).reset_index(drop=True)
    test = test.sample(min(1000, len(test)), random_state=CFG["SEED"]).reset_index(drop=True)

print("\nTrain shape:", train.shape, "Test shape:", test.shape)
print("Columns:", train.columns.tolist())

Device: cuda
GPU Count: 2
GPU 0: Tesla T4
GPU 1: Tesla T4

Train shape: (848237, 2) Test shape: (217241, 2)
Columns: ['address', 'label']


In [5]:
# ------------------- ADVANCED TEXT PREPROCESSING -------------------
# Temel temizlik
# Translation table folding Turkish-specific letters (both cases) to their
# lowercase ASCII counterparts.
turkish_map = str.maketrans(dict([
    ("ç", "c"), ("Ç", "c"),
    ("ğ", "g"), ("Ğ", "g"),
    ("ı", "i"), ("İ", "i"),
    ("ö", "o"), ("Ö", "o"),
    ("ş", "s"), ("Ş", "s"),
    ("ü", "u"), ("Ü", "u"),
]))

# Genişletilmiş kısaltma sözlüğü
abbrev_map = {
    # Whole-token abbreviation -> expansion table used by clean_address.
    #
    # clean_address folds text to ASCII and replaces punctuation with spaces
    # before the lookup, so:
    #   * every expansion below is written in ASCII, otherwise an expanded
    #     token ("dog" -> "doğu") would differ from the same word typed out in
    #     the input ("doğu" -> "dogu");
    #   * keys containing Turkish letters or punctuation cannot match cleaned
    #     tokens; they are kept so the key set stays unchanged.

    # Basic address terms
    "mah":"mahallesi", "mah.":"mahallesi", "mh":"mahallesi", "mh.":"mahallesi",
    "sok":"sokak", "sok.":"sokak", "sk":"sokak", "sk.":"sokak",
    "cad":"cadde", "cad.":"cadde", "caddesi":"cadde", "cd":"cadde", "cd.":"cadde",
    "bulvari":"bulvar", "bulvarı":"bulvar", "bulv":"bulvar", "blv":"bulvar",
    "apt":"apartman", "apt.":"apartman", "ap":"apartman", "ap.":"apartman",
    # Numbers
    "no":"numara", "no.":"numara", "no::":"numara", "no:":"numara",
    # Building details
    "daire":"daire", "d":"daire", "d.":"daire", "d:":"daire",
    "kat":"kat", "k":"kat", "k.":"kat", "k:":"kat",
    "blok":"blok", "bl":"blok", "bl.":"blok", "b":"blok", "b.":"blok",
    # Housing estate / complex
    "sitesi":"site", "st.":"site", "evleri":"evler", "konutlari":"konutlar",
    # Workplace
    "is":"is", "mrk":"merkez", "mrk.":"merkez", "merkezi":"merkez",
    "org.san.":"organize sanayi", "osb":"organize sanayi", "san.":"sanayi",
    # Region
    "böl":"bolge", "böl.":"bolge", "bolge":"bolge", "bolg.":"bolge",
    # Direction
    "kuz":"kuzey", "kuz.":"kuzey", "gun":"guney", "gün":"guney",
    "dog":"dogu", "doğ":"dogu", "bat":"bati", "bat.":"bati",
}

# Regex patterns
_re_nonword = re.compile(r"[^\w\s]")
_re_letter_digit = re.compile(r"([a-z])(\d)")
_re_digit_letter = re.compile(r"(\d)([a-z])")
_re_manydigits = re.compile(r"\d{5,}")
_re_manypunct = re.compile(r"[^\w\s]{2,}")
_re_space = re.compile(r"\s+")

def strip_accents(s: str) -> str:
    """Return ``str(s)`` NFKD-decomposed with all combining characters removed."""
    decomposed = unicodedata.normalize("NFKD", str(s))
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def clean_address(addr: str) -> str:
    """Normalise a raw address into a single-spaced token string.

    Steps, in order:
      1. strip accents, lowercase, and apply ``turkish_map``;
      2. replace punctuation with spaces;
      3. insert a space at every letter/digit boundary;
      4. replace runs of five or more digits with the token ``NUM``;
      5. collapse whitespace and trim;
      6. replace each token found in ``abbrev_map`` (after removing trailing
         dots from the lookup key) with its expansion.

    Returns the processed tokens joined by single spaces.
    """
    # Base normalisation
    text = strip_accents(addr).lower().translate(turkish_map)

    # Punctuation clean-up (single characters first, then multi-character runs)
    text = _re_manypunct.sub(" ", _re_nonword.sub(" ", text))

    # Space between letters and digits, in both directions
    text = _re_digit_letter.sub(r"\1 \2", _re_letter_digit.sub(r"\1 \2", text))

    # Standardise long digit runs
    text = _re_manydigits.sub(" NUM ", text)

    # Normalise whitespace
    text = _re_space.sub(" ", text).strip()

    # Expand abbreviations; unknown tokens pass through unchanged
    return " ".join(abbrev_map.get(tok.rstrip("."), tok) for tok in text.split())

# Choose the address text column: the first configured candidate that exists in `train`.
TEXT_COL = None
for col in CFG["TEXT_FALLBACK_COLS"]:
    if col not in train.columns:
        continue
    TEXT_COL = col
    break
else:
    raise ValueError("Adres metni için bir kolon bulunamadı. TEXT_FALLBACK_COLS'a göre (clean_address/address/text) bekleniyordu.")

# Create the cleaned address column unless the selected column already is 'clean_address'.
if TEXT_COL != "clean_address":
    print(f"'{TEXT_COL}' üzerinden gelişmiş temizlik yapılıyor -> 'clean_address'")
    train["clean_address"] = train[TEXT_COL].astype(str).apply(clean_address)
    test["clean_address"]  = test[TEXT_COL].astype(str).apply(clean_address)
    TEXT_COL = "clean_address"

LABEL_COL = CFG["LABEL_COL"]

# Encode the labels (cast to str first) as integer class ids.
le = LabelEncoder()
y_all = le.fit_transform(train[LABEL_COL].astype(str).values)
print("\nUnique labels:", len(le.classes_))

'address' üzerinden gelişmiş temizlik yapılıyor -> 'clean_address'

Unique labels: 10390


In [6]:
# ------------------- EMBEDDING MODEL -------------------
@contextmanager
def _tok_parallel(on=True):
    """Tokenizer paralelleştirme yönetimi"""
    old = os.environ.get("TOKENIZERS_PARALLELISM", "false")
    try:
        os.environ["TOKENIZERS_PARALLELISM"] = "true" if on else "false"
        yield
    finally:
        os.environ["TOKENIZERS_PARALLELISM"] = old

class FastEmbeddingModel:
    """Inference-only wrapper that turns texts into L2-normalised pooled embeddings.

    The encoder and tokenizer are loaded with ``AutoModel`` / ``AutoTokenizer``.
    When more than one CUDA device is visible the encoder is wrapped in
    ``torch.nn.DataParallel``.

    Attributes:
        tokenizer: tokenizer loaded for ``model_name``.
        model: the encoder (possibly wrapped in ``DataParallel``), in eval mode.
        device: device the inputs are moved to.
        attention_weights: 1-D tensor of length ``hidden_size`` with every entry
            equal to ``1 / hidden_size``. Because the weights are uniform they
            only rescale the pooled vector, and that scale is removed by the
            final L2 normalisation in :meth:`embed`.
    """

    def __init__(self, model_name, device="cuda"):
        """Load tokenizer and encoder for ``model_name`` and move the encoder to ``device``."""
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Spread the model over all GPUs with DataParallel when several are visible
        base_model = AutoModel.from_pretrained(model_name)
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs!")
            self.model = torch.nn.DataParallel(base_model).to(device)
        else:
            self.model = base_model.to(device)

        self.device = device
        self.model.eval()

        # Uniform per-dimension weights sized from the model's hidden size.
        # Same values as softmax(ones), but a plain tensor with no autograd graph attached.
        hidden_size = base_model.config.hidden_size
        self.attention_weights = torch.ones(hidden_size, device=device) / hidden_size

    def weighted_pool(self, last_hidden, attn_mask):
        """Masked mean over axis 1 of ``last_hidden`` after scaling by ``attention_weights``.

        ``attn_mask`` is expanded with a trailing axis and cast to float; positions
        where it is 0 contribute nothing. The denominator is clamped to avoid
        division by zero for rows whose mask is all zeros.
        """
        mask = attn_mask.unsqueeze(-1).float()
        weighted = last_hidden * self.attention_weights
        return (weighted * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

    @torch.inference_mode()
    def embed(self, texts, batch=32, max_len=128, show_progress=True):
        """Embed ``texts`` and return a float array with one L2-normalised row per text.

        Args:
            texts: a single string or a sequence of strings.
            batch: batch size; doubled when more than one CUDA device is visible.
            max_len: tokenizer truncation length.
            show_progress: show a tqdm progress bar.

        Returns:
            ``numpy.ndarray`` of shape ``(len(texts), hidden_size)`` in the same
            order as ``texts``; shape ``(0, hidden_size)`` for empty input.
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)
        if not texts:
            # Nothing to encode: skip the tokenizer/model entirely.
            return np.zeros((0, self.attention_weights.shape[0]), dtype=np.float32)

        # Double the batch size when two or more GPUs are available
        if torch.cuda.device_count() > 1:
            batch = batch * 2
            print(f"Increased batch size to {batch} for multi-GPU")

        # Sort by character length so each batch holds texts of similar size
        lengths = np.array([len(t) for t in texts], dtype=np.int32)
        order = np.argsort(lengths)
        restore = np.empty_like(order); restore[order] = np.arange(len(order))
        texts_sorted = [texts[i] for i in order]

        encoder = self.model.module if hasattr(self.model, "module") else self.model
        desc = f"Embedding ({encoder.config.model_type})"
        # Mixed precision only makes sense on CUDA; elsewhere autocast is switched off.
        use_amp = str(self.device).startswith("cuda")

        vecs = []
        with _tok_parallel(True):
            starts = range(0, len(texts_sorted), batch)
            for step, start in enumerate(tqdm(starts, disable=not show_progress, desc=desc)):
                # Tokenize per batch so padding only reaches the longest text of
                # THIS batch. Tokenizing the whole corpus at once with
                # padding=True pads every row to the global maximum and makes
                # the length sort above pointless.
                encoded = self.tokenizer(
                    texts_sorted[start:start + batch],
                    padding=True,
                    truncation=True,
                    max_length=max_len,
                    return_tensors="pt"
                )
                inputs = {k: v.to(self.device) for k, v in encoded.items()}

                # Mixed precision
                with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
                    outputs = self.model(**inputs)
                    pooled = self.weighted_pool(
                        outputs.last_hidden_state,
                        inputs["attention_mask"]
                    )

                vecs.append(pooled.cpu())

                # Memory clean-up, once every 100 batches (counted in batches,
                # not in sample offsets)
                del encoded, inputs, outputs, pooled
                if use_amp and (step + 1) % 100 == 0:
                    torch.cuda.empty_cache()
                    gc.collect()

        X = torch.cat(vecs, dim=0).numpy()
        X = X[restore]  # back to the caller's original order

        # L2 normalize
        X /= np.linalg.norm(X, axis=1, keepdims=True).clip(1e-12)
        return X

def fast_embeddings(texts, model, fname, batch=32, max_len=128):
    """Return embeddings for ``texts``, cached as a ``.npy`` file in ``CFG["OUTPUT_DIR"]``.

    Args:
        texts: a string or a sequence of strings, passed to ``model.embed``.
        model: object exposing ``embed(texts, batch=, max_len=, show_progress=)``.
        fname: cache file name, joined onto ``CFG["OUTPUT_DIR"]``.
        batch: batch size forwarded to ``model.embed``.
        max_len: truncation length forwarded to ``model.embed``.

    Returns:
        A read-only memory-mapped array loaded from the cache file.

    The cache is reused only when its row count matches the number of texts;
    this guards against, e.g., a file written during a DEBUG run being loaded
    for the full dataset. Only the row count is checked: delete the file
    manually after changing the preprocessing or the model.
    """
    fname = os.path.join(CFG["OUTPUT_DIR"], fname)
    n_expected = 1 if isinstance(texts, str) else len(texts)

    if os.path.exists(fname):
        cached = np.load(fname, mmap_mode="r")
        if cached.ndim >= 1 and cached.shape[0] == n_expected:
            print(f"Loading cached embeddings: {fname}")
            return cached
        print(f"Ignoring stale cache {fname}: shape {cached.shape}, expected {n_expected} rows")
        del cached

    print(f"Creating embeddings -> {fname}")
    X = model.embed(
        texts,
        batch=batch,
        max_len=max_len,
        show_progress=CFG.get("SHOW_PROGRESS", True),
    )

    # Honour the SAVE_FP16 switch (half precision halves the file size).
    save_dtype = np.float16 if CFG.get("SAVE_FP16", True) else np.float32

    # Write to a temporary file and move it into place, so an interrupted save
    # never leaves a truncated file that a later run would treat as a valid cache.
    tmp_fname = fname + ".tmp.npy"
    np.save(tmp_fname, X.astype(save_dtype))
    os.replace(tmp_fname, fname)
    return np.load(fname, mmap_mode="r")

# Main embedding computation
print("Starting embedding computation...")
print(f"Using {torch.cuda.device_count()} GPUs")
embeddings_train = []
embeddings_test = []

for model_name in CFG["MODELS"]:
    # File-name tag: last path component of the model id, dashes -> underscores
    safe_model = model_name.rsplit("/", 1)[-1].replace("-", "_")
    print(f"\nProcessing model: {model_name}")

    model = FastEmbeddingModel(model_name, device)

    # Train embeddings
    X_train = fast_embeddings(
        train[TEXT_COL].astype(str).tolist(),
        model,
        f"X_train_{safe_model}.npy",
        batch=CFG["EMB_BATCH"],
        max_len=CFG["EMB_MAXLEN"],
    )
    print(f"Train embeddings shape: {X_train.shape}")
    embeddings_train.append(X_train)

    # Test embeddings
    X_test = fast_embeddings(
        test[TEXT_COL].astype(str).tolist(),
        model,
        f"X_test_{safe_model}.npy",
        batch=CFG["EMB_BATCH"],
        max_len=CFG["EMB_MAXLEN"],
    )
    print(f"Test embeddings shape: {X_test.shape}")
    embeddings_test.append(X_test)

    # Memory clean-up before the next model is loaded
    del model
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(2)  # short pause between models (original note: "GPU cool-down")

print("\nEmbedding shapes:")
for i, model_name in enumerate(CFG["MODELS"]):
    print(f"- {model_name.split('/')[-1]}: {embeddings_train[i].shape}")

# GPU memory usage report
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"\nGPU {i} Memory Usage:")
        print(f"Allocated: {torch.cuda.memory_allocated(i)/1024**3:.2f} GB")
        print(f"Cached: {torch.cuda.memory_reserved(i)/1024**3:.2f} GB")

Starting embedding computation...
Using 2 GPUs

Processing model: intfloat/multilingual-e5-large
Loading model: intfloat/multilingual-e5-large


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

2025-08-23 10:33:17.503849: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755945197.830846      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755945197.923422      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Using 2 GPUs!
Creating embeddings -> /kaggle/working/X_train_multilingual_e5_large.npy
Increased batch size to 32 for multi-GPU


Embedding (xlm-roberta):   0%|          | 0/26508 [00:00<?, ?it/s]

Train embeddings shape: (848237, 1024)
Creating embeddings -> /kaggle/working/X_test_multilingual_e5_large.npy
Increased batch size to 32 for multi-GPU


Embedding (xlm-roberta):   0%|          | 0/6789 [00:00<?, ?it/s]

Test embeddings shape: (217241, 1024)

Embedding shapes:
- multilingual-e5-large: (848237, 1024)

GPU 0 Memory Usage:
Allocated: 0.01 GB
Cached: 0.01 GB

GPU 1 Memory Usage:
Allocated: 0.01 GB
Cached: 0.02 GB
