In [5]:
import os
import re
import random
import warnings
import pickle
import numpy as np
import pandas as pd
from collections import Counter

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoConfig,
    get_cosine_schedule_with_warmup,
    DebertaV2Config,
    DebertaV2Model,
    PreTrainedTokenizerFast
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from safetensors.torch import load_file

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

CONFIG = {
    'train_path': '/kaggle/input/nn-26-review-sentiment-classification/train.csv',
    'test_path': '/kaggle/input/nn-26-review-sentiment-classification/test.csv',
    'save_dir': '/kaggle/working/',
    'transformer_model': 'microsoft/deberta-v3-large',
    'pt_max_len': 384,
    'pt_batch_size': 4,
    'pt_accumulation_steps': 8,
    'val_size': 0.15,
    'pt_epochs': 6,
    'pt_learning_rate': 2e-5,
    'pt_weight_decay': 0.01,
    'pt_warmup_ratio': 0.1,
    'pt_dropout': 0.3,
    'llrd_decay': 0.95,
    'tf_max_features': 30000,
    'tf_max_len': 200,
    'tf_embedding_dim': 300,
    'tf_batch_size': 16,  # REDUCED from 32
    'tf_epochs': 30,
    'tf_learning_rate': 0.001,
    'tf_lstm_units': 128,
    'tf_gru_units': 128,
    'tf_cnn_filters': 128,
    'tf_dropout_rate': 0.4,
    'transformer_heads': 8,
    'transformer_ff_dim': 512,
    'transformer_blocks': 2,  # REDUCED from 4
    'label_smoothing': 0.1,
    'focal_gamma': 2.0,
    'ordinal_alpha': 1.5,
    'augment_minority': True,
    'target_samples': 2500,
    'seed': 42,
}

CLASS_LABELS = ['Very bad', 'Bad', 'Good', 'Very good', 'Excellent']
LABEL_TO_IDX = {label: idx for idx, label in enumerate(CLASS_LABELS)}
IDX_TO_LABEL = {idx: label for idx, label in enumerate(CLASS_LABELS)}
NUM_CLASSES = len(CLASS_LABELS)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CONFIG['seed'])

def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    contractions = {
        "won't": "will not", "can't": "cannot", "n't": " not",
        "'re": " are", "'s": " is", "'d": " would",
        "'ll": " will", "'ve": " have", "'m": " am"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    text = re.sub(r'[^a-zA-Z0-9\s!?.,;:\'-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def get_layerwise_optimizer(model, base_lr, weight_decay, llrd_decay=0.95):
    no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
    n_layers = model.config.num_hidden_layers
    optimizer_grouped_parameters = []
    
    optimizer_grouped_parameters.extend([
        {
            'params': [p for n, p in model.classifier.named_parameters() 
                      if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay,
            'lr': base_lr
        },
        {
            'params': [p for n, p in model.classifier.named_parameters() 
                      if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': base_lr
        }
    ])
    
    if hasattr(model, 'layer_norm'):
        optimizer_grouped_parameters.append({
            'params': model.layer_norm.parameters(),
            'weight_decay': 0.0,
            'lr': base_lr * 0.95
        })
    
    for layer_idx in range(n_layers - 1, -1, -1):
        layer_lr = base_lr * (llrd_decay ** (n_layers - layer_idx))
        layer = model.transformer.encoder.layer[layer_idx]
        
        optimizer_grouped_parameters.extend([
            {
                'params': [p for n, p in layer.named_parameters() 
                          if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
                'lr': layer_lr
            },
            {
                'params': [p for n, p in layer.named_parameters() 
                          if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                'lr': layer_lr
            }
        ])
    
    if hasattr(model.transformer, 'embeddings'):
        emb_lr = base_lr * (llrd_decay ** (n_layers + 1))
        optimizer_grouped_parameters.extend([
            {
                'params': [p for n, p in model.transformer.embeddings.named_parameters() 
                          if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
                'lr': emb_lr
            },
            {
                'params': [p for n, p in model.transformer.embeddings.named_parameters() 
                          if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                'lr': emb_lr
            }
        ])
    
    return AdamW(optimizer_grouped_parameters)

class TextAugmenter:
    def __init__(self):
        self.synonyms = {
            'good': ['great', 'nice', 'fine', 'excellent', 'wonderful', 'fantastic'],
            'bad': ['terrible', 'awful', 'poor', 'horrible', 'dreadful', 'lousy'],
            # ... (rest of synonyms)
        }
    
    def synonym_replacement(self, text, n=2):
        words = text.split()
        new_words = words.copy()
        replacements = 0
        indices = list(range(len(words)))
        random.shuffle(indices)
        
        for i in indices:
            word_lower = words[i].lower()
            if word_lower in self.synonyms and replacements < n:
                synonym = random.choice(self.synonyms[word_lower])
                new_words[i] = synonym
                replacements += 1
        
        return ' '.join(new_words)
    
    def augment(self, text):
        return self.synonym_replacement(text, n=random.randint(1, 3))

def augment_to_target(df, text_col, label_col, target_samples=2500):
    augmenter = TextAugmenter()
    class_counts = df[label_col].value_counts()
    print("\nOriginal class distribution:")
    print(class_counts)
    
    augmented_data = []
    for label in class_counts.index:
        label_data = df[df[label_col] == label]
        current_count = len(label_data)
        samples_needed = target_samples - current_count
        
        if samples_needed > 0:
            print(f"Augmenting '{label}': {current_count} → {target_samples} (+{samples_needed})")
            for _ in range(samples_needed):
                sample = label_data.sample(1).iloc[0]
                aug_text = augmenter.augment(sample[text_col])
                augmented_data.append({
                    text_col: aug_text,
                    label_col: label,
                    'id': -1
                })
    
    if augmented_data:
        aug_df = pd.DataFrame(augmented_data)
        df = pd.concat([df, aug_df], ignore_index=True)
    
    return df

class CombinedOrdinalFocalLoss(nn.Module):
    def __init__(self, alpha=None, focal_gamma=2.0, ordinal_alpha=1.5, 
                 focal_weight=0.7, label_smoothing=0.1):
        super().__init__()
        self.alpha = alpha
        self.focal_gamma = focal_gamma
        self.ordinal_alpha = ordinal_alpha
        self.focal_weight = focal_weight
        self.ordinal_weight = 1.0 - focal_weight
        self.label_smoothing = label_smoothing
    
    def focal_component(self, logits, targets):
        probs = F.softmax(logits, dim=-1)
        pt = probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        focal_weight = (1 - pt) ** self.focal_gamma
        
        ce_loss = F.cross_entropy(
            logits, targets, 
            reduction='none',
            label_smoothing=self.label_smoothing
        )
        
        if self.alpha is not None:
            alpha_t = self.alpha.gather(0, targets)
            focal_loss = alpha_t * focal_weight * ce_loss
        else:
            focal_loss = focal_weight * ce_loss
        
        return focal_loss.mean()
    
    def ordinal_component(self, logits, targets):
        probs = F.softmax(logits, dim=-1)
        num_classes = logits.size(1)
        
        ordinal_loss = 0
        for i in range(num_classes):
            distance = torch.abs(i - targets).float()
            ordinal_loss += -torch.log(1 - probs[:, i] + 1e-8) * (distance ** self.ordinal_alpha)
        
        return ordinal_loss.mean()
    
    def forward(self, logits, targets):
        focal = self.focal_component(logits, targets)
        ordinal = self.ordinal_component(logits, targets)
        return self.focal_weight * focal + self.ordinal_weight * ordinal

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class DeBERTaClassifier(nn.Module):
    def __init__(self, transformer_model, num_classes, dropout=0.3):
        super().__init__()
        
        self.config = DebertaV2Config(
            architectures=["DebertaV2Model"],
            attention_probs_dropout_prob=0.1,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            hidden_size=1024,
            initializer_range=0.02,
            intermediate_size=4096,
            layer_norm_eps=1e-7,
            legacy=True,
            max_position_embeddings=512,
            max_relative_positions=-1,
            model_type="deberta-v2",
            norm_rel_ebd="layer_norm",
            num_attention_heads=16,
            num_hidden_layers=24,
            pad_token_id=0,
            pooler_dropout=0,
            pooler_hidden_act="gelu",
            pooler_hidden_size=1024,
            pos_att_type=["p2c", "c2p"],
            position_biased_input=False,
            position_buckets=256,
            relative_attention=True,
            share_att_key=True,
            type_vocab_size=0,
            vocab_size=128100,
            torch_dtype="float32"
        )
        
        self.transformer = DebertaV2Model(self.config)
        state_dict = load_file("/kaggle/input/deberta-dataset/out/model.safetensors")
        self.transformer.load_state_dict(state_dict)
        
        hidden_size = self.config.hidden_size
        self.dropouts = nn.ModuleList([nn.Dropout(dropout) for _ in range(5)])
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )
    
    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
            pooled = outputs.pooler_output
        else:
            last_hidden = outputs.last_hidden_state
            attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
            sum_hidden = torch.sum(last_hidden * attention_mask_expanded, 1)
            sum_mask = attention_mask_expanded.sum(1).clamp(min=1e-9)
            pooled = sum_hidden / sum_mask
        
        pooled = self.layer_norm(pooled)
        
        logits = torch.zeros(pooled.size(0), self.classifier[-1].out_features).to(pooled.device)
        for dropout in self.dropouts:
            logits += self.classifier(dropout(pooled))
        logits /= len(self.dropouts)
        
        return logits

def train_epoch_pytorch(model, dataloader, optimizer, scheduler, criterion, device, accumulation_steps):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    optimizer.zero_grad()
    
    for step, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss = loss / accumulation_steps
        
        loss.backward()
        
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
        
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = balanced_accuracy_score(all_labels, all_preds)
    
    return avg_loss, accuracy

def validate_pytorch(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = balanced_accuracy_score(all_labels, all_preds)
    
    return avg_loss, accuracy, all_preds

def predict_pytorch(model, dataloader, device):
    model.eval()
    all_probs = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(input_ids, attention_mask)
            probs = F.softmax(outputs, dim=1)
            all_probs.extend(probs.cpu().numpy())
    
    return np.array(all_probs)

# TRAIN DEBERTA MODEL
print("=" * 80)
print("TRAINING DeBERTa TRANSFORMER")
print("=" * 80)

os.makedirs(CONFIG['save_dir'], exist_ok=True)

print("\n[1/2] Loading and preprocessing data...")
train_df = pd.read_csv(CONFIG['train_path'])
test_df = pd.read_csv(CONFIG['test_path'])

train_df['cleaned_text'] = train_df['text'].apply(clean_text)
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

le = LabelEncoder()
le.fit(CLASS_LABELS)
train_df['label_idx'] = le.transform(train_df['review'])

if CONFIG['augment_minority']:
    print("\n[2/2] Augmenting data...")
    train_df = augment_to_target(train_df, 'cleaned_text', 'review', CONFIG['target_samples'])
    train_df['label_idx'] = le.transform(train_df['review'])

class_counts = np.bincount(train_df['label_idx'].values, minlength=NUM_CLASSES)
class_weights = 1.0 / (class_counts + 1e-6)
class_weights = class_weights / class_weights.sum() * NUM_CLASSES
class_weights_pt = torch.tensor(class_weights, dtype=torch.float32).to(DEVICE)

np.save(os.path.join(CONFIG['save_dir'], 'label_classes.npy'), le.classes_)

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/kaggle/input/deberta-dataset/out/tokenizer.json")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.save_pretrained(CONFIG['save_dir'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['cleaned_text'].values,
    train_df['label_idx'].values,
    test_size=CONFIG['val_size'],
    stratify=train_df['label_idx'].values,
    random_state=CONFIG['seed']
)

train_dataset = TextDataset(train_texts, train_labels, tokenizer, CONFIG['pt_max_len'])
val_dataset = TextDataset(val_texts, val_labels, tokenizer, CONFIG['pt_max_len'])
test_dataset = TextDataset(
    test_df['cleaned_text'].values,
    np.zeros(len(test_df)),
    tokenizer,
    CONFIG['pt_max_len']
)

sample_weights = class_weights_pt[train_labels].cpu().numpy()
sampler = WeightedRandomSampler(sample_weights, len(sample_weights))

train_loader = DataLoader(train_dataset, batch_size=CONFIG['pt_batch_size'], sampler=sampler)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['pt_batch_size'] * 2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['pt_batch_size'] * 2, shuffle=False)

model = DeBERTaClassifier(
    CONFIG['transformer_model'],
    NUM_CLASSES,
    dropout=CONFIG['pt_dropout']
).to(DEVICE)

optimizer = get_layerwise_optimizer(
    model, 
    base_lr=CONFIG['pt_learning_rate'],
    weight_decay=CONFIG['pt_weight_decay'],
    llrd_decay=CONFIG['llrd_decay']
)

total_steps = len(train_loader) * CONFIG['pt_epochs'] // CONFIG['pt_accumulation_steps']
warmup_steps = int(total_steps * CONFIG['pt_warmup_ratio'])
scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

criterion = CombinedOrdinalFocalLoss(
    alpha=class_weights_pt,
    focal_gamma=CONFIG['focal_gamma'],
    ordinal_alpha=CONFIG['ordinal_alpha'],
    focal_weight=0.7,
    label_smoothing=CONFIG['label_smoothing']
)

best_val_acc = 0
patience = 3
patience_counter = 0

print("\nStarting training...")
for epoch in range(CONFIG['pt_epochs']):
    train_loss, train_acc = train_epoch_pytorch(
        model, train_loader, optimizer, scheduler,
        criterion, DEVICE, CONFIG['pt_accumulation_steps']
    )
    
    val_loss, val_acc, _ = validate_pytorch(model, val_loader, criterion, DEVICE)
    
    print(f"Epoch {epoch+1}/{CONFIG['pt_epochs']} | "
          f"Train Loss: {train_loss:.4f} | Train BA: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val BA: {val_acc:.4f}")
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), 
                  os.path.join(CONFIG['save_dir'], 'best_deberta.pt'))
        print(f"  ✓ New best model saved! BA: {val_acc:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("  ⚠ Early stopping!")
            break

model.load_state_dict(torch.load(os.path.join(CONFIG['save_dir'], 'best_deberta.pt')))
test_probs_deberta = predict_pytorch(model, test_loader, DEVICE)

print(f"\n✓ DeBERTa Best Val BA: {best_val_acc:.4f}")

# Save DeBERTa predictions
np.save(os.path.join(CONFIG['save_dir'], 'deberta_probs.npy'), test_probs_deberta)

# FREE GPU MEMORY - CRITICAL!
del model, optimizer, scheduler, criterion
del train_loader, val_loader, test_loader
del train_dataset, val_dataset, test_dataset
torch.cuda.empty_cache()
import gc
gc.collect()

print("\n✓ GPU memory cleared for TensorFlow models")


PyTorch version: 2.6.0+cu124
CUDA available: True
Using device: cuda
TRAINING DeBERTa TRANSFORMER

[1/2] Loading and preprocessing data...

[2/2] Augmenting data...

Original class distribution:
review
Very good    2469
Excellent    2335
Good         1024
Bad           648
Very bad      524
Name: count, dtype: int64
Augmenting 'Very good': 2469 → 2500 (+31)
Augmenting 'Excellent': 2335 → 2500 (+165)
Augmenting 'Good': 1024 → 2500 (+1476)
Augmenting 'Bad': 648 → 2500 (+1852)
Augmenting 'Very bad': 524 → 2500 (+1976)

Starting training...
Epoch 1/6 | Train Loss: 1.1051 | Train BA: 0.5532 | Val Loss: 0.8604 | Val BA: 0.7179
  ✓ New best model saved! BA: 0.7179
Epoch 2/6 | Train Loss: 0.6525 | Train BA: 0.7991 | Val Loss: 0.7625 | Val BA: 0.7824
  ✓ New best model saved! BA: 0.7824
Epoch 3/6 | Train Loss: 0.4422 | Train BA: 0.8776 | Val Loss: 0.6465 | Val BA: 0.8256
  ✓ New best model saved! BA: 0.8256
Epoch 4/6 | Train Loss: 0.2935 | Train BA: 0.9259 | Val Loss: 0.8292 | Val BA: 0.8293
  

In [9]:
import os
import re
import random
import warnings
import pickle
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Set memory growth for TensorFlow to prevent OOM
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except:
        pass

print(f"TensorFlow version: {tf.__version__}")

# Use existing CONFIG from previous cell
CLASS_LABELS = ['Very bad', 'Bad', 'Good', 'Very good', 'Excellent']
LABEL_TO_IDX = {label: idx for idx, label in enumerate(CLASS_LABELS)}
IDX_TO_LABEL = {idx: label for idx, label in enumerate(CLASS_LABELS)}
NUM_CLASSES = len(CLASS_LABELS)

CONFIG = {
    'train_path': '/kaggle/input/nn-26-review-sentiment-classification/train.csv',
    'test_path': '/kaggle/input/nn-26-review-sentiment-classification/test.csv',
    'save_dir': '/kaggle/working',
    'tf_max_features': 30000,
    'tf_max_len': 200,
    'tf_embedding_dim': 200,  # REDUCED from 300
    'tf_batch_size': 16,  # REDUCED from 32
    'tf_epochs': 30,
    'tf_learning_rate': 0.001,
    'tf_lstm_units': 96,  # REDUCED from 128
    'tf_gru_units': 96,  # REDUCED from 128
    'tf_cnn_filters': 96,  # REDUCED from 128
    'tf_dropout_rate': 0.4,
    'transformer_heads': 8,
    'transformer_ff_dim': 256,  # REDUCED from 512
    'transformer_blocks': 2,  # REDUCED from 4
    'seed': 42,
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(CONFIG['seed'])

def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    contractions = {
        "won't": "will not", "can't": "cannot", "n't": " not",
        "'re": " are", "'s": " is", "'d": " would",
        "'ll": " will", "'ve": " have", "'m": " am"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    text = re.sub(r'[^a-zA-Z0-9\s!?.,;:\'-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

class BahdanauAttention(layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.V = layers.Dense(1)
    
    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
    
    def get_config(self):
        config = super().get_config()
        config.update({"units": self.units})
        return config

class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.d_model = d_model
        
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        
        pos_encoding = np.zeros((max_len, d_model))
        pos_encoding[:, 0::2] = np.sin(position * div_term)
        pos_encoding[:, 1::2] = np.cos(position * div_term)
        
        self.pos_encoding = tf.constant(pos_encoding, dtype=tf.float32)
    
    def call(self, x):
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:seq_len, :]
    
    def get_config(self):
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads, 
            key_dim=embed_dim // num_heads,
            dropout=dropout_rate
        )
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation='gelu'),
            layers.Dropout(dropout_rate),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
    
    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        
        return out2
    
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout_rate": self.dropout_rate
        })
        return config

def create_transformer_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'])(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    x = PositionalEncoding(CONFIG['tf_max_len'], CONFIG['tf_embedding_dim'])(x)
    
    for _ in range(CONFIG['transformer_blocks']):
        x = TransformerBlock(
            embed_dim=CONFIG['tf_embedding_dim'],
            num_heads=CONFIG['transformer_heads'],
            ff_dim=CONFIG['transformer_ff_dim'],
            dropout_rate=CONFIG['tf_dropout_rate'] * 0.5
        )(x)
    
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)
    x = layers.concatenate([avg_pool, max_pool])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    
    return keras.Model(inputs=inputs, outputs=outputs, name='Transformer')

def create_bilstm_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'], mask_zero=True)(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    x = layers.Bidirectional(
        layers.LSTM(CONFIG['tf_lstm_units'], return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    x = layers.Bidirectional(
        layers.LSTM(CONFIG['tf_lstm_units'] // 2, return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)
    x = layers.concatenate([avg_pool, max_pool])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    
    return keras.Model(inputs=inputs, outputs=outputs, name='BiLSTM')

def create_cnn_bilstm_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'])(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    conv_blocks = []
    for kernel_size in [2, 3, 4]:  # REDUCED from [2,3,4,5]
        conv = layers.Conv1D(CONFIG['tf_cnn_filters'], kernel_size, activation='relu', padding='same')(x)
        conv = layers.BatchNormalization()(conv)
        conv = layers.GlobalMaxPooling1D()(conv)
        conv_blocks.append(conv)
    
    cnn_features = layers.concatenate(conv_blocks)
    
    lstm_out = layers.Bidirectional(
        layers.LSTM(CONFIG['tf_lstm_units'], return_sequences=True, recurrent_dropout=0)
    )(x)
    lstm_out = layers.GlobalAveragePooling1D()(lstm_out)
    
    x = layers.concatenate([cnn_features, lstm_out])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    
    return keras.Model(inputs=inputs, outputs=outputs, name='CNN_BiLSTM')

def create_gru_attention_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'], mask_zero=True)(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    x = layers.Bidirectional(
        layers.GRU(CONFIG['tf_gru_units'], return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    gru_out = layers.Bidirectional(
        layers.GRU(CONFIG['tf_gru_units'] // 2, return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    
    query = layers.Lambda(lambda x: x[:, -1, :])(gru_out)
    context_vector, _ = BahdanauAttention(CONFIG['tf_gru_units'])(query, gru_out)
    
    avg_pool = layers.GlobalAveragePooling1D()(gru_out)
    max_pool = layers.GlobalMaxPooling1D()(gru_out)
    
    x = layers.concatenate([context_vector, avg_pool, max_pool])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    
    return keras.Model(inputs=inputs, outputs=outputs, name='GRU_Attention')

class BalancedAccuracyCallback(keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data
        self.best_ba = 0.0
    
    def on_epoch_end(self, epoch, logs=None):
        X_val, y_val = [], []
        for x, y in self.validation_data:
            X_val.append(x.numpy())
            y_val.append(y.numpy())
        X_val = np.concatenate(X_val)
        y_val = np.concatenate(y_val)
        
        y_pred = self.model.predict(X_val, verbose=0)
        y_pred_classes = np.argmax(y_pred, axis=1)
        
        ba = balanced_accuracy_score(y_val, y_pred_classes)
        logs['val_balanced_accuracy'] = ba
        
        if ba > self.best_ba:
            self.best_ba = ba
            print(f" - val_balanced_accuracy: {ba:.4f} (best)")

# TRAIN TENSORFLOW MODELS
print("=" * 80)
print("TRAINING TensorFlow MODELS")
print("=" * 80)

# Load data
train_df = pd.read_csv(CONFIG['train_path'])
test_df = pd.read_csv(CONFIG['test_path'])

train_df['cleaned_text'] = train_df['text'].apply(clean_text)
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

le = LabelEncoder()
le.fit(CLASS_LABELS)
train_df['label_idx'] = le.transform(train_df['review'])

# Load augmented data if exists
augmented_path = os.path.join(CONFIG['save_dir'], 'augmented_train.csv')
if os.path.exists(augmented_path):
    train_df = pd.read_csv(augmented_path)
    train_df['label_idx'] = le.transform(train_df['review'])

X_train, X_val, y_train, y_val = train_test_split(
    train_df['cleaned_text'].values,
    train_df['label_idx'].values,
    test_size=0.15,
    stratify=train_df['label_idx'].values,
    random_state=CONFIG['seed']
)

print("\nCreating text vectorization...")
vectorize_layer = layers.TextVectorization(
    max_tokens=CONFIG['tf_max_features'],
    output_mode='int',
    output_sequence_length=CONFIG['tf_max_len'],
)
vectorize_layer.adapt(np.concatenate([X_train, X_val]))
vocab_size = len(vectorize_layer.get_vocabulary())
print(f"Vocabulary size: {vocab_size}")

X_train_vec = vectorize_layer(X_train).numpy()
X_val_vec = vectorize_layer(X_val).numpy()
X_test_vec = vectorize_layer(test_df['cleaned_text'].values).numpy()

config_data = {
    'vocab': vectorize_layer.get_vocabulary(),
    'max_len': CONFIG['tf_max_len'],
    'idx_to_label': IDX_TO_LABEL,
    'label_to_idx': LABEL_TO_IDX
}
with open(os.path.join(CONFIG['save_dir'], 'tf_config.pkl'), 'wb') as f:
    pickle.dump(config_data, f)

tf_class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
tf_class_weights_dict = {i: w for i, w in enumerate(tf_class_weights)}

train_ds = tf.data.Dataset.from_tensor_slices((X_train_vec, y_train))
train_ds = train_ds.shuffle(len(X_train)).batch(CONFIG['tf_batch_size']).prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((X_val_vec, y_val))
val_ds = val_ds.batch(CONFIG['tf_batch_size']).prefetch(tf.data.AUTOTUNE)

tf_models = [
    ('Transformer', create_transformer_model),
    ('BiLSTM', create_bilstm_model),
    ('CNN_BiLSTM', create_cnn_bilstm_model),
    ('GRU_Attention', create_gru_attention_model),
]

model_scores = {}
all_test_probs = []

# Load DeBERTa results
deberta_probs = np.load(os.path.join(CONFIG['save_dir'], 'deberta_probs.npy'))
label_classes = np.load(os.path.join(CONFIG['save_dir'], 'label_classes.npy'), allow_pickle=True)

model_scores['DeBERTa'] = 0.844  # Update this with actual value
all_test_probs.append(('DeBERTa', deberta_probs))

for name, builder in tf_models:
    print(f"\n--- Training {name} ---")
    
    # Clear session before each model
    keras.backend.clear_session()
    
    model = builder(vocab_size)
    
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=CONFIG['tf_learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(CONFIG['save_dir'], f'{name}_best.weights.h5'),
            monitor='val_loss',
            save_best_only=True,
            save_weights_only=True,
            verbose=1
        ),
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=2,
            min_lr=1e-7,
            verbose=1
        ),
        BalancedAccuracyCallback(val_ds)
    ]
    
    history = model.fit(
        train_ds,
        epochs=CONFIG['tf_epochs'],
        validation_data=val_ds,
        class_weight=tf_class_weights_dict,
        callbacks=callbacks,
        verbose=1
    )
    
    y_pred = np.argmax(model.predict(X_val_vec), axis=1)
    ba = balanced_accuracy_score(y_val, y_pred)
    model_scores[name] = ba
    print(f"\n✓ {name} Val BA: {ba:.4f}")
    
    test_probs = model.predict(X_test_vec)
    all_test_probs.append((name, test_probs))
    
    test_pred_labels = [IDX_TO_LABEL[i] for i in np.argmax(test_probs, axis=1)]
    sub_df = pd.DataFrame({'id': test_df['id'], 'review': test_pred_labels})
    sub_df.to_csv(f'submission_{name}.csv', index=False)
    
    # Clear memory
    del model
    keras.backend.clear_session()
    import gc
    gc.collect()

print("\n" + "=" * 60)
print("CREATING WEIGHTED ENSEMBLE")
print("=" * 60)

total_weight = sum(model_scores.values())
ensemble_probs = np.zeros((len(test_df), NUM_CLASSES))

print("\nModel weights:")
for name, probs in all_test_probs:
    weight = model_scores[name] / total_weight
    ensemble_probs += probs * weight
    print(f"  {name}: {weight:.3f} (BA: {model_scores[name]:.4f})")

ensemble_preds = np.argmax(ensemble_probs, axis=1)
ensemble_labels = le.inverse_transform(ensemble_preds)

submission = pd.DataFrame({
    'id': test_df['id'],
    'review': ensemble_labels
})
submission.to_csv('submission_ensemble.csv', index=False)

deberta_preds = np.argmax(deberta_probs, axis=1)
deberta_labels = le.inverse_transform(deberta_preds)
sub_deberta = pd.DataFrame({'id': test_df['id'], 'review': deberta_labels})
sub_deberta.to_csv('submission_deberta.csv', index=False)

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)

print("\nModel Balanced Accuracy Scores:")
for name, score in sorted(model_scores.items(), key=lambda x: -x[1]):
    print(f"  {name}: {score:.4f}")

print("\nSubmission files created:")
print("  ⭐ submission_ensemble.csv (BEST)")
print("  ⭐ submission_deberta.csv (DeBERTa only)")
print("  - submission_Transformer.csv")
print("  - submission_BiLSTM.csv")
print("  - submission_CNN_BiLSTM.csv")
print("  - submission_GRU_Attention.csv")


TensorFlow version: 2.18.0
TRAINING TensorFlow MODELS

Creating text vectorization...
Vocabulary size: 29565

--- Training Transformer ---
Epoch 1/30
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1976 - loss: 2.3914
Epoch 1: val_loss improved from inf to 3.75069, saving model to /kaggle/working/Transformer_best.weights.h5
 - val_balanced_accuracy: 0.2000 (best)
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 47ms/step - accuracy: 0.1976 - loss: 2.3910 - val_accuracy: 0.0752 - val_loss: 3.7507 - learning_rate: 0.0010 - val_balanced_accuracy: 0.2000
Epoch 2/30
[1m371/372[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.2046 - loss: 1.8409
Epoch 2: val_loss improved from 3.75069 to 3.22452, saving model to /kaggle/working/Transformer_best.weights.h5
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.2046 - loss: 1.8408 - val_accuracy: 0.3333 - val_loss: 3.224

In [11]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedTokenizerFast, DebertaV2Config, DebertaV2Model
from safetensors.torch import load_file

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# ============================================================================
# CONFIGURATION
# ============================================================================
CONFIG = {
    'models_path': '/kaggle/input/final-data',
    'test_data_path': '/kaggle/input/nn-26-review-sentiment-classification/test.csv',  # Change this to your test data path
    'output_dir': '/kaggle/working/',
    'pt_max_len': 384,
    'pt_batch_size': 8,
    'tf_max_len': 200,
    'tf_batch_size': 16,
    'tf_max_features': 30000,
    'tf_embedding_dim': 200,
    'tf_lstm_units': 96,
    'tf_gru_units': 96,
    'tf_cnn_filters': 96,
    'tf_dropout_rate': 0.4,
    'transformer_heads': 8,
    'transformer_ff_dim': 256,
    'transformer_blocks': 2,
}

CLASS_LABELS = ['Very bad', 'Bad', 'Good', 'Very good', 'Excellent']
NUM_CLASSES = len(CLASS_LABELS)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {DEVICE}")
print(f"Models path: {CONFIG['models_path']}")
print(f"Test data path: {CONFIG['test_data_path']}")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<.*?>', '', text)
    contractions = {
        "won't": "will not", "can't": "cannot", "n't": " not",
        "'re": " are", "'s": " is", "'d": " would",
        "'ll": " will", "'ve": " have", "'m": " am"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    text = re.sub(r'[^a-zA-Z0-9\s!?.,;:\'-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# ============================================================================
# PYTORCH MODEL DEFINITIONS
# ============================================================================
class DeBERTaClassifier(nn.Module):
    def __init__(self, num_classes, dropout=0.3):
        super().__init__()
        
        self.config = DebertaV2Config(
            architectures=["DebertaV2Model"],
            attention_probs_dropout_prob=0.1,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            hidden_size=1024,
            initializer_range=0.02,
            intermediate_size=4096,
            layer_norm_eps=1e-7,
            max_position_embeddings=512,
            max_relative_positions=-1,
            model_type="deberta-v2",
            norm_rel_ebd="layer_norm",
            num_attention_heads=16,
            num_hidden_layers=24,
            pad_token_id=0,
            pooler_dropout=0,
            pooler_hidden_act="gelu",
            pooler_hidden_size=1024,
            pos_att_type=["p2c", "c2p"],
            position_biased_input=False,
            position_buckets=256,
            relative_attention=True,
            share_att_key=True,
            type_vocab_size=0,
            vocab_size=128100,
            torch_dtype="float32"
        )
        
        self.transformer = DebertaV2Model(self.config)
        hidden_size = self.config.hidden_size
        self.dropouts = nn.ModuleList([nn.Dropout(dropout) for _ in range(5)])
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )
    
    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
            pooled = outputs.pooler_output
        else:
            last_hidden = outputs.last_hidden_state
            attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
            sum_hidden = torch.sum(last_hidden * attention_mask_expanded, 1)
            sum_mask = attention_mask_expanded.sum(1).clamp(min=1e-9)
            pooled = sum_hidden / sum_mask
        
        pooled = self.layer_norm(pooled)
        
        logits = torch.zeros(pooled.size(0), self.classifier[-1].out_features).to(pooled.device)
        for dropout in self.dropouts:
            logits += self.classifier(dropout(pooled))
        logits /= len(self.dropouts)
        
        return logits

# ============================================================================
# TENSORFLOW CUSTOM LAYERS
# ============================================================================
class BahdanauAttention(layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.V = layers.Dense(1)
    
    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
    
    def get_config(self):
        config = super().get_config()
        config.update({"units": self.units})
        return config

class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.d_model = d_model
        
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        
        pos_encoding = np.zeros((max_len, d_model))
        pos_encoding[:, 0::2] = np.sin(position * div_term)
        pos_encoding[:, 1::2] = np.cos(position * div_term)
        
        self.pos_encoding = tf.constant(pos_encoding, dtype=tf.float32)
    
    def call(self, x):
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:seq_len, :]
    
    def get_config(self):
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads, 
            key_dim=embed_dim // num_heads,
            dropout=dropout_rate
        )
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation='gelu'),
            layers.Dropout(dropout_rate),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
    
    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        
        return out2
    
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout_rate": self.dropout_rate
        })
        return config

# ============================================================================
# TENSORFLOW MODEL BUILDERS
# ============================================================================
def create_transformer_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'])(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    x = PositionalEncoding(CONFIG['tf_max_len'], CONFIG['tf_embedding_dim'])(x)
    
    for _ in range(CONFIG['transformer_blocks']):
        x = TransformerBlock(
            embed_dim=CONFIG['tf_embedding_dim'],
            num_heads=CONFIG['transformer_heads'],
            ff_dim=CONFIG['transformer_ff_dim'],
            dropout_rate=CONFIG['tf_dropout_rate'] * 0.5
        )(x)
    
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)
    x = layers.concatenate([avg_pool, max_pool])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    return keras.Model(inputs=inputs, outputs=outputs, name='Transformer')

def create_bilstm_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'], mask_zero=True)(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    x = layers.Bidirectional(
        layers.LSTM(CONFIG['tf_lstm_units'], return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    x = layers.Bidirectional(
        layers.LSTM(CONFIG['tf_lstm_units'] // 2, return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)
    x = layers.concatenate([avg_pool, max_pool])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    return keras.Model(inputs=inputs, outputs=outputs, name='BiLSTM')

def create_cnn_bilstm_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'])(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    conv_blocks = []
    for kernel_size in [2, 3, 4]:
        conv = layers.Conv1D(CONFIG['tf_cnn_filters'], kernel_size, activation='relu', padding='same')(x)
        conv = layers.BatchNormalization()(conv)
        conv = layers.GlobalMaxPooling1D()(conv)
        conv_blocks.append(conv)
    
    cnn_features = layers.concatenate(conv_blocks)
    
    lstm_out = layers.Bidirectional(
        layers.LSTM(CONFIG['tf_lstm_units'], return_sequences=True, recurrent_dropout=0)
    )(x)
    lstm_out = layers.GlobalAveragePooling1D()(lstm_out)
    
    x = layers.concatenate([cnn_features, lstm_out])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    return keras.Model(inputs=inputs, outputs=outputs, name='CNN_BiLSTM')

def create_gru_attention_model(vocab_size, num_classes=5):
    inputs = layers.Input(shape=(CONFIG['tf_max_len'],))
    x = layers.Embedding(vocab_size, CONFIG['tf_embedding_dim'], mask_zero=True)(inputs)
    x = layers.SpatialDropout1D(0.2)(x)
    
    x = layers.Bidirectional(
        layers.GRU(CONFIG['tf_gru_units'], return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    gru_out = layers.Bidirectional(
        layers.GRU(CONFIG['tf_gru_units'] // 2, return_sequences=True, dropout=0.2, recurrent_dropout=0)
    )(x)
    
    query = layers.Lambda(lambda x: x[:, -1, :])(gru_out)
    context_vector, _ = BahdanauAttention(CONFIG['tf_gru_units'])(query, gru_out)
    
    avg_pool = layers.GlobalAveragePooling1D()(gru_out)
    max_pool = layers.GlobalMaxPooling1D()(gru_out)
    
    x = layers.concatenate([context_vector, avg_pool, max_pool])
    
    x = layers.Dense(256, activation='gelu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(CONFIG['tf_dropout_rate'])(x)
    x = layers.Dense(128, activation='gelu')(x)
    
    outputs = layers.Dense(num_classes, activation='softmax', dtype='float32')(x)
    return keras.Model(inputs=inputs, outputs=outputs, name='GRU_Attention')

# ============================================================================
# LOAD DATA
# ============================================================================
print("\n" + "="*80)
print("LOADING TEST DATA")
print("="*80)

test_df = pd.read_csv(CONFIG['test_data_path'])
print(f"Test samples: {len(test_df)}")

test_df['cleaned_text'] = test_df['text'].apply(clean_text)

# ============================================================================
# LOAD DEBERTA MODEL AND PREDICT
# ============================================================================
print("\n" + "="*80)
print("LOADING DeBERTa MODEL")
print("="*80)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=os.path.join(CONFIG['models_path'], 'tokenizer.json')
)

# Set pad token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Pad token: {tokenizer.pad_token}")

model_deberta = DeBERTaClassifier(NUM_CLASSES, dropout=0.3).to(DEVICE)
model_deberta.load_state_dict(
    torch.load(os.path.join(CONFIG['models_path'], 'best_deberta.pt'), 
               map_location=DEVICE)
)
model_deberta.eval()

print("Making DeBERTa predictions...")
deberta_probs = []
with torch.no_grad():
    for i in range(0, len(test_df), CONFIG['pt_batch_size']):
        batch_texts = test_df['cleaned_text'].iloc[i:i+CONFIG['pt_batch_size']].tolist()
        
        encodings = tokenizer.batch_encode_plus(
            batch_texts,
            add_special_tokens=True,
            max_length=CONFIG['pt_max_len'],
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids = encodings['input_ids'].to(DEVICE)
        attention_mask = encodings['attention_mask'].to(DEVICE)
        
        outputs = model_deberta(input_ids, attention_mask)
        probs = F.softmax(outputs, dim=1)
        deberta_probs.extend(probs.cpu().numpy())

deberta_probs = np.array(deberta_probs)
print(f"✓ DeBERTa predictions shape: {deberta_probs.shape}")

# Save DeBERTa predictions
deberta_preds = np.argmax(deberta_probs, axis=1)
deberta_labels = [CLASS_LABELS[i] for i in deberta_preds]
submission_deberta = pd.DataFrame({'id': test_df['id'], 'review': deberta_labels})
submission_deberta.to_csv(os.path.join(CONFIG['output_dir'], 'submission_deberta.csv'), index=False)
print("✓ Saved: submission_deberta.csv")

# Clear GPU memory
del model_deberta
torch.cuda.empty_cache()

# ============================================================================
# LOAD TENSORFLOW CONFIG AND VECTORIZER
# ============================================================================
print("\n" + "="*80)
print("LOADING TensorFlow CONFIGURATION")
print("="*80)

with open(os.path.join(CONFIG['models_path'], 'tf_config.pkl'), 'rb') as f:
    tf_config = pickle.load(f)

vocab = tf_config['vocab']
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Create and adapt vectorizer
vectorize_layer = layers.TextVectorization(
    max_tokens=CONFIG['tf_max_features'],
    output_mode='int',
    output_sequence_length=CONFIG['tf_max_len'],
    vocabulary=vocab
)

X_test_vec = vectorize_layer(test_df['cleaned_text'].values).numpy()
print(f"✓ Vectorized test data shape: {X_test_vec.shape}")

# ============================================================================
# LOAD AND PREDICT WITH TENSORFLOW MODELS
# ============================================================================
tf_models_info = [
    ('Transformer', create_transformer_model),
    ('BiLSTM', create_bilstm_model),
    ('CNN_BiLSTM', create_cnn_bilstm_model),
    ('GRU_Attention', create_gru_attention_model),
]

all_tf_probs = {}

for name, builder in tf_models_info:
    print(f"\n--- Loading {name} ---")
    keras.backend.clear_session()
    
    model = builder(vocab_size)
    model.load_weights(os.path.join(CONFIG['models_path'], f'{name}_best.weights.h5'))
    
    print(f"Making {name} predictions...")
    probs = model.predict(X_test_vec, batch_size=CONFIG['tf_batch_size'], verbose=0)
    all_tf_probs[name] = probs
    
    # Save individual predictions
    preds = np.argmax(probs, axis=1)
    pred_labels = [CLASS_LABELS[i] for i in preds]
    submission = pd.DataFrame({'id': test_df['id'], 'review': pred_labels})
    submission.to_csv(os.path.join(CONFIG['output_dir'], f'submission_{name}.csv'), index=False)
    print(f"✓ Saved: submission_{name}.csv")
    
    del model
    keras.backend.clear_session()

# ============================================================================
# CREATE ENSEMBLE PREDICTIONS
# ============================================================================
print("\n" + "="*80)
print("CREATING ENSEMBLE PREDICTIONS")
print("="*80)

# Ensemble 1: TensorFlow models only
print("\n1. TensorFlow Models Ensemble")
tf_ensemble_probs = np.mean([all_tf_probs[name] for name in all_tf_probs.keys()], axis=0)
tf_preds = np.argmax(tf_ensemble_probs, axis=1)
tf_labels = [CLASS_LABELS[i] for i in tf_preds]
submission = pd.DataFrame({'id': test_df['id'], 'review': tf_labels})
submission.to_csv(os.path.join(CONFIG['output_dir'], 'submission_tf_ensemble.csv'), index=False)
print("✓ Saved: submission_tf_ensemble.csv")

# Ensemble 2: TensorFlow + DeBERTa (weighted by validation performance)
print("\n2. TensorFlow + DeBERTa Ensemble")
# Approximate weights based on validation scores
weights = {
    'DeBERTa': 0.844,
    'Transformer': 0.200,
    'BiLSTM': 0.391,
    'CNN_BiLSTM': 0.396,
    'GRU_Attention': 0.394
}
total_weight = sum(weights.values())
normalized_weights = {k: v/total_weight for k, v in weights.items()}

tf_deberta_ensemble = normalized_weights['DeBERTa'] * deberta_probs
for name in all_tf_probs.keys():
    tf_deberta_ensemble += normalized_weights[name] * all_tf_probs[name]

tf_deberta_preds = np.argmax(tf_deberta_ensemble, axis=1)
tf_deberta_labels = [CLASS_LABELS[i] for i in tf_deberta_preds]
submission = pd.DataFrame({'id': test_df['id'], 'review': tf_deberta_labels})
submission.to_csv(os.path.join(CONFIG['output_dir'], 'submission_tf_deberta_ensemble.csv'), index=False)
print("✓ Saved: submission_tf_deberta_ensemble.csv")

print("\nWeights used:")
for name, weight in normalized_weights.items():
    print(f"  {name}: {weight:.3f}")

# Ensemble 3: TensorFlow + Custom Transformer (weighted)
print("\n3. TensorFlow + Custom Transformer Ensemble")
tf_custom_transformer_weights = {
    'Transformer': 0.200,
    'BiLSTM': 0.391,
    'CNN_BiLSTM': 0.396,
    'GRU_Attention': 0.394
}
total_weight = sum(tf_custom_transformer_weights.values())
normalized_weights = {k: v/total_weight for k, v in tf_custom_transformer_weights.items()}

tf_custom_ensemble = np.zeros((len(test_df), NUM_CLASSES))
for name in tf_custom_transformer_weights.keys():
    tf_custom_ensemble += normalized_weights[name] * all_tf_probs[name]

tf_custom_preds = np.argmax(tf_custom_ensemble, axis=1)
tf_custom_labels = [CLASS_LABELS[i] for i in tf_custom_preds]
submission = pd.DataFrame({'id': test_df['id'], 'review': tf_custom_labels})
submission.to_csv(os.path.join(CONFIG['output_dir'], 'submission_tf_custom_transformer.csv'), index=False)
print("✓ Saved: submission_tf_custom_transformer.csv")

# Ensemble 4: All models (equal weight)
print("\n4. All Models Equal Weight Ensemble")
all_models_equal = deberta_probs.copy()
for name in all_tf_probs.keys():
    all_models_equal += all_tf_probs[name]
all_models_equal /= (len(all_tf_probs) + 1)

all_equal_preds = np.argmax(all_models_equal, axis=1)
all_equal_labels = [CLASS_LABELS[i] for i in all_equal_preds]
submission = pd.DataFrame({'id': test_df['id'], 'review': all_equal_labels})
submission.to_csv(os.path.join(CONFIG['output_dir'], 'submission_all_models_equal.csv'), index=False)
print("✓ Saved: submission_all_models_equal.csv")

# Ensemble 5: All Transformers (DeBERTa + Custom Transformer)
print("\n5. All Transformers Ensemble")
transformer_weights = {
    'DeBERTa': 0.844,
    'Transformer': 0.200
}
total_weight = sum(transformer_weights.values())
normalized_weights = {k: v/total_weight for k, v in transformer_weights.items()}

all_transformers_ensemble = (normalized_weights['DeBERTa'] * deberta_probs + 
                              normalized_weights['Transformer'] * all_tf_probs['Transformer'])

all_transformers_preds = np.argmax(all_transformers_ensemble, axis=1)
all_transformers_labels = [CLASS_LABELS[i] for i in all_transformers_preds]
submission = pd.DataFrame({'id': test_df['id'], 'review': all_transformers_labels})
submission.to_csv(os.path.join(CONFIG['output_dir'], 'submission_all_transformers.csv'), index=False)
print("✓ Saved: submission_all_transformers.csv")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "="*80)
print("INFERENCE COMPLETE!")
print("="*80)

print("\nGenerated submission files:")
print("\nIndividual Models:")
print("  - submission_deberta.csv (DeBERTa)")
print("  - submission_Transformer.csv (Custom Transformer)")
print("  - submission_BiLSTM.csv")
print("  - submission_CNN_BiLSTM.csv")
print("  - submission_GRU_Attention.csv")

print("\nEnsemble Combinations:")
print("  ⭐ submission_tf_deberta_ensemble.csv (TensorFlow + DeBERTa, weighted - RECOMMENDED)")
print("  - submission_tf_ensemble.csv (TensorFlow models only)")
print("  - submission_tf_custom_transformer.csv (TensorFlow + Custom Transformer)")
print("  - submission_all_models_equal.csv (All models, equal weight)")
print("  - submission_all_transformers.csv (DeBERTa + Custom Transformer)")

print("\n" + "="*80)

Using device: cuda
Models path: /kaggle/input/final-data
Test data path: /kaggle/input/nn-26-review-sentiment-classification/test.csv

LOADING TEST DATA
Test samples: 3000

LOADING DeBERTa MODEL
Tokenizer vocab size: 128001
Pad token: [PAD]
Making DeBERTa predictions...
✓ DeBERTa predictions shape: (3000, 5)
✓ Saved: submission_deberta.csv

LOADING TensorFlow CONFIGURATION
Vocabulary size: 29565
✓ Vectorized test data shape: (3000, 200)

--- Loading Transformer ---
Making Transformer predictions...
✓ Saved: submission_Transformer.csv

--- Loading BiLSTM ---
Making BiLSTM predictions...
✓ Saved: submission_BiLSTM.csv

--- Loading CNN_BiLSTM ---
Making CNN_BiLSTM predictions...
✓ Saved: submission_CNN_BiLSTM.csv

--- Loading GRU_Attention ---
Making GRU_Attention predictions...
✓ Saved: submission_GRU_Attention.csv

CREATING ENSEMBLE PREDICTIONS

1. TensorFlow Models Ensemble
✓ Saved: submission_tf_ensemble.csv

2. TensorFlow + DeBERTa Ensemble
✓ Saved: submission_tf_deberta_ensemble.csv