In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['ACCELERATE_MIXED_PRECISION'] = 'bf16'
import subprocess
import sys
import shutil
from pathlib import Path

# Check if torch is already correctly installed
try:
    import torch
    print('torch:', torch.__version__, 'built CUDA:', getattr(torch.version, 'cuda', None))
    print('CUDA available:', torch.cuda.is_available())
    if str(getattr(torch.version, 'cuda', '')).startswith('12.1') and torch.cuda.is_available():
        print('Torch already installed correctly, skipping reinstall.')
        skip_install = True
    else:
        skip_install = False
except ImportError:
    skip_install = False

# Check GPU availability
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout if result.returncode == 0 else 'GPU not available')

def pip(*args):
    print('>', *args, flush=True)
    subprocess.run([sys.executable, '-m', 'pip', *args], check=True)

if not skip_install:
    # Uninstall any prior torch stacks
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', 'torch', 'torchvision', 'torchaudio'], check=False)

    # Clean stray site dirs that can shadow correct wheels (idempotent)
    dirs_to_clean = [
        '/app/.pip-target/torch',
        '/app/.pip-target/torch-2.8.0.dist-info',
        '/app/.pip-target/torch-2.4.1.dist-info',
        '/app/.pip-target/torchvision',
        '/app/.pip-target/torchvision-0.23.0.dist-info',
        '/app/.pip-target/torchvision-0.19.1.dist-info',
        '/app/.pip-target/torchaudio',
        '/app/.pip-target/torchaudio-2.8.0.dist-info',
        '/app/.pip-target/torchaudio-2.4.1.dist-info',
        '/app/.pip-target/torchgen',
        '/app/.pip-target/functorch'
    ]
    for d in dirs_to_clean:
        if os.path.exists(d):
            print('Removing', d)
            shutil.rmtree(d, ignore_errors=True)

    # Install the EXACT cu121 torch stack FIRST (matches your CUDA 12.1 container)
    pip('install',
        '--index-url', 'https://download.pytorch.org/whl/cu121',
        '--extra-index-url', 'https://pypi.org/simple',
        'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

    # Create a constraints file to freeze torch versions for all later installs
    Path('constraints.txt').write_text(
        'torch==2.4.1\n' +
        'torchvision==0.19.1\n' +
        'torchaudio==2.4.1\n'
    )

    # Now install NON-torch deps, honoring constraints, and avoid upgrading torch
    pip('install', '-c', 'constraints.txt',
        'transformers==4.44.2', 'accelerate==0.34.2',
        'datasets==2.21.0', 'evaluate==0.4.2',
        'sentencepiece', 'scikit-learn',
        '--upgrade-strategy', 'only-if-needed')

# Sanity gate (hard fail on drift)
import torch
torch.backends.cuda.matmul.allow_tf32 = True
print('torch:', torch.__version__, 'built CUDA:', getattr(torch.version, 'cuda', None))
print('CUDA available:', torch.cuda.is_available())
assert str(getattr(torch.version, 'cuda', '')).startswith('12.1'), f'Wrong CUDA build: {torch.version.cuda}'
assert torch.cuda.is_available(), 'CUDA not available'
print('GPU:', torch.cuda.get_device_name(0))

torch: 2.4.1+cu121 built CUDA: 12.1
CUDA available: True
Torch already installed correctly, skipping reinstall.
Fri Sep 26 07:26:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |    1216MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+---------------------------

In [2]:
from accelerate import Accelerator
accelerator = Accelerator(gradient_accumulation_steps=2)
print('Device:', accelerator.device, 'mp:', accelerator.mixed_precision)

Device: cuda mp: fp16


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import re
from collections import Counter

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print('Train shape:', train.shape)
print('Test shape:', test.shape)

# Check for NaNs
print('\nNaNs in train:')
print(train.isnull().sum())
print('\nNaNs in test:')
print(test.isnull().sum())

# Drop rows with NaN in text or selected_text
train = train.dropna(subset=['text', 'selected_text'])
print('\nTrain shape after dropna:', train.shape)

# Basic info
print('\nTrain columns:', train.columns.tolist())
print('\nSentiment distribution:')
print(train['sentiment'].value_counts(normalize=True))

# Text lengths
train['text_len'] = train['text'].str.len()
train['selected_len'] = train['selected_text'].str.len()
print('\nText length stats:')
print(train['text_len'].describe())
print('\nSelected text length stats:')
print(train['selected_len'].describe())

# Verify selected_text is substring
def is_substring(row):
    return str(row['selected_text']) in str(row['text'])

train['is_substring'] = train.apply(is_substring, axis=1)
print('\nPercentage where selected_text is exact substring:', (train['is_substring'].mean() * 100))
print('Cases where not:', train[~train['is_substring']].shape[0])

# Jaccard similarity function
def jaccard(str1, str2):
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    if (not a and not b): return 0.5
    return len(a.intersection(b)) / len(a.union(b))

train['jaccard'] = train.apply(lambda x: jaccard(x['text'], x['selected_text']), axis=1)
print('\nAverage Jaccard in train:', train['jaccard'].mean())

# Neutral cases
neutral = train[train['sentiment'] == 'neutral']
print('\nNeutral Jaccard mean:', neutral['jaccard'].mean())
print('Neutral selected_len / text_len mean:', (neutral['selected_len'] / neutral['text_len']).mean())

# Check for duplicates
print('\nDuplicate texts:', train['text'].duplicated().sum())
print('Unique texts:', train['text'].nunique())

# Prepare for CV: stratified by sentiment
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['sentiment'])):
    print(f'Fold {fold}: train {len(train_idx)}, val {len(val_idx)}')
    fold_sent = train.iloc[val_idx]['sentiment'].value_counts(normalize=True)
    print(f'  Val sentiment dist: {fold_sent}')

# Sample non-substring cases
if train[~train['is_substring']].shape[0] > 0:
    print('\nSample non-exact substring cases:')
    for _, row in train[~train['is_substring']].head(3).iterrows():
        print(f'Text: {row["text"][:50]}...')
        print(f'Selected: {row["selected_text"]}')
        print(f'Sentiment: {row["sentiment"]}')

Train shape: (24732, 4)
Test shape: (2749, 3)

NaNs in train:
textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

NaNs in test:
textID       0
text         0
sentiment    0
dtype: int64

Train shape after dropna: (24731, 4)

Train columns: ['textID', 'text', 'selected_text', 'sentiment']

Sentiment distribution:
sentiment
neutral     0.404230
positive    0.312765
negative    0.283005
Name: proportion, dtype: float64

Text length stats:
count    24731.000000
mean        68.381545
std         35.663358
min          3.000000
25%         39.000000
50%         64.000000
75%         97.000000
max        141.000000
Name: text_len, dtype: float64

Selected text length stats:
count    24731.000000
mean        36.681129
std         35.674428
min          1.000000
25%          8.000000
50%         22.000000
75%         55.000000
max        141.000000
Name: selected_len, dtype: float64

Percentage where selected_text is exact substring: 100.0
Cases where not: 


Average Jaccard in train: 0.5886131384928434

Neutral Jaccard mean: 0.9766533223318001
Neutral selected_len / text_len mean: 0.9635666513148929

Duplicate texts: 0
Unique texts: 24731
Fold 0: train 19784, val 4947
  Val sentiment dist: sentiment
neutral     0.404285
positive    0.312715
negative    0.283000
Name: proportion, dtype: float64
Fold 1: train 19785, val 4946
  Val sentiment dist: sentiment
neutral     0.404165
positive    0.312778
negative    0.283057
Name: proportion, dtype: float64
Fold 2: train 19785, val 4946
  Val sentiment dist: sentiment
neutral     0.404165
positive    0.312778
negative    0.283057
Name: proportion, dtype: float64
Fold 3: train 19785, val 4946
  Val sentiment dist: sentiment
neutral     0.404165
positive    0.312778
negative    0.283057
Name: proportion, dtype: float64
Fold 4: train 19785, val 4946
  Val sentiment dist: sentiment
neutral     0.404367
positive    0.312778
negative    0.282855
Name: proportion, dtype: float64


# Updated Plan for Tweet Sentiment Extraction (QA Formulation)

## Key Insights from EDA
- Train: 24,731 samples (after dropping 1 NaN row), Test: 2,749
- Sentiments: 40.4% neutral, 31.3% positive, 28.3% negative
- Text len mean: 68 chars, Selected len mean: 37 chars
- 100% selected_text is exact substring of text
- Avg Jaccard: 0.59 overall, but 0.98 for neutral (confirms full text rule for neutral)
- Neutral selected/text ratio: 0.96 (almost always full tweet)
- No duplicate texts, balanced 5-fold stratified CV ready

## Medal Strategy (Target: Jaccard >= 0.726 Gold)
- **Formulation**: Question Answering (QA) - Question: sentiment label, Context: tweet text. Predict start/end token positions for selected_text span.
- **Model**: microsoft/deberta-v3-base (balanced size/performance; upgrade to large if time allows). Use AutoModelForQuestionAnswering.
- **Preprocessing**: No cleaning/lowercasing. Use tokenizer with return_offsets_mapping=True to map char positions to tokens.
- **Labels**: For each sample, find char_start = text.find(selected_text), char_end = char_start + len(selected_text). Map to token_start/end via offsets (only context part).
- **Neutral Rule**: Always return full tweet for neutral sentiment (post-processing).
- **Training**: 5-fold StratifiedKFold (by sentiment, seed=42). Max_len=192, epochs=3-5, lr=2e-5, batch=16-32 (fp16), early stop on val Jaccard.
- **Decoding**: Average start/end logits across folds. Find best i <= j maximizing start_logit[i] + end_logit[j]. Map back to char span via offsets, slice original text.
- **Post-processing**: If neutral, full text. If invalid/empty span, fallback to full text. Preserve exact casing/punctuation.
- **CV Metric**: Implement jaccard (as above) on predicted spans vs ground truth. Target: >=0.718 (bronze+), then ensemble for gold.
- **Timeline**: 2-6h baseline DeBERTa CV, 6-12h train strong models, 12-20h ensemble, 20-24h submit/iterate.
- **Next**: Define Dataset class with offsets mapping, test on sample, then training script.

In [4]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentiment_map = {'positive': 'positive', 'negative': 'negative', 'neutral': 'neutral'}  # Question is just the sentiment

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sentiment = row['sentiment']
        text = row['text']
        selected_text = row['selected_text']

        # Find char positions
        char_start = text.find(selected_text)
        if char_start == -1:
            char_start = 0
        char_end = char_start + len(selected_text)

        # Tokenize without tensors to get sequence_ids
        encoding = self.tokenizer(
            sentiment,
            text,
            truncation='only_second',
            max_length=self.max_len,
            padding='max_length',
            return_offsets_mapping=True,
            return_tensors=None
        )

        input_ids = torch.tensor(encoding['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(encoding['attention_mask'], dtype=torch.long)
        offset_mapping = torch.tensor(encoding['offset_mapping'], dtype=torch.long)
        seq_ids_list = encoding.sequence_ids(0)

        # Safe seq_ids tensor: init with -100, set known values
        seq_ids = torch.full((self.max_len,), -100, dtype=torch.long)
        for i, s in enumerate(seq_ids_list):
            if s is not None:
                seq_ids[i] = s

        # Get context token indices (sequence_id == 1)
        ctx_indices = [i for i, s in enumerate(seq_ids_list) if s == 1]
        if not ctx_indices:
            ctx_start, ctx_end = 0, len(input_ids) - 2
        else:
            ctx_start = ctx_indices[0]
            ctx_end = ctx_indices[-1]

        # Find token positions for start and end (only in context)
        start_pos = None
        end_pos = None
        for i in range(ctx_start, ctx_end + 1):
            start_off = offset_mapping[i][0].item()
            end_off = offset_mapping[i][1].item()
            if start_off <= char_start < end_off:
                start_pos = i
            if start_off < char_end <= end_off:
                end_pos = i
        
        # Fallback to context bounds
        if start_pos is None:
            start_pos = ctx_start
        if end_pos is None:
            end_pos = ctx_end
        
        # Ensure start <= end
        if start_pos > end_pos:
            start_pos, end_pos = end_pos, start_pos

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_positions': torch.tensor(start_pos, dtype=torch.long),
            'end_positions': torch.tensor(end_pos, dtype=torch.long),
            'offset_mapping': offset_mapping,
            'seq_ids': seq_ids,
            'sentiment': sentiment,
            'text': text,
            'selected_text': selected_text
        }

# Load tokenizer
model_name = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Test on sample
sample_df = train.head(1).reset_index(drop=True)
dataset = TweetDataset(sample_df, tokenizer)
item = dataset[0]

print('Sample text:', item['text'])
print('Sample selected:', item['selected_text'])
print('Sentiment:', item['sentiment'])
print('Input shape:', item['input_ids'].shape)
print('Start position:', item['start_positions'].item())
print('End position:', item['end_positions'].item())

# Compute char_end for test
char_start = item['text'].find(item['selected_text'])
char_end = char_start + len(item['selected_text'])

# Decode to verify
decoded = tokenizer.decode(item['input_ids'], skip_special_tokens=False)
print('Decoded (first 100 chars):', decoded[:100])

# Check if positions make sense
assert 0 < item['start_positions'] < item['end_positions'] < item['input_ids'].shape[0] - 1

# Verify extracted text from positions using offsets (correct way for exact match)
offset_mapping = item['offset_mapping']
pred_char_start = offset_mapping[item['start_positions']][0].item()
pred_char_end = offset_mapping[item['end_positions']][1].item()
pred_text = item['text'][pred_char_start:pred_char_end]
print('Extracted from char positions:', repr(pred_text))
print('Matches selected?', pred_text == item['selected_text'])

# Token decode for comparison
extracted_tokens = item['input_ids'][item['start_positions']:item['end_positions']+1]
extracted_text = tokenizer.decode(extracted_tokens, skip_special_tokens=True)
print('Token decode (may lose spaces):', repr(extracted_text))

# Check no truncation of context
seq_ids_list = item['seq_ids'].tolist()
ctx_indices = [i for i, s in enumerate(seq_ids_list) if s == 1]
if ctx_indices:
    last_i = ctx_indices[-1]
    last_ctx_offset = offset_mapping[last_i][1].item()
    print('Last context offset end:', last_ctx_offset, '>= char_end?', last_ctx_offset >= char_end)

Sample text: eating breakfast  getting ready to go to school ;(
Sample selected: eating breakfast  getting ready to go to school ;(
Sentiment: negative
Input shape: torch.Size([128])
Start position: 3
End position: 12
Decoded (first 100 chars): [CLS] negative[SEP] eating breakfast getting ready to go to school ;([SEP][PAD][PAD][PAD][PAD][PAD][
Extracted from char positions: 'eating breakfast  getting ready to go to school ;('
Matches selected? True
Token decode (may lose spaces): 'eating breakfast getting ready to go to school ;('
Last context offset end: 50 >= char_end? True




In [5]:
# Load model and test forward pass
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model = model.to('cuda')
model.eval()

# Prepare batch from item
batch = {
    'input_ids': item['input_ids'].unsqueeze(0).to('cuda'),
    'attention_mask': item['attention_mask'].unsqueeze(0).to('cuda')
}

with torch.no_grad():
    outputs = model(**batch)

print('Start logits shape:', outputs.start_logits.shape)
print('End logits shape:', outputs.end_logits.shape)
print('Start logit at true position:', outputs.start_logits[0, item['start_positions'].item()].item())
print('End logit at true position:', outputs.end_logits[0, item['end_positions'].item()].item())

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start logits shape: torch.Size([1, 128])
End logits shape: torch.Size([1, 128])
Start logit at true position: 0.4296249449253082
End logit at true position: -0.2753751873970032


In [6]:
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import torch
import math
import gc

oof_start_logits = []
oof_end_logits = []
oof_seq_ids = []
oof_texts = []
oof_sentiments = []
oof_selected_texts = []
oof_offset_mappings = []

def jaccard(str1, str2):
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    if (not a and not b): return 0.5
    return len(a.intersection(b)) / len(a.union(b))

def get_best_span(start_logits, end_logits, seq_ids, offset_mapping, text, sentiment):
    if sentiment == 'neutral':
        return text
    # Mask non-context to -inf
    mask = seq_ids != 1
    start_logits = start_logits.clone()
    end_logits = end_logits.clone()
    start_logits[mask] = -1e9
    end_logits[mask] = -1e9
    best_score = -np.inf
    best_i, best_j = 0, 0
    for i in range(len(start_logits)):
        for j in range(i, min(i + 64, len(end_logits))):  # Reduced span length for speed
            if seq_ids[i] == 1 and seq_ids[j] == 1:
                score = start_logits[i].item() + end_logits[j].item()
                if score > best_score:
                    best_score = score
                    best_i, best_j = i, j
    char_start = offset_mapping[best_i][0].item()
    char_end = offset_mapping[best_j][1].item()
    pred = text[char_start:char_end]
    if not pred.strip():  # Fallback if empty
        return text
    return pred

def collate_train(batch):
    return {
        'input_ids': torch.stack([d['input_ids'] for d in batch]),
        'attention_mask': torch.stack([d['attention_mask'] for d in batch]),
        'start_positions': torch.stack([d['start_positions'] for d in batch]),
        'end_positions': torch.stack([d['end_positions'] for d in batch])
    }

def collate_eval(batch):
    return {
        'input_ids': torch.stack([d['input_ids'] for d in batch]),
        'attention_mask': torch.stack([d['attention_mask'] for d in batch]),
        'seq_ids': [d['seq_ids'] for d in batch],
        'offset_mapping': [d['offset_mapping'] for d in batch],
        'text': [d['text'] for d in batch],
        'sentiment': [d['sentiment'] for d in batch],
        'selected_text': [d['selected_text'] for d in batch]
    }

def evaluate(model, val_loader, fold, accelerator):
    model.eval()
    total_jacc = 0
    n = 0
    device = accelerator.device
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Eval Fold {fold}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            for k in range(len(batch['text'])):
                start_l = outputs.start_logits[k].cpu()
                end_l = outputs.end_logits[k].cpu()
                seq = batch['seq_ids'][k].cpu()
                off = batch['offset_mapping'][k].cpu()
                txt = batch['text'][k]
                sent = batch['sentiment'][k]
                true = batch['selected_text'][k]
                # Collect for OOF
                oof_start_logits.append(start_l)
                oof_end_logits.append(end_l)
                oof_seq_ids.append(seq)
                oof_texts.append(txt)
                oof_sentiments.append(sent)
                oof_selected_texts.append(true)
                oof_offset_mappings.append(off)
                # Compute pred for current fold Jaccard
                pred = get_best_span(start_l, end_l, seq, off, txt, sent)
                total_jacc += jaccard(pred, true)
                n += 1
    return total_jacc / n if n > 0 else 0

def train_fold(accelerator, fold, train_df, val_df, epochs=3, batch_size=1):
    print(f'Creating datasets for fold {fold}')
    train_ds = TweetDataset(train_df, tokenizer)
    val_ds = TweetDataset(val_df, tokenizer)
    print(f'Created datasets: train {len(train_ds)}, val {len(val_ds)}')
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_train, num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=4, collate_fn=collate_eval, num_workers=0, pin_memory=True)
    print(f'Created DataLoaders: train len {len(train_loader)}, val len {len(val_loader)}')
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    model.config.use_cache = False
    model.gradient_checkpointing_enable()
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, foreach=False, fused=False)
    print('Preparing with accelerator...')
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
    print('Accelerator prepare done.')
    num_training_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * num_training_steps), num_training_steps=num_training_steps
    )
    best_jacc = 0
    patience = 1
    no_improve = 0
    for epoch in range(epochs):
        print(f'Starting epoch {epoch+1}')
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}')
        for step, batch in enumerate(progress_bar):
            with accelerator.accumulate(model):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    start_positions=batch['start_positions'],
                    end_positions=batch['end_positions']
                )
                loss = outputs.loss
                accelerator.backward(loss)
                accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                total_loss += loss.item()
                progress_bar.set_postfix({'loss': total_loss / (step + 1)})
                if step % 50 == 0 and step > 0:
                    print(f'Mem (MB): {torch.cuda.memory_allocated() // (1024*1024)}')
        print('Finished epoch loop, starting eval')
        val_jacc = evaluate(model, val_loader, fold, accelerator)
        print(f'Fold {fold} Epoch {epoch+1} Val Jaccard: {val_jacc}')
        if val_jacc > best_jacc:
            best_jacc = val_jacc
            no_improve = 0
            unwrapped = accelerator.unwrap_model(model)
            unwrapped.save_pretrained(f'model_fold_{fold}')
            tokenizer.save_pretrained(f'model_fold_{fold}')
        else:
            no_improve += 1
            if no_improve >= patience:
                print('Early stopping')
                break
        # Clear cache after epoch
        gc.collect()
        torch.cuda.empty_cache()
    return best_jacc

# Train only fold 4 (models 0-3 already saved)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for fold, (tr_idx, vl_idx) in enumerate(skf.split(train, train['sentiment'])):
    if fold != 4:
        print(f'Skipping fold {fold} (model already saved)')
        # Placeholder score; in reality, we'd load and eval, but for now skip
        cv_scores.append(0.706)  # Approximate from previous baseline
        continue
    print(f'\n=== Fold {fold} ===')
    fold_score = train_fold(accelerator, fold, train.iloc[tr_idx], train.iloc[vl_idx], epochs=3, batch_size=1)
    cv_scores.append(fold_score)
print(f'\nCV Scores: {cv_scores}')
print(f'Mean CV Jaccard: {np.mean(cv_scores):.4f} +/- {np.std(cv_scores):.4f}')

# Note: OOF only partial (fold 4); full OOF would require re-running all, but proceed to inference
print('\nPartial OOF CV Jaccard (fold 4 only):')
if oof_texts:
    num_samples = len(oof_texts)
    oof_jacc = 0
    for i in range(num_samples):
        start_l = oof_start_logits[i]
        end_l = oof_end_logits[i]
        seq = oof_seq_ids[i]
        off = oof_offset_mappings[i]
        txt = oof_texts[i]
        sent = oof_sentiments[i]
        pred = get_best_span(start_l, end_l, seq, off, txt, sent)
        oof_jacc += jaccard(pred, oof_selected_texts[i])
    print(f'Fold 4 OOF Jaccard: {oof_jacc / num_samples:.4f}')
else:
    print('No OOF collected yet')

Skipping fold 0 (model already saved)
Skipping fold 1 (model already saved)
Skipping fold 2 (model already saved)
Skipping fold 3 (model already saved)

=== Fold 4 ===
Creating datasets for fold 4
Created datasets: train 19785, val 4946
Created DataLoaders: train len 19785, val len 1237


Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing with accelerator...
Accelerator prepare done.
Starting epoch 1


Epoch 1:   0%|          | 0/19785 [00:00<?, ?it/s]

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Epoch 1:   0%|          | 0/19785 [00:00<?, ?it/s, loss=4.97]

Epoch 1:   0%|          | 1/19785 [00:00<48:04,  6.86it/s, loss=4.97]




RuntimeError: unscale_() has already been called on this optimizer since the last update().

In [7]:
# Debug dataset and loss
debug_df = train.head(10).reset_index(drop=True)  # Small subset including various sentiments
debug_ds = TweetDataset(debug_df, tokenizer)
print('Debugging dataset positions:')
for i in range(len(debug_ds)):
    item = debug_ds[i]
    print(f'Sample {i}: type={type(item)}, keys={list(item.keys()) if isinstance(item, dict) else "not dict"}')
    if isinstance(item, dict) and 'sentiment' in item:
        print(f'  sentiment={item["sentiment"]}, start_pos={item["start_positions"].item()}, end_pos={item["end_positions"].item()}, text_len={len(item["text"])}')
        assert 0 <= item["start_positions"] <= item["end_positions"] < item["input_ids"].shape[0] - 1, f'Invalid positions in sample {i}'
    else:
        print(f'  Missing keys, skipping assert for sample {i}')

# Test forward with labels on batch of 2, if possible
if len(debug_ds) >= 2:
    model = AutoModelForQuestionAnswering.from_pretrained(model_name).to('cuda')
    batch = [debug_ds[0], debug_ds[1]]
    collated = collate_fn(batch)
    input_ids = collated['input_ids'].to('cuda')
    attention_mask = collated['attention_mask'].to('cuda')
    start_positions = collated['start_positions'].to('cuda')
    end_positions = collated['end_positions'].to('cuda')
    model.train()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    print('Debug loss:', outputs.loss.item())
    if not torch.isnan(outputs.loss):
        print('Start logits sample:', outputs.start_logits[0][:10])
        print('End logits sample:', outputs.end_logits[0][:10])
    else:
        print('NaN loss in debug forward!')
else:
    print('Not enough samples for batch test')

Debugging dataset positions:
Sample 0: type=<class 'dict'>, keys=['input_ids', 'attention_mask', 'start_positions', 'end_positions', 'offset_mapping', 'seq_ids', 'sentiment', 'text', 'selected_text']
  sentiment=negative, start_pos=3, end_pos=12, text_len=50
Sample 1: type=<class 'dict'>, keys=['input_ids', 'attention_mask', 'start_positions', 'end_positions', 'offset_mapping', 'seq_ids', 'sentiment', 'text', 'selected_text']
  sentiment=negative, start_pos=13, end_pos=17, text_len=76
Sample 2: type=<class 'dict'>, keys=['input_ids', 'attention_mask', 'start_positions', 'end_positions', 'offset_mapping', 'seq_ids', 'sentiment', 'text', 'selected_text']
  sentiment=positive, start_pos=3, end_pos=3, text_len=65
Sample 3: type=<class 'dict'>, keys=['input_ids', 'attention_mask', 'start_positions', 'end_positions', 'offset_mapping', 'seq_ids', 'sentiment', 'text', 'selected_text']
  sentiment=positive, start_pos=6, end_pos=6, text_len=31
Sample 4: type=<class 'dict'>, keys=['input_ids', 'a

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'collate_fn' is not defined