# Data & Tokenizer Checks for dimabsa

This notebook runs a compact set of checks to ensure the dataset and tokenizer are consistent and sane for training. It focuses on:
- tokenizer special tokens and segment ids
- sample encodings for `text` and `(text, target)` pairs
- label shapes and statistics
- simple data quality checks (malformed lines, default labels)

Run the cells sequentially, top to bottom, from a fresh kernel: later cells reuse variables (`train_ds`, `tokenizer`, `sample_row`, `enc_pair`) that earlier cells define.

In [None]:
# 1) Imports & helper utilities

import os
import sys
import json
import random
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from pprint import pprint

# Helper utils

def print_heading(text):
    """Print `text` as a section banner.

    The banner is `text` framed by six '#' characters on each side, preceded
    by a blank line and followed by a blank line.

    Args:
        text: Heading string to display (must be a `str`).
    """
    bar = "#" * 6
    banner = " ".join((bar, text, bar))
    print("\n" + banner + "\n")


def assert_ok(cond, msg):
    """Report a passed check, or fail loudly.

    Args:
        cond: Value tested for truthiness.
        msg: Description of the check; printed on success, used as the
            exception message on failure.

    Raises:
        AssertionError: If `cond` is falsy.
    """
    if cond:
        print("OK: ", msg)
        return
    raise AssertionError(msg)


# Visible confirmation that the import/helper cell ran to completion.
print("Imports done.")

In [None]:
# 2) Environment & dependency versions
# Reports interpreter/library versions and whether the files the later cells
# rely on can be found from the current working directory.

print_heading("Environment & Versions")
for version_label, version_value in (
    ("Python:", sys.version),
    ("Torch:", torch.__version__),
):
    print(version_label, version_value)

# Optional dependency: report its version, or say so if it cannot be used.
try:
    import transformers
    print("Transformers:", transformers.__version__)
except Exception:
    print("Transformers not installed or can't be imported")

# Optional dependency: same best-effort treatment as above.
try:
    import sklearn
    print("sklearn:", sklearn.__version__)
except Exception:
    print("sklearn not available")

print_heading("Working directory and key files")
print("CWD:", os.getcwd())
for file_label, relative_path in (
    ("Data file exists:", 'data/eng_laptop_train_alltasks.jsonl'),
    ("Dataloader exists:", 'dataloader.py'),
):
    # Paths are relative, so the answer depends on the CWD printed above.
    print(file_label, Path(relative_path).exists())


In [None]:
# 3) Setup variables and load dataset
# Defines the data path / model name used by the following cells and parses
# the raw JSONL file with the project's Dataloader.

# Defaults (change if you used a different path/model)
DATA_PATH = Path('data/eng_laptop_train_alltasks.jsonl')
MODEL_NAME = 'microsoft/deberta-v3-large'

print_heading('Paths & Model')
for setting_label, setting_value in (
    ('DATA_PATH:', DATA_PATH),
    ('MODEL_NAME:', MODEL_NAME),
):
    print(setting_label, setting_value)

# Dataloader lives in the local dataloader.py next to this notebook.
from dataloader import Dataloader

# Parse the file into flattened entries.
print_heading('Parsing data (this uses Dataloader._parse_jsonl)')
full_data = Dataloader._parse_jsonl(str(DATA_PATH))
print('Total flattened entries:', len(full_data))

# Preview at most the first three parsed entries.
print('\nShow 3 sample flattened entries:')
preview_entries = full_data[:3]
for position in range(len(preview_entries)):
    print(f"--- Sample {position} ---")
    pprint(preview_entries[position])


In [None]:
# 4) Tokenizer sanity checks and sample encodings
# Builds the train/val splits, prints the tokenizer's special tokens, encodes
# one sample both as text-only and as a (text, target) pair, and compares the
# two encodings.

print_heading('Tokenizer & encoding checks')

# Instantiate dataset (this will create a tokenizer inside Dataloader)
train_ds, val_ds = Dataloader.prepare_splits(str(DATA_PATH), MODEL_NAME)

print('Train size, Val size:', len(train_ds), len(val_ds))

# Access tokenizer
tokenizer = train_ds.tokenizer
print('Special tokens:')
print('CLS token:', tokenizer.cls_token, 'id:', tokenizer.cls_token_id)
print('SEP token:', tokenizer.sep_token, 'id:', tokenizer.sep_token_id)
print('Pad token:', tokenizer.pad_token, 'id:', tokenizer.pad_token_id)

# Show encoding for a sample
sample_idx = 0
sample_row = train_ds.data[sample_idx]
print('\nSample raw text and target:')
print('Text:', sample_row['Text'])
print('Target:', sample_row['Target'])
print('Labels (Valence, Arousal):', sample_row['Valence'], sample_row['Arousal'])

# One shared set of tokenizer options so the two encodings are directly
# comparable and the settings cannot drift apart between the two calls.
# NOTE(review): presumably this should match the max length used inside
# Dataloader -- confirm against dataloader.py.
SAMPLE_MAX_LENGTH = 128
encode_kwargs = dict(
    max_length=SAMPLE_MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors=None,  # plain Python lists, easy to slice and print
)
enc_pair = tokenizer(str(sample_row['Text']), str(sample_row['Target']), **encode_kwargs)
enc_text = tokenizer(str(sample_row['Text']), **encode_kwargs)

print('\nEncoding keys:', list(enc_pair.keys()))
print('First 50 input_ids (pair):', enc_pair['input_ids'][:50])
print('First 50 attention_mask (pair):', enc_pair['attention_mask'][:50])
if 'token_type_ids' in enc_pair:
    print('First 50 token_type_ids (pair):', enc_pair['token_type_ids'][:50])

# Convert ids to tokens for the first 50 tokens
tokens_pair = tokenizer.convert_ids_to_tokens(enc_pair['input_ids'][:50])
print('\nTokens (pair) first 50:\n', tokens_pair)

# Compare text-only encoding to pair encoding around the separator
ids_text = enc_text['input_ids']
ids_pair = enc_pair['input_ids']

# attention_mask is 1 for real tokens and 0 for padding, so its sum is the
# number of non-padding tokens in each encoding.
n_tokens_text = sum(enc_text['attention_mask'])
n_tokens_pair = sum(enc_pair['attention_mask'])
print('\nNon-padding tokens: text-only =', n_tokens_text, '| pair =', n_tokens_pair)

# Length of the leading run of ids the two encodings have in common. When the
# text is not truncated this is expected to cover the text segment.
shared_prefix_len = 0
for id_from_text, id_from_pair in zip(ids_text, ids_pair):
    if id_from_text != id_from_pair:
        break
    shared_prefix_len += 1
print('Leading ids shared by text-only and pair encodings:', shared_prefix_len)

# Find SEP tokens positions (if available)
sep_id = tokenizer.sep_token_id
sep_positions = [i for i, idv in enumerate(ids_pair) if idv == sep_id]
print('\nSEP positions in pair encoding:', sep_positions)

# Show the sub-token sequences around sep tokens
for pos in sep_positions:
    start = max(0, pos-10)
    end = min(len(ids_pair), pos+10)
    print('\nContext around SEP at pos', pos, ':')
    print(tokenizer.convert_ids_to_tokens(ids_pair[start:end]))


In [None]:
# 5) Labels & statistics
# Summarises the Valence/Arousal labels of the training split and the
# distribution of targets.

print_heading('Labels statistics')

# One (valence, arousal) tuple per training row, stacked into an (N, 2) array.
vals = [(row['Valence'], row['Arousal']) for row in train_ds.data]
val_arr = np.array(vals)
valence_col = val_arr[:, 0]
arousal_col = val_arr[:, 1]

print('Valence stats: mean=%.3f std=%.3f min=%.3f max=%.3f' % (
    valence_col.mean(), valence_col.std(), valence_col.min(), valence_col.max()))
print('Arousal stats: mean=%.3f std=%.3f min=%.3f max=%.3f' % (
    arousal_col.mean(), arousal_col.std(), arousal_col.min(), arousal_col.max()))

# Count default labels (5.0, 5.0) vs parsed
default_rows = [pair for pair in vals if pair[0] == 5.0 and pair[1] == 5.0]
default_count = len(default_rows)
print('Default labels (5.0,5.0) count:', default_count)

# Show distribution of unique targets
from collections import Counter
targets = [row['Target'] for row in train_ds.data]
target_counts = Counter(targets)
# A Counter has exactly one key per distinct target.
print('\nNumber of unique targets:', len(target_counts))
print('Most common targets:')
for target_name, occurrences in target_counts.most_common(10):
    print(f"  {target_name}: {occurrences}")


In [None]:
# 6) Consistency check: tokenizer vs model in train.py
# Best-effort static inspection of train.py: looks for
# `add_argument('--model_name', default=...)` and compares that default with
# MODEL_NAME used in this notebook. train.py is parsed, never executed.

import ast


def _find_argparse_default(source, flag):
    """Return the string `default=` of the `add_argument(flag, ...)` call in `source`.

    Args:
        source: Python source code to inspect.
        flag: Option string to look for among the positional arguments of
            `add_argument`, e.g. '--model_name'.

    Returns:
        The literal string passed as `default=` in the first matching call,
        or None when there is no such call or its default is not a string
        literal.

    Raises:
        SyntaxError, ValueError: If `source` cannot be parsed.
    """
    for node in ast.walk(ast.parse(source)):
        if not isinstance(node, ast.Call):
            continue
        # `parser.add_argument(...)` is an ast.Attribute whose name is in
        # `.attr`; a bare `add_argument(...)` is an ast.Name whose name is in
        # `.id`. Accept both forms.
        called_name = getattr(node.func, 'attr', None) or getattr(node.func, 'id', None)
        if called_name != 'add_argument':
            continue
        # String literals are ast.Constant nodes (ast.Str and `.s` were
        # removed in Python 3.12).
        option_strings = [
            arg.value for arg in node.args
            if isinstance(arg, ast.Constant) and isinstance(arg.value, str)
        ]
        if flag not in option_strings:
            continue
        for kw in node.keywords:
            if (kw.arg == 'default'
                    and isinstance(kw.value, ast.Constant)
                    and isinstance(kw.value.value, str)):
                return kw.value.value
    return None


print_heading('Consistency check with train.py')

# Try to read train.py argument default for model (best-effort)
train_model_name = None
try:
    with open('train.py', 'r', encoding='utf-8') as f:
        train_model_name = _find_argparse_default(f.read(), '--model_name')
except (OSError, SyntaxError, ValueError) as exc:
    # Missing, unreadable or unparsable train.py must not stop the notebook,
    # but say why nothing was detected instead of failing silently.
    print('Could not inspect train.py:', exc)

print('Model name in this notebook (MODEL_NAME):', MODEL_NAME)
print('Detected default in train.py:', train_model_name)
if train_model_name is None:
    print('\nCould not determine a default --model_name from train.py; consistency was NOT verified.')
elif train_model_name != MODEL_NAME:
    print('\nWARNING: model name used for tokenizer is different from train.py default. Use the same model for tokenizer and model weights to avoid mismatch.')
else:
    print('\nModel names are consistent.')


In [None]:
# 7) Simple data quality tests (assertions)
# Spot checks on the training split; only the first item is inspected.

print_heading('Quick assertion tests')

# Check 1: the training split holds at least one example.
assert_ok(len(train_ds) > 0, 'train dataset not empty')

# Check 2: the first item's labels are a 2-element tensor with values in [0, 10].
sample_lab = train_ds[0]['labels']
is_length_two_tensor = isinstance(sample_lab, torch.Tensor) and sample_lab.shape[0] == 2
assert_ok(is_length_two_tensor, 'labels are a torch tensor of length 2')
all_within_bounds = ((sample_lab >= 0) & (sample_lab <= 10)).all().item()
assert_ok(all_within_bounds, 'labels in reasonable range (0-10)')

# Check 3: token_type_ids are optional, so this only reports what it finds.
if 'token_type_ids' in train_ds[0]:
    print('token_type_ids present: first 20 ->', train_ds[0]['token_type_ids'][:20])
else:
    print('NOTE: token_type_ids not present in encoding. This is OK for many models (e.g., Deberta). If your model expects token_type_ids, switch to a model that supports them or add them.')

print('\nAll quick assertions passed (if no AssertionError above).')

In [None]:
# 8) Save a small sample of encodings to disk and reload to validate save/load
# Writes the sample row and its pair encoding to logs/sample_encoding.json,
# reads the file back and checks that the content survived the round trip.

print_heading('Save / Load validation')


def _json_default(obj):
    """Fallback for `json.dump`: convert array-like values through `.tolist()`.

    Covers values such as numpy scalars/arrays and torch tensors that the
    json module cannot encode natively.

    Raises:
        TypeError: If `obj` has no `tolist` method (the error json itself
            would raise for an unsupported type).
    """
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


to_save = {
    'sample_idx': sample_idx,
    'raw': sample_row,
    # The tokenizer returns a BatchEncoding, which is a UserDict and not a
    # dict; json only encodes real dicts, so convert it explicitly.
    'enc_pair': dict(enc_pair),
}

out_path = Path('logs') / 'sample_encoding.json'
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, 'w', encoding='utf-8') as fh:
    json.dump(to_save, fh, indent=2, ensure_ascii=False, default=_json_default)

print('Saved sample to', out_path)
with open(out_path, 'r', encoding='utf-8') as fh:
    loaded = json.load(fh)

assert_ok('raw' in loaded and 'enc_pair' in loaded, 'saved JSON contains expected keys')
# Key presence alone would pass even if the payload were mangled; compare the
# token ids that came back with the ones that were written.
assert_ok(
    loaded['enc_pair']['input_ids'] == list(enc_pair['input_ids']),
    'reloaded input_ids match the original encoding',
)
print('Save/load OK')

In [None]:
# 9) Reproducibility check: seeding
# Seeds the Python, NumPy and torch RNGs, then confirms that re-seeding NumPy
# reproduces the same random draw.

print_heading('Reproducibility checks (seeding)')

seed_value = 1234
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed_value)

# Draw five indices, reset NumPy's seed, draw again: both draws must agree.
n_examples = len(train_ds)
idxs1 = np.random.randint(0, n_examples, size=5)
np.random.seed(seed_value)
idxs2 = np.random.randint(0, n_examples, size=5)
draws_match = np.array_equal(idxs1, idxs2)
assert_ok(draws_match, 'random sampling reproducible with seed')
print('Example indices:', idxs1)


# 10) Final guidance
# Prints a short troubleshooting list for the symptoms the checks above can
# surface.

print_heading('Guidance & next steps')
print('If you see:')
print(" - token_type_ids missing: this is fine for many DeBERTa-like models; only BERT needs them.")
print(" - different tokenizer vs model: use the same model name for tokenizer and model weights (pass args.model_name into Dataloader.prepare_splits)")
print(" - constant labels (e.g., many 5.0): check source data parsing and the 'VA' field parsing logic")
print(" - labels out of expected range: make sure Valence/Arousal parsing uses correct separators and casts to float")

# The dash below was stored as mojibake (a UTF-8 em dash decoded as cp1252);
# restored to the intended character.
print('\nIf you want, I can execute these cells and show the outputs for your project — say `run it` and I will run the key cells and paste their outputs here.')