In [1]:
import pandas as pd

In [20]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# 1. Tag inventory, exactly as described in the PDF.
#    A label's position in this list is its integer id, so the order matters.
label_list = [
    "O",
    "B-Object", "I-Object",
    "B-Aspect", "I-Aspect",
    "B-Predicate", "I-Predicate",
]

# id -> label is derived first; label -> id is its inverse (same ordering).
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Labels defined: {label2id}")

# 2. Function to read the CoNLL/TSV file
def read_conll_file(file_path, has_labels=True):
    """Read a tab-separated, one-token-per-line file into per-sentence lists.

    Every non-blank line is split on tabs: the first field is the token and,
    when ``has_labels`` is true, the second field is its tag string. A blank
    (or whitespace-only) line ends the current sentence; a final sentence that
    is not followed by a blank line is still kept.

    Tag strings are converted to integer ids through the module-level
    ``label2id`` mapping. A line without a second field, or a tag that is not
    in ``label2id``, falls back to id 0 ("O"). Unknown tag strings are
    additionally reported once per call through ``warnings.warn`` so that
    typos in the data do not silently turn into "O".

    Args:
        file_path: Path of the file to read (UTF-8, with or without a BOM).
        has_labels: Whether to read the tag column. When false, ``label2id``
            is not consulted and ``'ner_tags'`` in the result stays empty.

    Returns:
        A dict with two keys:
        ``'tokens'``   - list of sentences, each a list of token strings;
        ``'ner_tags'`` - list of tag-id lists parallel to ``'tokens'``
                         (an empty list when ``has_labels`` is false).
    """
    import warnings  # function-scope import keeps this cell self-contained

    data = {'tokens': [], 'ner_tags': []}
    current_tokens = []
    current_labels = []
    unknown_labels = set()

    def _flush_sentence():
        """Store the sentence collected so far (if any) and start a new one."""
        nonlocal current_tokens, current_labels
        if current_tokens:
            data['tokens'].append(current_tokens)
            if has_labels:
                data['ner_tags'].append(current_labels)
            current_tokens = []
            current_labels = []

    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise end up glued onto the first token of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()

            # Blank line: sentence boundary
            if not line:
                _flush_sentence()
                continue

            # After strip() the line is non-empty and cannot start with a tab,
            # so the first field is always a non-empty token.
            parts = line.split('\t')
            current_tokens.append(parts[0])

            if has_labels:
                # Missing tag column -> "O"; unknown tag -> id 0 (and remembered
                # so it can be reported after the file has been read).
                label_str = parts[1] if len(parts) > 1 else "O"
                if label_str not in label2id:
                    unknown_labels.add(label_str)
                current_labels.append(label2id.get(label_str, 0))

    # Catch the last sentence if the file does not end with a blank line
    _flush_sentence()

    if unknown_labels:
        warnings.warn(
            f"read_conll_file: unknown label(s) mapped to 'O' (id 0): {sorted(unknown_labels)}",
            stacklevel=2,
        )

    return data

Labels defined: {'O': 0, 'B-Object': 1, 'I-Object': 2, 'B-Aspect': 3, 'I-Aspect': 4, 'B-Predicate': 5, 'I-Predicate': 6}


In [25]:
# 3. Parse the labelled training file into per-sentence token / tag-id lists
raw_data = read_conll_file("train.tsv", has_labels=True)

# 4. Hold out 10% of the sentences for validation (90% train).
#    Tokens and tags go through the same call so the two lists stay paired;
#    random_state is fixed so the split is the same on every run.
(train_tokens, val_tokens,
 train_labels, val_labels) = train_test_split(
    raw_data['tokens'], raw_data['ner_tags'], test_size=0.1, random_state=42
)

# 5. Wrap each split in a Hugging Face Dataset and bundle both in a DatasetDict
train_dataset = Dataset.from_dict({'tokens': train_tokens, 'ner_tags': train_labels})
valid_dataset = Dataset.from_dict({'tokens': val_tokens, 'ner_tags': val_labels})
dataset = DatasetDict({'train': train_dataset, 'validation': valid_dataset})

# 6. Sanity check: split sizes plus the first training example
print(
    "Data loaded successfully!",
    f"Training sentences: {len(dataset['train'])}",
    f"Validation sentences: {len(dataset['validation'])}",
    "\nExample sentence 0:",
    sep="\n",
)
first_train_example = dataset['train'][0]
print(first_train_example['tokens'])
print(first_train_example['ner_tags'])

Data loaded successfully!
Training sentences: 2100
Validation sentences: 234

Example sentence 0:
['in', 'a', '2005', 'article', ',', 'curt', 'hibbs', 'famously', 'claimed', 'that', 'ruby', 'on', 'rails', 'gives', '10', 'times', 'greater', 'productivity', 'than', 'a', 'typical', 'java', 'framework', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 5, 3, 0, 0, 0, 1, 0, 0]


In [26]:
import pandas as pd

# 1. One row per training sentence: its tokens and the matching integer tag ids
df_train = pd.DataFrame({'tokens': train_tokens, 'ner_tags_ids': train_labels})

# 2. Human-readable tags: translate every id through id2label (e.g. 1 -> "B-Object")
df_train['ner_tags_str'] = df_train['ner_tags_ids'].map(
    lambda tag_ids: [id2label[tag_id] for tag_id in tag_ids]
)

# 3. Number of tokens per sentence (handy for checking input lengths)
df_train['sentence_length'] = df_train['tokens'].map(len)

# 4. Show the first 5 rows; max_colwidth=None stops pandas from truncating the lists
pd.set_option('display.max_colwidth', None)
display(df_train.head())

# --- OPTIONAL: Detailed View ---
# Print one sentence word by word next to its tag, to check the alignment by eye.
print("\n--- Detailed View of Sentence #0 ---")
sample_idx = 0
sample_row = df_train.iloc[sample_idx]
for word, label in zip(sample_row['tokens'], sample_row['ner_tags_str']):
    print(f"{word:<20} {label}")

Unnamed: 0,tokens,ner_tags_ids,ner_tags_str,sentence_length
0,"[in, a, 2005, article, ,, curt, hibbs, famously, claimed, that, ruby, on, rails, gives, 10, times, greater, productivity, than, a, typical, java, framework, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 5, 3, 0, 0, 0, 1, 0, 0]","[O, O, O, O, O, O, O, O, O, O, B-Object, O, O, O, O, O, B-Predicate, B-Aspect, O, O, O, B-Object, O, O]",24
1,"[amazon, will, come, up, with, a, better, tablet, ,, but, the, quality, will, most, likely, never, match, ipad, ,, since, apple, can, afford, to, deliver, superior, quality, by, setting, price, points, at, much, higher, levels, .]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 5, 3, 0, 0, 3, 0, 0, 0, 5, 0, 0]","[B-Object, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-Object, O, O, O, O, B-Predicate, B-Aspect, O, O, B-Aspect, O, O, O, B-Predicate, O, O]",36
2,"[microsoft, is, offering, verizon, better, terms, than, google, has, (, and, possibly, ever, will, ),, including, better, revenue, sharing, and, guarantees, of, higher, payments, .]","[1, 0, 0, 0, 5, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 5, 3, 0]","[B-Object, O, O, O, B-Predicate, B-Aspect, O, B-Object, O, O, O, O, O, O, O, O, B-Predicate, B-Aspect, O, O, O, O, B-Predicate, B-Aspect, O]",25
3,"[however, ,, the, golf, ball, is, much, heavier, than, the, table, tennis, ball, .]","[0, 0, 0, 1, 3, 0, 0, 5, 0, 0, 0, 1, 0, 0]","[O, O, O, B-Object, B-Aspect, O, O, B-Predicate, O, O, O, B-Object, O, O]",14
4,"[these, figures, should, appear, at, least, a, little, bit, surprising, as, wii, and, ds, both, have, more, top, hit, software, than, ps2, even, though, the, ps2, hardware, base, was, bigger, than, either, platform, in, 2000, -, 2009, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0]","[O, O, O, O, O, O, O, O, O, O, O, B-Object, O, B-Object, O, O, O, O, O, O, O, B-Object, O, O, O, B-Object, O, O, O, B-Predicate, O, O, O, O, O, O, O, O]",38



--- Detailed View of Sentence #0 ---
in                   O
a                    O
2005                 O
article              O
,                    O
curt                 O
hibbs                O
famously             O
claimed              O
that                 O
ruby                 B-Object
on                   O
rails                O
gives                O
10                   O
times                O
greater              B-Predicate
productivity         B-Aspect
than                 O
a                    O
typical              O
java                 B-Object
framework            O
.                    O


In [27]:
# Same word-by-word view as above, this time for the second training sentence.
print("\n--- Detailed View of Sentence #1 ---")
sample_idx = 1
sample_row = df_train.iloc[sample_idx]
for word, label in zip(sample_row['tokens'], sample_row['ner_tags_str']):
    print(f"{word:<20} {label}")


--- Detailed View of Sentence #1 ---
amazon               B-Object
will                 O
come                 O
up                   O
with                 O
a                    O
better               O
tablet               O
,                    O
but                  O
the                  O
quality              O
will                 O
most                 O
likely               O
never                O
match                O
ipad                 O
,                    O
since                O
apple                B-Object
can                  O
afford               O
to                   O
deliver              O
superior             B-Predicate
quality              B-Aspect
by                   O
setting              O
price                B-Aspect
points               O
at                   O
much                 O
higher               B-Predicate
levels               O
.                    O


In [28]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded here.
model_checkpoint = "roberta-base"
# add_prefix_space=True: per the Hugging Face RoBERTa tokenizer documentation this
# is needed when the tokenizer is fed pre-split words (is_split_into_words=True,
# as tokenize_and_align_labels does below).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Uses the module-level ``tokenizer``. For every sentence in the batch, each
    produced token position receives:

    * ``-100`` if it is a special token (its word id is ``None``), so it is
      ignored in the loss;
    * the word's tag if it is the first sub-token of that word;
    * ``-100`` if it continues the same word as the previous position.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of tag-id lists, parallel to ``"tokens"``).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one aligned label list per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        token_word_ids = list(encoded.word_ids(batch_index=row))
        # Word id of the position just before each token (None before the first),
        # used to tell the first sub-token of a word from its continuations.
        preceding_ids = [None] + token_word_ids[:-1]
        aligned.append([
            -100 if (word_id is None or word_id == prev_id) else word_tags[word_id]
            for word_id, prev_id in zip(token_word_ids, preceding_ids)
        ])

    encoded["labels"] = aligned
    return encoded

# Apply this function to the Hugging Face Dataset we created earlier.
# batched=True hands the function whole batches (column name -> list of rows),
# which is what tokenize_and_align_labels expects: it tokenizes examples["tokens"]
# in one call and looks up word ids per row via batch_index.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

In [29]:
# Display the tokenized splits to check their columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 234
    })
})