In [0]:
!pip install -r requirements.txt

In [0]:
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import torch
import re

In [0]:
# Location of the raw dataset, relative to the notebook's working directory.
df_path = 'Dataset/normalization_assesment_dataset_10k.csv'
# Load the CSV into the frame that every later cell works on.
df = pd.read_csv(df_path)
# Preview the loaded frame.
display(df)

In [0]:
# Number of missing raw texts before cleaning.
print(df['raw_comp_writers_text'].isnull().sum())

# `DataFrame.dropna()` returns a new frame, so calling it without assigning the
# result changes nothing. Whitespace-only strings are also not counted as
# missing until they are stripped and the resulting empty strings are turned
# into NA. The whole clean-up is therefore one assigned chain:
#   1. strip surrounding whitespace from the raw text,
#   2. turn empty strings (in any column) into missing values,
#   3. drop every row that has a missing value.
df = (
    df.assign(raw_comp_writers_text=df['raw_comp_writers_text'].str.strip())
      .replace('', pd.NA)
      .dropna()
)

# Number of missing raw texts after cleaning (expected: 0).
print(df['raw_comp_writers_text'].isnull().sum())

# Using Transformers (BERT)

The model might be biased toward predicting 0 for unseen tokens if it hasn't generalized well during training.

Tokenize the raw text and label each token: 1 if the token also appears in the tokenized normalized (clean) text, 0 otherwise.

In [0]:
# Shared tokenizer for the 'bert-base-uncased' checkpoint; used for building the
# training labels and again at inference time.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_label(row):
    """Encode one row's raw text and derive a keep/drop label for every token.

    The raw text is tokenized and padded/truncated to 64 positions. Each
    position gets a label:

    * ``-100`` for special tokens (ignored by the loss),
    * ``1`` if the token also occurs among the tokens of ``CLEAN_TEXT``,
    * ``0`` otherwise.

    Args:
        row: Mapping / Series with the keys ``'raw_comp_writers_text'`` and
            ``'CLEAN_TEXT'``.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of three equally long
        lists of ints.
    """
    raw_text = row['raw_comp_writers_text']
    clean_text = row['CLEAN_TEXT']

    # Without `return_tensors` the tokenizer returns plain Python lists, which
    # is the form this function returns anyway.
    encoding = tokenizer(
        raw_text,
        padding='max_length',
        truncation=True,
        max_length=64,
    )
    input_ids = list(encoding['input_ids'])
    attention_mask = list(encoding['attention_mask'])

    # Loop-invariant lookups: build them once per row instead of once per
    # token, and use sets for constant-time membership tests.
    special_tokens = set(tokenizer.all_special_tokens)
    clean_tokens = set(tokenizer.tokenize(clean_text))

    labels = [
        -100 if token in special_tokens else int(token in clean_tokens)
        for token in tokenizer.convert_ids_to_tokens(input_ids)
    ]

    return input_ids, attention_mask, labels

# Each cell of 'token_labels' holds the tuple (input_ids, attention_mask, labels)
# returned by tokenize_and_label for that row.
df.loc[:, 'token_labels'] = df.apply(tokenize_and_label, axis=1)

In [0]:
# Inspect the frame again, now including the 'token_labels' column.
display(df)

Splitting into train, validation and test sets

In [0]:
# First split: 70% of the rows for training, 30% held out.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=98)
# Second split of the held-out 30%: 67% validation / 33% test,
# i.e. roughly 20% / 10% of the full dataset.
val_df, test_df = train_test_split(temp_df, test_size=0.33, random_state=98)  

# Sanity check of the resulting split sizes.
print(len(train_df), len(val_df), len(test_df))

In [0]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original index.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

The NormalizationDataset class is a PyTorch Dataset designed to convert rows from a DataFrame into inputs suitable for a BERT-based token classification model.

In [0]:
class NormalizationDataset(Dataset):
    """Dataset view over a DataFrame with a precomputed 'token_labels' column.

    Every cell of 'token_labels' is expected to be a 3-tuple
    ``(input_ids, attention_mask, labels)``; each item is returned as a dict of
    ``torch.long`` tensors under those three keys.
    """

    def __init__(self, df):
        # Work on a copy with a clean positional index.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        ids, mask, tags = self.df['token_labels'].iloc[idx]

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return dict(
            input_ids=as_long(ids),
            attention_mask=as_long(mask),
            labels=as_long(tags),
        )

# Wrap each split in a NormalizationDataset.
train_dataset, val_dataset, test_dataset = map(
    NormalizationDataset, (train_df, val_df, test_df)
)

In [0]:
# Print the first ten encoded training examples as a sanity check.
for number, position in enumerate(range(10), start=1):
    sample = train_dataset[position]
    print(f"Example {number}:")
    for heading, key in (
        ("Input IDs", 'input_ids'),
        ("Attention Mask", 'attention_mask'),
        ("Labels", 'labels'),
    ):
        print(f"{heading}: {sample[key]}")
    print("\n")

## Fine-tune BERT for token classification

In [0]:
# Binary token classifier: label 1 = keep the token, label 0 = drop it.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='/tmp/results',  # Save checkpoints to a temporary directory because of memory issues
    # Evaluate and checkpoint once per epoch. A step-based `eval_steps` value
    # has no effect with this strategy, so it is not set.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Restore the best checkpoint at the end; "best" (and early stopping below)
    # is judged by the lowest validation loss, stated explicitly here.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    per_device_train_batch_size=16,
    num_train_epochs=20,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)

# Every example was already padded to a fixed length in tokenize_and_label,
# so the collator effectively only has to batch equally sized sequences.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    # Stop when the monitored metric has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

So far so good, probably. (Original note, in transliterated Greek: "MEXRI EDW KALA MALLON".)

In [0]:
import shutil

# Persist the checkpoints from the temporary training directory next to the
# notebook. `dirs_exist_ok=True` (Python 3.8+) lets this cell be re-run when
# './results' already exists instead of failing; files with the same name are
# overwritten, other existing files are left in place.
try:
    shutil.copytree('/tmp/results', './results', dirs_exist_ok=True)
    print("Succesfully copied tmp results to cd")
except Exception as e:
    # Best effort: report the problem but keep the notebook running.
    print(f"Error copying tmp results to cd: {e}")

## Use the fine-tuned model to predict and reconstruct normalized text.

## Test model's performance

In [0]:
def evaluate_on_test(test_df, model, tokenizer):
    """Compute token-level precision, recall and F1 on a held-out frame.

    Each row's precomputed ``'token_labels'`` tuple
    ``(input_ids, attention_mask, labels)`` is run through the model one row at
    a time. Positions labelled ``-100`` are excluded from the metrics.

    Args:
        test_df: DataFrame with a ``'token_labels'`` column.
        model: Token classification model; it is moved to the available device
            and switched to evaluation mode as a side effect.
        tokenizer: Unused; kept so existing calls keep working.

    Returns:
        Tuple ``(precision, recall, f1)``, each computed with
        ``average='weighted'``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Evaluation mode disables dropout so the predictions are deterministic.
    model.eval()

    all_predictions = []
    all_labels = []

    # No gradients are needed for scoring; skipping autograd saves memory and time.
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_df['token_labels']:
            # Add a batch dimension of size 1 for the single example.
            precomputed = {
                'input_ids': torch.tensor(input_ids).unsqueeze(0).to(device),
                'attention_mask': torch.tensor(attention_mask).unsqueeze(0).to(device)
            }

            outputs = model(**precomputed)
            predictions = torch.argmax(outputs.logits, dim=2).squeeze(0).cpu().numpy()

            true_labels = torch.tensor(labels).numpy()
            valid_indices = true_labels != -100  # Exclude ignored tokens (e.g., [PAD], [CLS], [SEP])

            all_predictions.extend(predictions[valid_indices])
            all_labels.extend(true_labels[valid_indices])

    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
    return precision, recall, f1

# Score the fine-tuned model on the held-out test split.
precision, recall, f1 = evaluate_on_test(test_df, model, tokenizer)

## Test model on custom text

In [0]:
def test_custom_text(raw_text, model, tokenizer):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    inputs = tokenizer(
        raw_text,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=128
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2).squeeze(0).cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0))
    normalized_tokens = [
        token for token, label in zip(tokens, predictions)
        if label == 1 and token not in tokenizer.all_special_tokens
    ]

    normalized_text = tokenizer.convert_tokens_to_string(normalized_tokens)

    return normalized_text.title()

In [0]:
# Hand-picked writer strings used to eyeball the model's normalization.
custom_texts = [
    "Tony Grace/Rob DeBoer",
    "Budde Music/Lorenz Brunner",
    "Jordan Riley/Adam Argyle/Copyright Control"
]

separator = "-" * 40
for number, custom_text in enumerate(custom_texts, start=1):
    normalized_output = test_custom_text(custom_text, model, tokenizer)
    report = (
        f"Example {number}:",
        f"Raw Text: {custom_text}",
        f"Normalized Text: {normalized_output}",
        separator,
    )
    for line in report:
        print(line)