# Batch 1: Import Libraries and Dataset Preparation


In [26]:
# Batch 1: Import Libraries and Dataset Preparation

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Model, GPT2PreTrainedModel
from transformers import AdamW
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss

# Define NER mappings
ner_map = {
    "O": 0,  # Outside of a named entity
    "B-ORDER_NUMBER": 1,  # Beginning of an order number
    "I-ORDER_NUMBER": 2,  # Inside an order number
    "B-INVOICE_NUMBER": 3,  # Beginning of an invoice number
    "I-INVOICE_NUMBER": 4,  # Inside an invoice number
    "B-ONLINE_ORDER_INTERACTION": 5,  # Beginning of an online order interaction
    "I-ONLINE_ORDER_INTERACTION": 6,  # Inside an online order interaction
    "B-ONLINE_PAYMENT_INTERACTION": 7,  # Beginning of an online payment interaction
    "I-ONLINE_PAYMENT_INTERACTION": 8,  # Inside an online payment interaction
    "B-ONLINE_NAVIGATION_STEP": 9,  # Beginning of an online navigation step
    "I-ONLINE_NAVIGATION_STEP": 10,  # Inside an online navigation step
    "B-ONLINE_CUSTOMER_SUPPORT_CHANNEL": 11,  # Beginning of an online customer support channel
    "I-ONLINE_CUSTOMER_SUPPORT_CHANNEL": 12,  # Inside an online customer support channel
    "B-PROFILE": 13,  # Beginning of a profile
    "I-PROFILE": 14,  # Inside a profile
    "B-PROFILE_TYPE": 15,  # Beginning of a profile type
    "I-PROFILE_TYPE": 16,  # Inside a profile type
    "B-SETTINGS": 17,  # Beginning of settings
    "I-SETTINGS": 18,  # Inside settings
    "B-ONLINE_COMPANY_PORTAL_INFO": 19,  # Beginning of online company portal info
    "I-ONLINE_COMPANY_PORTAL_INFO": 20,  # Inside online company portal info
    "B-DATE": 21,  # Beginning of a date
    "I-DATE": 22,  # Inside a date
    "B-DATE_RANGE": 23,  # Beginning of a date range
    "I-DATE_RANGE": 24,  # Inside a date range
    "B-SHIPPING_CUT_OFF_TIME": 25,  # Beginning of a shipping cut-off time
    "I-SHIPPING_CUT_OFF_TIME": 26,  # Inside a shipping cut-off time
    "B-DELIVERY_CITY": 27,  # Beginning of a delivery city
    "I-DELIVERY_CITY": 28,  # Inside a delivery city
}

# Function to generate NER labels based on entities
def generate_ner_labels(text, entities):
    labels = ["O"] * len(text.split())
    for entity, label in entities.items():
        entity_tokens = entity.split()
        for i in range(len(text.split()) - len(entity_tokens) + 1):
            if text.split()[i:i+len(entity_tokens)] == entity_tokens:
                labels[i] = f"B-{label}"
                for j in range(1, len(entity_tokens)):
                    labels[i+j] = f"I-{label}"
    return labels

class CustomNERDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, ner_map):
        self.data = data
        self.tokenizer = tokenizer
    
        self.max_length = max_length
        self.ner_map = ner_map

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]  # Access by row index
        text = f"{row['instruction']} {row['response']}"  # NER input from instruction + response
        entities = {
            "{{Order Number}}": "ORDER_NUMBER",
            "{{Invoice Number}}": "INVOICE_NUMBER",
            "{{Online Order Interaction}}": "ONLINE_ORDER_INTERACTION",
            "{{Online Payment Interaction}}": "ONLINE_PAYMENT_INTERACTION",
            "{{Online Navigation Step}}": "ONLINE_NAVIGATION_STEP",
            "{{Online Customer Support Channel}}": "ONLINE_CUSTOMER_SUPPORT_CHANNEL",
            "{{Profile}}": "PROFILE",
            "{{Profile Type}}": "PROFILE_TYPE",
            "{{Settings}}": "SETTINGS",
            "{{Online Company Portal Info}}": "ONLINE_COMPANY_PORTAL_INFO",
            "{{Date}}": "DATE",
            "{{Date Range}}": "DATE_RANGE",
            "{{Shipping Cut-off Time}}": "SHIPPING_CUT_OFF_TIME",
            "{{Delivery City}}": "DELIVERY_CITY",
        }
        ner_labels = generate_ner_labels(text, entities)

        # Tokenize text with padding and truncation
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",  # Padding will use the newly defined pad_token
            truncation=True,
            return_tensors="pt"
        )

        # Map NER labels to numerical labels
        ner_label_ids = [self.ner_map[label] for label in ner_labels]
        ner_label_ids += [0] * (self.max_length - len(ner_label_ids))  # Pad NER labels to max_length

        # Return inputs and task-specific labels
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "ner_labels": torch.tensor(ner_label_ids, dtype=torch.long)  # Map NER labels to numerical labels
        }

# Load your dataset
df = pd.read_csv('../../dataset/Bitext.csv')


# Batch 2: DataLoader and Train-Validation Split


In [27]:
# Batch 2: DataLoader and Train-Validation Split

# Assuming df is your full dataset, apply train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token for GPT-2

# Create dataset instances
train_dataset = CustomNERDataset(train_df, tokenizer, max_length=128, ner_map=ner_map)
val_dataset = CustomNERDataset(val_df, tokenizer, max_length=128, ner_map=ner_map)

# Custom collate function to handle padding
def collate_fn(batch):
    max_length = max([len(item['input_ids']) for item in batch])
    input_ids = torch.stack([torch.nn.functional.pad(item['input_ids'], (0, max_length - len(item['input_ids']))) for item in batch])
    attention_mask = torch.stack([torch.nn.functional.pad(item['attention_mask'], (0, max_length - len(item['attention_mask']))) for item in batch])
    ner_labels = torch.stack([torch.nn.functional.pad(item['ner_labels'], (0, max_length - len(item['ner_labels']))) for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'ner_labels': ner_labels
    }

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


# Batch 3: Model Setup


In [28]:
# Batch 3: Model Setup

class NERModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.gpt2 = GPT2Model(config)
        self.classifier = torch.nn.Linear(config.hidden_size, self.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits) if loss is not None else logits

# Initialize the model
ner_model = NERModel.from_pretrained("gpt2", num_labels=len(ner_map))  # Use the length of ner_map for num_labels

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ner_model.to(device)


Some weights of NERModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'gpt2.h.0.attn.c_attn.bias', 'gpt2.h.0.attn.c_attn.weight', 'gpt2.h.0.attn.c_proj.bias', 'gpt2.h.0.attn.c_proj.weight', 'gpt2.h.0.ln_1.bias', 'gpt2.h.0.ln_1.weight', 'gpt2.h.0.ln_2.bias', 'gpt2.h.0.ln_2.weight', 'gpt2.h.0.mlp.c_fc.bias', 'gpt2.h.0.mlp.c_fc.weight', 'gpt2.h.0.mlp.c_proj.bias', 'gpt2.h.0.mlp.c_proj.weight', 'gpt2.h.1.attn.c_attn.bias', 'gpt2.h.1.attn.c_attn.weight', 'gpt2.h.1.attn.c_proj.bias', 'gpt2.h.1.attn.c_proj.weight', 'gpt2.h.1.ln_1.bias', 'gpt2.h.1.ln_1.weight', 'gpt2.h.1.ln_2.bias', 'gpt2.h.1.ln_2.weight', 'gpt2.h.1.mlp.c_fc.bias', 'gpt2.h.1.mlp.c_fc.weight', 'gpt2.h.1.mlp.c_proj.bias', 'gpt2.h.1.mlp.c_proj.weight', 'gpt2.h.10.attn.c_attn.bias', 'gpt2.h.10.attn.c_attn.weight', 'gpt2.h.10.attn.c_proj.bias', 'gpt2.h.10.attn.c_proj.weight', 'gpt2.h.10.ln_1.bias', 'gpt2.h.10.ln_1.weight', 'gpt2.h.10.ln_2.bias', 'gpt2.h.1

OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 13.19 MiB is free. Including non-PyTorch memory, this process has 3.79 GiB memory in use. Of the allocated memory 3.65 GiB is allocated by PyTorch, and 49.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Batch 4: Loss Functions and Optimizer


In [24]:
# Batch 4: Loss Functions and Optimizer

def compute_loss(ner_logits, ner_labels):
    loss_fct = CrossEntropyLoss()
    ner_loss = loss_fct(ner_logits.view(-1, ner_logits.size(-1)), ner_labels.view(-1))
    return ner_loss

# Initialize optimizer
optimizer = AdamW(ner_model.parameters(), lr=2e-5)




# Batch 5: Training Loop


In [25]:
# Batch 5: Training Loop

def train_epoch(ner_model, dataloader, optimizer, device):
    ner_model.train()
    total_loss = 0
    batch_count = 0

    for batch in tqdm(dataloader, desc="Training Batches"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        ner_labels = batch['ner_labels'].to(device)

        # Step 1: Get NER logits (token-level task)
        ner_logits = ner_model(input_ids=input_ids, attention_mask=attention_mask)[1]

        # Step 2: Compute loss for this batch
        loss = compute_loss(ner_logits=ner_logits, ner_labels=ner_labels)

        # Step 3: Backward pass
        optimizer.zero_grad()  # Reset gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update model weights

        # Accumulate total loss
        total_loss += loss.item()
        batch_count += 1

    # Return the average loss for the epoch
    avg_loss = total_loss / batch_count
    return avg_loss

# Main Training Loop
for epoch in range(3):  # Total number of epochs
    print(f"Epoch {epoch + 1}/3")
    
    # Train for one epoch
    avg_loss = train_epoch(ner_model, train_dataloader, optimizer, device)
    
    # Print the average loss for the epoch
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")


Epoch 1/3


Training Batches:   0%|          | 0/1344 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 13.19 MiB is free. Including non-PyTorch memory, this process has 3.79 GiB memory in use. Of the allocated memory 3.66 GiB is allocated by PyTorch, and 43.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

usage: ipykernel_launcher.py [-h] --train_file TRAIN_FILE --eval_file
                             EVAL_FILE --model MODEL [--bert_model BERT_MODEL]
                             [--xlnet_model XLNET_MODEL]
                             [--gpt2_model GPT2_MODEL]
                             [--gpt2_classification_type GPT2_CLASSIFICATION_TYPE]
                             [--train_batch_size TRAIN_BATCH_SIZE] [--gpu GPU]
                             [--eval_batch_size EVAL_BATCH_SIZE]
                             [--learning_rate LEARNING_RATE]
                             [--num_train_epochs NUM_TRAIN_EPOCHS]
                             [--prob_threshold PROB_THRESHOLD]
                             [--max_seq_length MAX_SEQ_LENGTH]
ipykernel_launcher.py: error: the following arguments are required: --train_file, --eval_file, --model


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Batch 6: Validation Loop


In [None]:
# Batch 6: Validation Loop

def validate(ner_model, dataloader, device):
    ner_model.eval()
    total_loss = 0
    batch_count = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating Batches"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ner_labels = batch['ner_labels'].to(device)

            # Step 1: Get NER logits (token-level task)
            ner_logits = ner_model(input_ids=input_ids, attention_mask=attention_mask)[1]

            # Step 2: Compute loss for this batch
            loss = compute_loss(ner_logits=ner_logits, ner_labels=ner_labels)

            # Accumulate total loss
            total_loss += loss.item()
            batch_count += 1

    # Return the average loss for the validation
    avg_loss = total_loss / batch_count
    return avg_loss

# Validate the model
avg_val_loss = validate(ner_model, val_dataloader, device)
print(f"Average Validation Loss: {avg_val_loss}")


# Batch 7: Save the model


In [None]:
# Batch 7: Save Model and Tokenizer

# Save the NER model
ner_model.save_pretrained("path_to_save_ner_model")
# Save the tokenizer
tokenizer.save_pretrained("path_to_save_tokenizer")


NameError: name 'ner_model' is not defined

# Batch 8: Inference


In [None]:
# Batch 8: Inference

def inference(ner_model, tokenizer, text, device):
    # Tokenize text with padding and truncation
    encoding = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Step 1: Get NER logits (token-level task)
    ner_logits = ner_model(input_ids=input_ids, attention_mask=attention_mask)[1]
    ner_predictions = torch.argmax(ner_logits, dim=-1).cpu().numpy()

    return ner_predictions

# Example usage
text = "Your example text here"
ner_predictions = inference(ner_model, tokenizer, text, device)
print(f"NER Predictions: {ner_predictions}")
