In [1]:
import pandas as pd

# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../dataset/dataset.csv')

# Inspect the dataset: first rows, then the column / dtype summary
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()


   flags                                        instruction category  \
0      B   question about cancelling order {{Order Number}}    ORDER   
1    BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2   BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3     BL         I need to cancel purchase {{Order Number}}    ORDER   
4  BCELN  I cannot afford this order, cancel purchase {{...    ORDER   

         intent                                           response  
0  cancel_order  I've understood you have a question regarding ...  
1  cancel_order  I've been informed that you have a question ab...  
2  cancel_order  I can sense that you're seeking assistance wit...  
3  cancel_order  I understood that you need assistance with can...  
4  cancel_order  I'm sensitive to the fact that you're facing f...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
--- 

In [2]:
import re

# Function to extract NER entities
def extract_ner_entities(instruction):
    """Return the names found inside ``{{...}}`` placeholders of *instruction*.

    Names are returned in order of appearance, without the surrounding
    braces. An instruction with no placeholder yields an empty list.
    """
    placeholder_matches = re.finditer(r'\{\{([^{}]*)\}\}', instruction)
    return [match.group(1) for match in placeholder_matches]

# Store the placeholder names of every instruction in a new column
df['ner_entities'] = df['instruction'].map(extract_ner_entities)

# Preview: first rows, instruction next to its extracted entities
print(df.head()[['instruction', 'ner_entities']])


                                         instruction    ner_entities
0   question about cancelling order {{Order Number}}  [Order Number]
1  i have a question about cancelling oorder {{Or...  [Order Number]
2    i need help cancelling puchase {{Order Number}}  [Order Number]
3         I need to cancel purchase {{Order Number}}  [Order Number]
4  I cannot afford this order, cancel purchase {{...  [Order Number]


In [4]:
# Function to preprocess text
def preprocess_text(text):
    """Lowercase *text* and strip everything except letters, digits,
    whitespace and the ``{`` / ``}`` placeholder braces.

    Returns the cleaned string.
    """
    lowered = text.lower()
    # Runs of any other character (punctuation, underscores, symbols) are removed.
    return re.sub(r'[^a-zA-Z0-9\s\{\}]+', '', lowered)

# Clean the three text columns in place (same order as before:
# instruction, response, category)
for text_column in ('instruction', 'response', 'category'):
    df[text_column] = df[text_column].apply(preprocess_text)

# Preview the cleaned columns alongside the extracted entities
print(df.loc[:, ['instruction', 'response', 'category', 'ner_entities']].head())


                                         instruction  \
0   question about cancelling order {{order number}}   
1  i have a question about cancelling oorder {{or...   
2    i need help cancelling puchase {{order number}}   
3         i need to cancel purchase {{order number}}   
4  i cannot afford this order cancel purchase {{o...   

                                            response    ner_entities  
0  ive understood you have a question regarding c...  [Order Number]  
1  ive been informed that you have a question abo...  [Order Number]  
2  i can sense that youre seeking assistance with...  [Order Number]  
3  i understood that you need assistance with can...  [Order Number]  
4  im sensitive to the fact that youre facing fin...  [Order Number]  


In [5]:
# Select the five columns to export, in this order
# (a missing column raises KeyError here)
df_final = df.loc[:, ['instruction', 'intent', 'category', 'response', 'ner_entities']]

# Write the preprocessed dataset to CSV without the index column
df_final.to_csv('preprocessed_dataset.csv', index=False)

# Preview the first rows of the exported frame
print(df_final.head())


                                         instruction        intent category  \
0   question about cancelling order {{order number}}  cancel_order    ORDER   
1  i have a question about cancelling oorder {{or...  cancel_order    ORDER   
2    i need help cancelling puchase {{order number}}  cancel_order    ORDER   
3         i need to cancel purchase {{order number}}  cancel_order    ORDER   
4  i cannot afford this order cancel purchase {{o...  cancel_order    ORDER   

                                            response    ner_entities  
0  ive understood you have a question regarding c...  [Order Number]  
1  ive been informed that you have a question abo...  [Order Number]  
2  i can sense that youre seeking assistance with...  [Order Number]  
3  i understood that you need assistance with can...  [Order Number]  
4  im sensitive to the fact that youre facing fin...  [Order Number]  


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the other 80% (train_temp_df) is
# split again below. Stratifying on (intent, category) keeps that mix the same
# on both sides of the split.
train_temp_df, test_df = train_test_split(
    df_final,
    test_size=0.2,
    random_state=42,
    stratify=df_final[['intent', 'category']],
)

# Take 25% of the remaining rows as the validation set (20% of the full
# dataset), which leaves 60% of the full dataset for training.
train_df, val_df = train_test_split(
    train_temp_df,
    test_size=0.25,
    random_state=42,
    stratify=train_temp_df[['intent', 'category']],
)

# Write each split to its own CSV file, without the index column
for split_frame, csv_path in (
    (train_df, 'train_dataset.csv'),
    (val_df, 'val_dataset.csv'),
    (test_df, 'test_dataset.csv'),
):
    split_frame.to_csv(csv_path, index=False)

# Preview the first rows of each split
for split_frame in (train_df, val_df, test_df):
    print(split_frame.head())


                                             instruction               intent  \
19593                 how to shop several of you article          place_order   
10562  i want to create a {{account type}} account fo...       create_account   
9654              i want help talking with a human agent  contact_human_agent   
7505            help lodging a claim against ur business            complaint   
22800       i try to leave some feedback about a service               review   

       category                                           response  \
19593     ORDER  thank you for your interest in shopping severa...   
10562   ACCOUNT  thats fantastic its great to hear that youre i...   
9654    CONTACT  thank you for reaching out its great to hear t...   
7505   FEEDBACK  im sorry to hear that youre encountering an is...   
22800  FEEDBACK  we appreciate your efforts to provide feedback...   

         ner_entities  
19593              []  
10562  [Account Type]  
9654               [

In [10]:
from transformers import GPT2Tokenizer, DistilBertTokenizer

# Load the pretrained 'gpt2' tokenizer (only GPT-2 is loaded in this cell).
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token, so the padded batches
# requested with padding=True in prepare_ner_data have a pad token to use.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Function to tokenize and prepare NER labels
def prepare_ner_data(tokenizer, df):
    """Tokenize ``df['instruction']`` and attach one NER label per token.

    Parameters
    ----------
    tokenizer : callable tokenizer that also exposes ``tokenize`` and
        ``convert_tokens_to_ids`` (both slow and fast Hugging Face tokenizers do).
    df : DataFrame with an ``instruction`` column (str) and a ``ner_entities``
        column (list of entity strings per row).

    Returns
    -------
    The tokenizer output (padded, truncated to 128 tokens, PyTorch tensors)
    with an extra ``'labels'`` entry: a list with one list per row, as long as
    the padded sequence, holding 1 for tokens inside an entity, 0 for other
    real tokens and -100 for padding.

    Notes
    -----
    Entities are located case-insensitively, because the instructions may
    have been lowercased after the entities were extracted. Character spans
    are converted to token positions by counting the tokens of the text that
    precedes each span; no offset mapping is needed, so slow tokenizers work.
    """
    texts = df['instruction'].tolist()
    entity_lists = df['ner_entities'].tolist()
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    id_rows = encodings['input_ids'].tolist()
    mask_rows = encodings['attention_mask'].tolist()

    labels = []
    for text, entities, row_ids, row_mask in zip(texts, entity_lists, id_rows, mask_rows):
        # -100 on padding; real tokens start as 0 (O: Outside).
        # NOTE(review): -100 is assumed to be the ignore index of the training
        # loss (PyTorch cross-entropy default) - confirm for the model used.
        label = [0 if keep else -100 for keep in row_mask]
        shift = _first_content_position(tokenizer, text, row_ids, row_mask)
        if shift is not None:
            for char_start, char_end in _entity_char_spans(text, entities):
                # Tokens before the span / up to its end, counted on the text prefix.
                tok_start = shift + len(tokenizer.tokenize(text[:char_start]))
                tok_end = shift + len(tokenizer.tokenize(text[:char_end]))
                # Clip to the (possibly truncated) sequence length.
                for j in range(tok_start, min(tok_end, len(label))):
                    if row_mask[j]:
                        label[j] = 1  # 1: Inside
        labels.append(label)
    encodings['labels'] = labels
    return encodings


def _entity_char_spans(text, entities):
    """Return (start, end) character spans of every case-insensitive occurrence of each entity in *text*."""
    spans = []
    for entity in entities:
        if not entity:
            # An empty placeholder ({{}}) has no characters to label.
            continue
        for match in re.finditer(re.escape(entity), text, flags=re.IGNORECASE):
            spans.append(match.span())
    return spans


def _first_content_position(tokenizer, text, row_ids, row_mask):
    """Return the index in *row_ids* of the first token of *text*, or None if it is absent.

    This skips any leading padding or special tokens the tokenizer added.
    """
    pieces = tokenizer.tokenize(text)
    if not pieces:
        return None
    first_id = tokenizer.convert_tokens_to_ids(pieces[0])
    for position, (token_id, keep) in enumerate(zip(row_ids, row_mask)):
        if keep and token_id == first_id:
            return position
    return None

# Keep only the rows whose instruction yielded at least one entity
train_df_with_ner = train_df.loc[train_df['ner_entities'].map(len) > 0]
val_df_with_ner = val_df.loc[val_df['ner_entities'].map(len) > 0]
test_df_with_ner = test_df.loc[test_df['ner_entities'].map(len) > 0]

# Build the NER encodings of each split with the GPT-2 tokenizer
# (only GPT-2 is prepared in this cell)
train_encodings_ner_gpt2 = prepare_ner_data(gpt2_tokenizer, train_df_with_ner)
val_encodings_ner_gpt2 = prepare_ner_data(gpt2_tokenizer, val_df_with_ner)
test_encodings_ner_gpt2 = prepare_ner_data(gpt2_tokenizer, test_df_with_ner)


In [11]:
from transformers import GPT2ForTokenClassification, Trainer, TrainingArguments

import torch  # needed in this cell for the Dataset wrapper below


class NerEncodingsDataset(torch.utils.data.Dataset):
    """Expose the output of prepare_ner_data as a per-example dataset.

    prepare_ner_data returns one mapping of whole columns (input_ids,
    attention_mask, labels). Trainer instead indexes its dataset by example
    and asks for its length, so each item here is a dict with one row of
    every column, converted to a tensor.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {key: torch.as_tensor(values[idx]) for key, values in self.encodings.items()}


# Load GPT-2 with a token-classification head
gpt2_ner_model = GPT2ForTokenClassification.from_pretrained('gpt2', num_labels=2)  # 2 labels: O (Outside), I (Inside)

# Define the training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer for NER; the encodings are wrapped so that Trainer
# receives datasets it can index example by example.
trainer_ner_gpt2 = Trainer(
    model=gpt2_ner_model,
    args=training_args,
    train_dataset=NerEncodingsDataset(train_encodings_ner_gpt2),
    eval_dataset=NerEncodingsDataset(val_encodings_ner_gpt2),
)

# Train the model
trainer_ner_gpt2.train()


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Test the model with a new instruction
import torch


new_instruction = "How to check my order number 12323?"

# Tokenize the new instruction after the same normalisation the training
# instructions went through (preprocess_text: lowercase, strip punctuation),
# so the model sees inference text in the same form as its training text.
new_encodings_gpt2 = gpt2_tokenizer(preprocess_text(new_instruction), return_tensors='pt')

# Get predictions
def get_ner_predictions(model, encodings, tokenizer):
    """Return the tokens of the first encoded sequence that the model labels 1 (Inside).

    Parameters
    ----------
    model : token-classification model whose output has a ``logits``
        attribute of shape (batch, sequence, labels).
    encodings : mapping of tokenizer outputs (must contain ``input_ids``)
        holding PyTorch tensors with a leading batch dimension.
    tokenizer : object providing ``convert_ids_to_tokens``.

    Returns
    -------
    list of token strings, in sequence order. Only the first sequence of the
    batch is decoded.
    """
    inputs = dict(encodings.items())

    # Run on whatever device the model lives on (it may have been moved to a GPU).
    first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        inputs = {
            key: value.to(first_param.device) if hasattr(value, 'to') else value
            for key, value in inputs.items()
        }

    # Inference only: switch to eval mode, skip gradient tracking, then
    # restore the mode the caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Index the batch dimension explicitly: squeeze() would also collapse a
    # one-token sequence into a scalar and break the iteration below.
    predictions = torch.argmax(logits, dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
    return [token for token, label in zip(tokens, predictions) if label == 1]

# Run the GPT-2 NER model on the encoded instruction
ner_predictions_gpt2 = get_ner_predictions(gpt2_ner_model, new_encodings_gpt2, gpt2_tokenizer)

# Show the instruction followed by the predicted entity tokens, one per line
print(
    f"Instruction: {new_instruction}",
    f"NER (GPT-2): {ner_predictions_gpt2}",
    sep="\n",
)
