In [1]:
!pip install faker






[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: C:\Users\reaga\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# Data Preprocessing


## Load Dataset


In [2]:
import pandas as pd

# Read the raw dataset; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer='../dataset/dataset.csv')

# Show the first five rows as a quick sanity check of the columns
print(df.head(n=5))



   flags                                        instruction category  \
0      B   question about cancelling order {{Order Number}}    ORDER   
1    BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2   BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3     BL         I need to cancel purchase {{Order Number}}    ORDER   
4  BCELN  I cannot afford this order, cancel purchase {{...    ORDER   

         intent                                           response  
0  cancel_order  I've understood you have a question regarding ...  
1  cancel_order  I've been informed that you have a question ab...  
2  cancel_order  I can sense that you're seeking assistance wit...  
3  cancel_order  I understood that you need assistance with can...  
4  cancel_order  I'm sensitive to the fact that you're facing f...  


## Text Cleaning


In [3]:
# prompt: lowercase all column names

# prompt: Remove special characters. Include the library imports. Don't implement this.

import re

def remove_special_characters(text):
    """Strip special characters from ``text`` while keeping doubled braces.

    Removed characters are:
      * a ``{`` that has no ``{`` immediately before or after it,
      * a ``}`` that has no ``}`` immediately before or after it,
      * anything that is not a word character, whitespace, or a brace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    unwanted = re.compile(r'(?<!{){(?!{)|(?<!})}(?!})|[^\w\s{}]')
    return unwanted.sub('', text)

# Normalize column names first, so the lookup below also matches headers such as 'Instruction'
df.columns = df.columns.str.lower()

# Strip special characters from the instruction text ({{...}} placeholder markers are kept)
if 'instruction' in df.columns:
    df['instruction'] = df['instruction'].apply(remove_special_characters)
else:
    print("Column 'instruction' not found in the DataFrame.")

print(df.head())


   flags                                        instruction category  \
0      B   question about cancelling order {{Order Number}}    ORDER   
1    BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2   BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3     BL         I need to cancel purchase {{Order Number}}    ORDER   
4  BCELN  I cannot afford this order cancel purchase {{O...    ORDER   

         intent                                           response  
0  cancel_order  I've understood you have a question regarding ...  
1  cancel_order  I've been informed that you have a question ab...  
2  cancel_order  I can sense that you're seeking assistance wit...  
3  cancel_order  I understood that you need assistance with can...  
4  cancel_order  I'm sensitive to the fact that you're facing f...  


## Named Entity Recognition (NER)


In [4]:
import pandas as pd
import re
from collections import Counter


# Mark each row whose instruction contains at least one {{...}} placeholder
df['has_placeholder'] = df['instruction'].str.contains(pat=r'\{\{.*?\}\}', regex=True)

# Summing the boolean flags gives the number of rows that contain a placeholder
rows_with_placeholders = df['has_placeholder'].sum()

# Extract placeholders function
def extract_placeholders(text):
    """Return the names found between ``{{`` and ``}}`` in ``text``, in order of appearance."""
    return [match.group(1) for match in re.finditer(r'\{\{(.*?)\}\}', text)]

# Collect every placeholder name from the flagged rows, one name per element
flagged_instructions = df.loc[df['has_placeholder'], 'instruction']
placeholders = flagged_instructions.apply(extract_placeholders).explode()

# Tally how often each placeholder name occurs
placeholder_counts = Counter(placeholders)

# One row per placeholder name, with its tally in a 'count' column
placeholder_df = pd.DataFrame.from_dict(placeholder_counts, orient='index', columns=['count'])

# Report the initial distribution, most frequent first
print(f"Rows with placeholders: {rows_with_placeholders}")
print("\nInitial Distribution of Placeholders:")
print(placeholder_df.sort_values(by='count', ascending=False))

# Report the dataset size and the number of missing values in each column
print("\nTotal rows in the dataset:", len(df))
print("\nMissing values per column:")
print(df.isnull().sum())

# Drop from the summary table every placeholder tied for the smallest tally
if not placeholder_df.empty:
    min_count = placeholder_df['count'].min()
    placeholder_df = placeholder_df.loc[placeholder_df['count'] > min_count]

# Report the distribution again after the filter
print("\nUpdated Distribution of Placeholders (After Removing Lowest Occurrences):")
print(placeholder_df.sort_values(by='count', ascending=False))


Rows with placeholders: 6670

Initial Distribution of Placeholders:
                  count
Order Number       2907
Account Type       1011
Person Name         887
Account Category    822
Refund Amount       624
Currency Symbol     372
Delivery City       234
Delivery Country    177
Invoice Number        8

Total rows in the dataset: 26872

Missing values per column:
flags              0
instruction        0
category           0
intent             0
response           0
has_placeholder    0
dtype: int64

Updated Distribution of Placeholders (After Removing Lowest Occurrences):
                  count
Order Number       2907
Account Type       1011
Person Name         887
Account Category    822
Refund Amount       624
Currency Symbol     372
Delivery City       234
Delivery Country    177


In [5]:
import re
import pandas as pd
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed Faker's shared random generator so the synthetic values are the same on every run
# (the value 42 is arbitrary; any fixed integer works)
Faker.seed(42)

# Define synthetic data generators.
# Keys are lower-cased placeholder names; each value is a zero-argument callable
# that returns the replacement string for that placeholder.
data_generators = {
    'order number': lambda: fake.unique.bothify(text='ORD-#####'),
    'account type': lambda: fake.random_element(elements=('Business', 'Personal')),
    'person name': lambda: fake.name(),
    'account category': lambda: fake.random_element(elements=('Category A', 'Category B', 'Category C')),
    'refund amount': lambda: f"${fake.random_number(digits=3)}.00",
    'currency symbol': lambda: fake.currency_symbol(),
    'delivery city': lambda: fake.city(),
    'delivery country': lambda: fake.country(),
    'invoice number': lambda: fake.unique.bothify(text='INV###')
}

# Function to replace placeholders and track NER labels
def replace_placeholders(text, data_generators):
    """Swap known ``{{...}}`` placeholders in ``text`` for generated values.

    Args:
        text: Instruction string. A null value (per ``pd.isnull``) is returned unchanged.
        data_generators: Mapping from lower-cased placeholder name to a zero-argument
            callable that produces the replacement string.

    Returns:
        A ``(new_text, ner_labels)`` pair. ``ner_labels`` is a list of
        ``{'text': ..., 'label': ...}`` dicts, one per replaced placeholder, in the
        order the placeholders appear. Placeholders without a generator stay in the text.
    """
    if pd.isnull(text):
        return text, []

    entities = []
    for raw_name in re.findall(r'\{\{(.*?)\}\}', text):
        lookup_key = raw_name.strip().lower()
        if lookup_key not in data_generators:
            continue  # unknown placeholder: leave it in the text

        generated = data_generators[lookup_key]()
        # Replace only the first remaining occurrence, so a repeated placeholder
        # receives a separately generated value on each iteration.
        text = text.replace('{{' + raw_name + '}}', generated, 1)
        entities.append({'text': generated, 'label': lookup_key.replace(" ", "_")})

    return text, entities

# Replace placeholders row by row. Each call returns a (text, ner_labels) tuple; the tuples
# are unpacked into the two columns with a single DataFrame construction rather than by
# building one pd.Series per row, which is far slower on a frame of this size.
replaced = df['instruction'].apply(lambda text: replace_placeholders(text, data_generators))
df[['instruction', 'ner_labels']] = pd.DataFrame(replaced.tolist(), index=df.index)

# Ensure all rows are retained: rows not flagged as having a placeholder get an empty label list
df['ner_labels'] = df.apply(lambda row: row['ner_labels'] if row['has_placeholder'] else [], axis=1)

print(df.head())


   flags                                        instruction category  \
0      B          question about cancelling order ORD-01367    ORDER   
1    BQZ  i have a question about cancelling oorder ORD-...    ORDER   
2   BLQZ           i need help cancelling puchase ORD-20957    ORDER   
3     BL                I need to cancel purchase ORD-33406    ORDER   
4  BCELN  I cannot afford this order cancel purchase ORD...    ORDER   

         intent                                           response  \
0  cancel_order  I've understood you have a question regarding ...   
1  cancel_order  I've been informed that you have a question ab...   
2  cancel_order  I can sense that you're seeking assistance wit...   
3  cancel_order  I understood that you need assistance with can...   
4  cancel_order  I'm sensitive to the fact that you're facing f...   

   has_placeholder                                        ner_labels  
0             True  [{'text': 'ORD-01367', 'label': 'order_number'}]  
1   

## Function to Create BIO Tags


In [9]:
import re

# Function to generate BIO tags
def generate_bio_tags(instruction, ner_labels):
    """Tokenize ``instruction`` and assign a BIO tag to every token.

    Each entity's text is tokenized with the same regex as the instruction and located
    with a sliding window over the instruction tokens. Only windows whose tokens are all
    still tagged ``'O'`` are accepted, so a repeated entity text tags successive
    occurrences and one entity never overwrites the tags of another.

    Args:
        instruction: The text to tokenize.
        ner_labels: Iterable of dicts with a ``'text'`` key (the entity's surface form)
            and a ``'label'`` key (the entity type used in the tag).

    Returns:
        A ``(tokens, bio_tags)`` pair of equal-length lists. Tags are ``'O'``,
        ``'B-<label>'`` for the first token of an entity, or ``'I-<label>'`` for the rest.
        An entity that cannot be located is reported with ``print`` and left untagged.
    """
    # Words become one token each; every other non-space character is its own token
    tokens = re.findall(r'\w+|[^\w\s]', instruction)

    # Everything starts outside any entity
    bio_tags = ['O'] * len(tokens)

    for entity in ner_labels:
        entity_text = entity['text']
        entity_label = entity['label']

        # Tokenize the entity the same way so the token sequences are comparable
        entity_tokens = re.findall(r'\w+|[^\w\s]', entity_text)
        if not entity_tokens:
            continue  # nothing to tag for an entity with no tokens

        width = len(entity_tokens)
        for start in range(len(tokens) - width + 1):
            window = slice(start, start + width)
            if tokens[window] != entity_tokens:
                continue
            if any(tag != 'O' for tag in bio_tags[window]):
                continue  # span already claimed by an earlier entity; keep searching

            bio_tags[start] = f"B-{entity_label}"  # Beginning of the entity
            for offset in range(1, width):
                bio_tags[start + offset] = f"I-{entity_label}"  # Inside the entity
            break  # Stop after tagging the first free match
        else:
            # The loop finished without a match: the entity is not present as a token sequence
            print(f"Error processing entity '{entity_text}' in instruction: {instruction}")

    return tokens, bio_tags

# Run the BIO tagger on every row; each result is a (tokens, tags) tuple
bio_pairs = df.apply(lambda row: generate_bio_tags(row['instruction'], row['ner_labels']), axis=1)

# Unpack the tuples straight into the two result columns (no temporary column needed)
df[['tokens', 'tags']] = pd.DataFrame(bio_pairs.tolist(), index=df.index)

# Show the columns relevant to the NER labelling
print(df[['instruction', 'ner_labels', 'tokens', 'tags']])

# Save the labelled dataset without the index column
df.to_csv('output.csv', index=False)



                                             instruction  \
0              question about cancelling order ORD-01367   
1      i have a question about cancelling oorder ORD-...   
2               i need help cancelling puchase ORD-20957   
3                    I need to cancel purchase ORD-33406   
4      I cannot afford this order cancel purchase ORD...   
...                                                  ...   
26867       I am waiting for a rebate of $228.00 dollars   
26868  how to see if there is anything wrong with my ...   
26869       Im waiting for a reimbjrsement of дин$426.00   
26870  I dont know what to do to see my reimbursement...   
26871  I need to know if there is anything new on the...   

                                              ner_labels  \
0       [{'text': 'ORD-01367', 'label': 'order_number'}]   
1       [{'text': 'ORD-44043', 'label': 'order_number'}]   
2       [{'text': 'ORD-20957', 'label': 'order_number'}]   
3       [{'text': 'ORD-33406', 'label':