# Encode


In [None]:
import pandas as pd
from transformers import AutoTokenizer

def encode_labels(df, entity_types=None):
    """
    Encode category, intent, and preprocessed BIO tags into numerical format.

    The DataFrame is modified in place: the columns ``intent_encoded``,
    ``category_encoded`` and ``tags_encoded_distilbert`` are added (or
    overwritten), and the same object is returned alongside the lookup tables.

    Args:
        df: DataFrame with ``category``, ``intent`` and ``bio_tags_distilbert``
            columns. Each ``bio_tags_distilbert`` cell must be an iterable of
            BIO tag strings.
        entity_types: Optional iterable of entity type names used to build the
            tag vocabulary. When omitted, the built-in list of nine entity
            types is used. Duplicates are dropped, keeping first occurrence.

    Returns:
        A 7-tuple ``(df, category_map, intent_map, tag_map,
        reverse_category_map, reverse_intent_map, reverse_tag_map)``. The
        forward maps go from label to integer id, the reverse maps from
        integer id back to label.
    """
    if entity_types is None:
        entity_types = (
            'account_category', 'account_type', 'currency_symbol', 'delivery_city',
            'delivery_country', 'invoice_number', 'order_number', 'person_name', 'refund_amount',
        )
    # De-duplicate while preserving order; otherwise len(tag_map) below could
    # collide with an id already handed out to an entity tag.
    entity_types = list(dict.fromkeys(entity_types))

    # Ids follow the order in which each label first appears in the column.
    category_map = {cat: idx for idx, cat in enumerate(df['category'].unique())}
    intent_map = {intent: idx for idx, intent in enumerate(df['intent'].unique())}
    reverse_category_map = {v: k for k, v in category_map.items()}
    reverse_intent_map = {v: k for k, v in intent_map.items()}

    # Encode intent and category using the mappings
    df['intent_encoded'] = df['intent'].map(intent_map)
    df['category_encoded'] = df['category'].map(category_map)

    # Each entity gets two consecutive ids: even for B-, odd for I-.
    tag_map = {}
    for idx, entity in enumerate(entity_types):
        tag_map[f"B-{entity}"] = idx * 2
        tag_map[f"I-{entity}"] = idx * 2 + 1
    tag_map['O'] = len(tag_map)  # 'O' (outside any entity) takes the last id
    reverse_tag_map = {v: k for k, v in tag_map.items()}

    outside_id = tag_map['O']

    def encode_bio_tags(tags):
        # Tags missing from the vocabulary deliberately fall back to 'O'.
        return [tag_map.get(tag, outside_id) for tag in tags]

    df['tags_encoded_distilbert'] = df['bio_tags_distilbert'].apply(encode_bio_tags)

    return df, category_map, intent_map, tag_map, reverse_category_map, reverse_intent_map, reverse_tag_map

# Run label encoding on the dataset and keep the encoded frame plus all
# forward and reverse lookup tables.
(
    df_processed,
    category_map,
    intent_map,
    tag_map,
    reverse_category_map,
    reverse_intent_map,
    reverse_tag_map,
) = encode_labels(df)


The dataset split should be the same one used for the 2 models and gradient.


In [None]:
from sklearn.model_selection import train_test_split

# Encode the labels and collect the forward/reverse lookup tables.
(
    df,
    category_map,
    intent_map,
    tag_map,
    reverse_category_map,
    reverse_intent_map,
    reverse_tag_map,
) = encode_labels(df)

# Hold out 30% of the rows, then halve the hold-out into validation and test
# (roughly 70 / 15 / 15). The fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# NOTE: `tokenizer` is not created here (the initialisation below is commented
# out); it must already be defined before this cell runs.
# # Initialize DistilBERT tokenizer
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap each split in a dataset, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_df, tokenizer, max_length=128)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are shuffled and the final incomplete batch is dropped;
# the evaluation loaders keep row order and every sample.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=16, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)
