In [1]:
import pandas as pd 
#loading the json file into pandas data frame
df = pd.read_json('training_data.json')
# display thre first 5 rows of data 
df.head()

Unnamed: 0,email_text,intent,urgency
0,Subject: Inquiry about Starlight Platform Pric...,Sales Inquiry,Medium
1,Subject: URGENT: API Gateway is Down!\n\nHello...,Technical Support,High
2,Subject: Request for a Demo\n\nHi OrbitAI Sale...,Sales Inquiry,Medium
3,Subject: Quick Question: Project Nova Sync\n\n...,Meeting Request,Medium
4,Subject: Company Update: Welcome Our New Hires...,Internal Update,Low


In [2]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd

# Load the data again just in case
df = pd.read_json('training_data.json')

# 1. Create a single target label
df['label_str'] = df['intent'] + '_' + df['urgency']
labels = df['label_str'].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}
df['label'] = df['label_str'].map(label2id)

# 2. Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# 3. Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

print("Data has been split and converted.")
print(f"Labels: {labels}")

Data has been split and converted.
Labels: ['Sales Inquiry_Medium', 'Technical Support_High', 'Meeting Request_Medium', 'Internal Update_Low', 'Technical Support_Medium', 'Meeting Request_High']


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for DistilBERT
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Create a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples['email_text'], padding='max_length', truncation=True)

# Apply the tokenization to our datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

print("Tokenization complete.")

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenization complete.


In [4]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# 1. Load the pre-trained DistilBERT model
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# 2. Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save the results
    num_train_epochs=3,              # We'll train for 3 full passes over the data
    per_device_train_batch_size=8,   # Process 8 examples at a time during training
    per_device_eval_batch_size=8,    # Process 8 examples at a time during evaluation
    warmup_steps=500,                # Number of steps to warm up the learning rate
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
)

# 3. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# 4. Start the training
trainer.train()

print("Training complete.")

# 5. Save the final model
trainer.save_model('./models/triage_classifier')
tokenizer.save_pretrained('./models/triage_classifier')

print("Model saved to './models/triage_classifier'")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.7602
20,1.7641
30,1.7655


Training complete.
Model saved to './models/triage_classifier'


In [None]:
%pip install accelerate>=0.26.0