In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled corpus and encode the string labels as integer class ids.
df = pd.read_csv('audience_classification_dataset.csv', encoding='utf-8')
label_map = {'professional': 0, 'personal': 1, 'general': 2}

# Series.map leaves NaN for every value that is not a key of label_map
# (typos, different casing, missing cells). Fail loudly here instead of
# letting NaN labels flow into training.
encoded_labels = df['label'].map(label_map)
unmapped_labels = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected label values in dataset: {unmapped_labels!r}; "
        f"expected one of {sorted(label_map)}"
    )
df['label'] = encoded_labels.astype(int)

# Hold out 20% for validation. A fixed seed makes the split reproducible and
# stratifying keeps the class proportions the same in both splits.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)



In [9]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are encoded with identical settings: truncation and padding
# enabled, results returned as PyTorch tensors.
tokenize_kwargs = {"truncation": True, "padding": True, "return_tensors": "pt"}
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

import torch

class EmailDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping that provides ``'input_ids'`` and
            ``'attention_mask'`` entries, each indexable by sample position.
        labels: Sequence of class ids, one per sample.

    Indexing returns a dict with keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``; the label is wrapped in a ``torch.long`` tensor.
    """

    # Encoding entries copied into every sample, in output order.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {key: self.encodings[key][idx] for key in self._FEATURE_KEYS}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
    
# Wrap each split's encodings together with its labels so the Trainer can
# index samples directly.
train_dataset, val_dataset = (
    EmailDataset(train_encodings, train_labels),
    EmailDataset(val_encodings, val_labels),
)

In [4]:
%pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
# Derive the classification head size and the human-readable label names from
# label_map so the model config (and anything saved or published from it)
# reports 'professional' / 'personal' / 'general' rather than generic LABEL_n.
id2label = {class_id: name for name, class_id in label_map.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

# Evaluate and checkpoint once per epoch; log the training loss every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0121,0.007299
2,0.004,0.002858
3,0.0032,0.00232


TrainOutput(global_step=180, training_loss=0.10972993486664362, metrics={'train_runtime': 147.755, 'train_samples_per_second': 9.746, 'train_steps_per_second': 1.218, 'total_flos': 5961139246080.0, 'train_loss': 0.10972993486664362, 'epoch': 3.0})

In [15]:
def classify_intent(text):
    """Classify a single message with the fine-tuned model.

    Args:
        text: The message to classify.

    Returns:
        A ``(label, probability)`` tuple: the predicted class name
        ('professional', 'personal' or 'general') and the softmax
        probability the model assigns to that class.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inputs must live on the same device as the model weights, otherwise the
    # forward pass fails when the model has been moved to an accelerator.
    inputs = inputs.to(model.device)

    # Run in eval mode (dropout disabled) and without autograd bookkeeping so
    # predictions are deterministic; restore the previous mode afterwards so
    # the function does not change the model's state for later training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    probs = torch.nn.functional.softmax(logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    reverse_label_map = {0: 'professional', 1: 'personal', 2: 'general'}
    return reverse_label_map[pred], probs[0][pred].item()


# Smoke-test the classifier on a few hand-written sentences, printing one
# (label, probability) tuple per sentence. Each trailing comment gives the
# label the sentence was originally annotated with, followed by the label
# printed in the recorded run of this cell.
sample_texts = [
    "I wanted to follow up on the action items we discussed.",  # annotated 'professional'; recorded 'professional'
    "Please send me the report.",  # annotated 'personal'; recorded 'professional'
    "The meeting is scheduled for next week.",  # annotated 'general'; recorded 'professional'
    "I really enjoyed our conversation.",  # annotated 'personal'; recorded 'general'
    "I love you",  # annotated 'personal'; recorded 'personal'
]
for sample_text in sample_texts:
    output = classify_intent(sample_text)
    print(output)

('professional', 0.9951685667037964)
('professional', 0.9833005666732788)
('professional', 0.9508194327354431)
('general', 0.994867205619812)
('personal', 0.9955427050590515)


In [16]:
# Write the fine-tuned model and its tokenizer into the same local directory.
export_dir = "audience_classifier_model"
model.save_pretrained(export_dir)
# Last expression of the cell: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(export_dir)


('audience_classifier_model\\tokenizer_config.json',
 'audience_classifier_model\\special_tokens_map.json',
 'audience_classifier_model\\vocab.txt',
 'audience_classifier_model\\added_tokens.json')

In [17]:
# Reload the model and tokenizer by name (presumably resolving to the local
# export directory of the same name -- confirm), rebinding `model` and
# `tokenizer`, then publish both to a Hub repository with that name.
hub_name = "audience_classifier_model"
model = DistilBertForSequenceClassification.from_pretrained(hub_name)
tokenizer = DistilBertTokenizer.from_pretrained(hub_name)
# NOTE(review): safe_serialization=False uploads the pickle-based
# pytorch_model.bin (as the recorded upload log shows); consider the
# safetensors format unless a consumer specifically needs the .bin file.
model.push_to_hub(hub_name, safe_serialization=False)
tokenizer.push_to_hub(hub_name)


pytorch_model.bin: 100%|██████████| 268M/268M [00:33<00:00, 7.97MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/parvk11/audience_classifier_model/commit/210023808352e2c7a1ef73025ca6d96b89f20fbe', commit_message='Upload tokenizer', commit_description='', oid='210023808352e2c7a1ef73025ca6d96b89f20fbe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/parvk11/audience_classifier_model', endpoint='https://huggingface.co', repo_type='model', repo_id='parvk11/audience_classifier_model'), pr_revision=None, pr_num=None)