In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:
df = pd.read_csv("large_synthetic_email_sentiment_dataset.csv")
X = df["email"]
y = df[["politeness_formality", "emotional_tone", "clarity_constructiveness"]]

In [25]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Split temp into validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [26]:
import torch
import torch.nn as nn
from transformers import BertModel



In [75]:
class BertMultiOutputRegressor(nn.Module):
    def __init__(self, model_name='bert-base-uncased', num_outputs=3):
        super(BertMultiOutputRegressor, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.3)
        self.regressor = nn.Linear(self.bert.config.hidden_size, num_outputs)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        dropped_out = self.dropout(pooled_output)
        print("Called")
        return self.regressor(dropped_out)


In [1]:
from transformers import BertTokenizer
from torch.utils.data import Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EmailDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=128):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        target = self.targets.iloc[idx].values.astype(float)
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }


In [30]:
train_dataset = EmailDataset(X_train, y_train, tokenizer)
val_dataset = EmailDataset(X_val, y_val, tokenizer)
test_dataset = EmailDataset(X_test, y_test, tokenizer)

In [31]:
from torch.utils.data import DataLoader
import torch.optim as optim

In [None]:
model = BertMultiOutputRegressor()
optimizer = optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()


In [76]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [85]:
losses = []
for epoch in range(1):  # Adjust the number of epochs as needed
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(batch['input_ids'], batch['attention_mask'])
        print("outputs shape ", outputs)
        loss = loss_fn(outputs, batch['targets'])
        print("train loss " , loss.item())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(batch['input_ids'], batch['attention_mask'])
            loss = loss_fn(outputs, batch['targets'])
            print("validdation loss ", loss.item())
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)
    # if (len(losses) > 1 and losses[-1] - avg_val_loss < 0.005):
    #     print("Early stopping triggered.")
    #     break
    losses.append((avg_val_loss))

    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

outputs shape  tensor([[0.7124, 0.4564, 0.7135],
        [0.7715, 0.6694, 0.7322],
        [0.4514, 0.1939, 0.4274],
        [0.8706, 0.8411, 0.8100],
        [0.8809, 0.7377, 0.7407],
        [0.8022, 0.4968, 0.7711],
        [0.8213, 0.4761, 0.7679],
        [0.4811, 0.2148, 0.4349],
        [0.7956, 0.4727, 0.7838],
        [1.0149, 0.9235, 0.8360],
        [0.7201, 0.4946, 0.7463],
        [0.8694, 0.5551, 0.7635],
        [0.6448, 0.3388, 0.5216],
        [0.2877, 0.2686, 0.4997],
        [0.3012, 0.1473, 0.3550],
        [0.4442, 0.0998, 0.2923]], grad_fn=<AddmmBackward0>)
train loss  0.027096854522824287
outputs shape  tensor([[0.5466, 0.2968, 0.4237],
        [0.8334, 0.8366, 0.7042],
        [0.3107, 0.1818, 0.2840],
        [0.8751, 0.7484, 0.9817],
        [0.5655, 0.3866, 0.4032],
        [0.5116, 0.4591, 0.5017],
        [0.8828, 0.4244, 0.7548],
        [0.3815, 0.2625, 0.3957],
        [0.7364, 0.4257, 0.7288],
        [0.5092, 0.3109, 0.4840],
        [0.3031, 0.0959, 0

KeyboardInterrupt: 

In [35]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


In [74]:
model.eval()
predictions = []
actuals = []

with torch.no_grad():
    for batch in DataLoader(test_dataset, batch_size=16):
        outputs = model(batch['input_ids'], batch['attention_mask'])
        # print(outputs)
        # print(batch['targets'])
        predictions.extend(outputs.numpy())
        actuals.extend(batch['targets'].numpy())

In [None]:

predictions = np.array(predictions)
actuals = np.array(actuals)

# Calculate metrics for each output
for i, label in enumerate(['Formality', 'Tone', 'Conciseness']):
    mse = mean_squared_error(actuals[:, i], predictions[:, i])
    mae = mean_absolute_error(actuals[:, i], predictions[:, i])
    r2 = r2_score(actuals[:, i], predictions[:, i])
    print(f"{label} - MSE: {mse}, MAE: {mae}, R2: {r2}")

Formality - MSE: 0.021171515807509422, MAE: 0.1065855324268341, R2: 0.7062410116195679
Tone - MSE: 0.018879776820540428, MAE: 0.10134880989789963, R2: 0.7875068783760071
Conciseness - MSE: 0.019477585330605507, MAE: 0.11309678107500076, R2: 0.7169640064239502


In [86]:
email_text = "This is getting frustrating. Please send them ASAP."
encoding = tokenizer.encode_plus(
    email_text,
    add_special_tokens=True,
    max_length=128,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

with torch.no_grad():
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    output = model(input_ids, attention_mask)
    prediction = output.numpy().flatten()


labels = ["formality", "tone", "conciseness"]
for label, score in zip(labels, prediction):
    print(f"{label}: {score:.3f}")

formality: 0.548
tone: 0.397
conciseness: 0.646


In [53]:
import joblib

In [57]:
import torch, json, os

MODEL_DIR = "sentiment_model"
os.makedirs(MODEL_DIR, exist_ok=True)

# --- after training finishes ---
torch.save(model.state_dict(), f"{MODEL_DIR}/model_weights.bin")

# optional but handy for auto‑reloading
with open(f"{MODEL_DIR}/model_config.json", "w") as f:
    json.dump(
        {
            "base_model": "bert-base-uncased",
            "num_outputs": 3,
            "dropout": 0.3
        },
        f,
    )
tokenizer.save_pretrained(MODEL_DIR)



('sentiment_model\\tokenizer_config.json',
 'sentiment_model\\special_tokens_map.json',
 'sentiment_model\\vocab.txt',
 'sentiment_model\\added_tokens.json')

In [5]:
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
classifier = pipeline(
    "zero-shot-classification"  # or "typeform/distilbert-base-uncased-mnli"
)
 
# 2. Define your intent labels
candidate_labels = ["request", "inform", "follow-up"]
 
# 3. Classify a new email
email_text = "After our last meeting, I wanted to follow up on the action items we discussed."
result = classifier(email_text, candidate_labels)
print(result)


No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


{'sequence': 'After our last meeting, I wanted to follow up on the action items we discussed.', 'labels': ['follow-up', 'request', 'inform'], 'scores': [0.9452621340751648, 0.03445684537291527, 0.02028099074959755]}


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('intent_classification_dataset.csv')
label_map = {'follow-up': 0, 'request': 1, 'inform': 2}
df['label'] = df['label'].map(label_map)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2
)

In [5]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")


In [7]:
import torch

class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

In [10]:
%pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.


In [9]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0119,0.007064
2,0.0038,0.002727
3,0.0031,0.00222


TrainOutput(global_step=180, training_loss=0.11519228724969757, metrics={'train_runtime': 71.0246, 'train_samples_per_second': 20.275, 'train_steps_per_second': 2.534, 'total_flos': 5961139246080.0, 'train_loss': 0.11519228724969757, 'epoch': 3.0})

In [13]:
def classify_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    pred = torch.argmax(probs, dim=1).item()
    reverse_label_map = {0: 'follow-up', 1: 'request', 2: 'inform'}
    return reverse_label_map[pred], probs[0][pred].item()


output = classify_intent("I wanted to follow up on the action items we discussed.")  
print(output)  # ('follow-up', 0.85)

output = classify_intent("Please send me the report.")
print(output)  # ('request', 0.92)

output = classify_intent("The meeting is scheduled for next week.")
print(output)  # ('inform', 0.88)

('follow-up', 0.9974984526634216)
('request', 0.9977546334266663)
('inform', 0.9976154565811157)


In [15]:
model.save_pretrained("intent_classification_model")
tokenizer.save_pretrained("intent_classification_model")

('intent_classification_model\\tokenizer_config.json',
 'intent_classification_model\\special_tokens_map.json',
 'intent_classification_model\\vocab.txt',
 'intent_classification_model\\added_tokens.json')