In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Names of the five numeric score columns that serve as regression targets.
score_columns = ["positive", "negative", "neutral", "confident", "professional"]

# Read the full dataset, then separate the raw text (model input) from the
# score columns (model targets).
df = pd.read_csv("email_sentiment_full_dataset.csv")
X = df.loc[:, "email_text"]
y = df.loc[:, score_columns]

In [6]:
# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
train_and_holdout = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_temp, y_train, y_temp = train_and_holdout

# Stage 2: split the held-out rows in half to obtain the validation and test
# sets (roughly 15% of the full data each).
val_and_test = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)
X_val, X_test, y_val, y_test = val_and_test

In [7]:
import torch
import torch.nn as nn
from transformers import BertModel



In [8]:
class BertMultiOutputRegressor(nn.Module):
    """BERT encoder topped with dropout and a single linear regression head.

    The head maps the encoder's pooled output to ``num_outputs`` real values,
    so one forward pass predicts every score for an input at once.
    """

    def __init__(self, model_name='bert-base-uncased', num_outputs=5):
        """Load the pretrained encoder and build the regression head.

        Args:
            model_name: Checkpoint identifier handed to
                ``BertModel.from_pretrained``.
            num_outputs: Number of values predicted per input.
        """
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(p=0.3)
        # The head's input width follows the encoder's configured hidden size.
        self.regressor = nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_outputs,
        )

    def forward(self, input_ids, attention_mask):
        """Return the regression head's output for a batch of token ids.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The linear head applied to the dropout-regularised pooled output.
        """
        pooled = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output
        return self.regressor(self.dropout(pooled))


In [1]:
from transformers import BertTokenizer
from torch.utils.data import Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer loaded under the same 'bert-base-uncased' checkpoint name that
# BertMultiOutputRegressor uses as its default model_name, so token ids and
# encoder come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class EmailDataset(Dataset):
    """Map-style dataset pairing each email text with its score vector.

    Every item is tokenized on access and padded/truncated to ``max_len``
    tokens, so all examples in a batch share the same sequence length.

    Args:
        texts: pandas Series of email texts, accessed positionally (``.iloc``).
        targets: pandas DataFrame of numeric scores; row ``i`` (positionally)
            belongs to ``texts.iloc[i]``.
        tokenizer: Hugging Face tokenizer (or any callable with the same
            keyword interface) that returns ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Fixed token length of every encoded example.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len=128):
        # A mismatch would otherwise mis-pair rows silently or fail late,
        # deep inside a DataLoader iteration.
        if len(texts) != len(targets):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(texts)} and {len(targets)}"
            )
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors plus a
            float32 ``targets`` tensor holding the row's scores.
        """
        # str() guards against non-string cells (e.g. NaN read from the CSV).
        text = str(self.texts.iloc[idx])
        target = self.targets.iloc[idx].values.astype(float)
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; flatten it
            # away so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }


In [9]:
# Wrap each (texts, scores) split in an EmailDataset that shares the one
# tokenizer; the unpacking order is train, validation, test.
train_dataset, val_dataset, test_dataset = (
    EmailDataset(split_texts, split_scores, tokenizer)
    for split_texts, split_scores in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [10]:
from torch.utils.data import DataLoader
import torch.optim as optim

In [11]:
# Seed PyTorch's global RNG before any weights are created. Without it the
# regression head's initial weights, the dropout masks and the shuffled batch
# order all change from run to run, so the reported losses are not
# reproducible even though the data split itself is seeded. 42 matches the
# random_state used for train_test_split. (Reproducibility assumes the cells
# are then executed in order from a fresh kernel.)
torch.manual_seed(42)

# BERT encoder + linear head predicting the five scores jointly.
model = BertMultiOutputRegressor()
# Small learning rate, applied to every parameter (encoder and head alike).
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# Mean squared error over all outputs of all examples in a batch.
loss_fn = nn.MSELoss()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [12]:
# Training batches are reshuffled on every pass; validation batches keep the
# dataset order (shuffle=False is the DataLoader default, spelled out here).
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)

In [13]:
# Batch counts are fixed for the whole run, so look them up once.
n_train_batches = len(train_loader)
n_val_batches = len(val_loader)

for epoch in range(5):  # Adjust the number of epochs as needed
    # ---- optimisation pass over the training batches ----
    running_train = 0
    model.train()
    for train_batch in train_loader:
        optimizer.zero_grad()
        predicted = model(
            input_ids=train_batch['input_ids'],
            attention_mask=train_batch['attention_mask'],
        )
        step_loss = loss_fn(predicted, train_batch['targets'])
        step_loss.backward()
        optimizer.step()
        running_train += step_loss.item()
    # Mean of the per-batch mean losses.
    avg_train_loss = running_train / n_train_batches

    # ---- evaluation pass over the validation batches (no gradients) ----
    running_val = 0
    model.eval()
    with torch.no_grad():
        for val_batch in val_loader:
            predicted = model(
                input_ids=val_batch['input_ids'],
                attention_mask=val_batch['attention_mask'],
            )
            running_val += loss_fn(predicted, val_batch['targets']).item()
    avg_val_loss = running_val / n_val_batches

    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

Epoch 1, Training Loss: 0.6444055835405985, Validation Loss: 0.2957265079021454
Epoch 2, Training Loss: 0.291330228249232, Validation Loss: 0.18766649067401886
Epoch 3, Training Loss: 0.18963023771842322, Validation Loss: 0.15736019611358643
Epoch 4, Training Loss: 0.2612614681323369, Validation Loss: 0.14656411111354828
Epoch 5, Training Loss: 0.17601716021696726, Validation Loss: 0.09088512510061264


In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


In [15]:
# Switch off dropout before scoring the held-out test split.
model.eval()
predictions, actuals = [], []
test_loader = DataLoader(test_dataset, batch_size=16)

with torch.no_grad():
    for batch in test_loader:
        scores = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )
        # Collect one NumPy row per example so both lists stay aligned.
        predictions += list(scores.numpy())
        actuals += list(batch['targets'].numpy())

In [None]:

# Stack the per-example rows into 2-D arrays with one column per score.
predictions = np.asarray(predictions)
actuals = np.asarray(actuals)

# Calculate metrics for each output
label_names = ['positive', 'negative', 'neutral', 'confident', 'professional']
for column, label in enumerate(label_names):
    y_true = actuals[:, column]
    y_pred = predictions[:, column]
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{label} - MSE: {mse}, MAE: {mae}, R2: {r2}")