In [None]:
import re
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

In [2]:
def simple_text_clean(x):
    # first we lowercase everything
    x = x.lower()
    # remove unicode characters
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'http*\S+', ' ', x)
    # then use regex to remove @ symbols and hashtags
    #x = re.sub(r'@\S', '', x)
    #x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    #x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    x = re.sub(r'\s[^\w\s]\s', '', x)

    # Tokenize the text
    words = nltk.word_tokenize(x)

    # Remove stop words
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]

    # Join the filtered words back into a sentence
    filtered_text = ' '.join(filtered_words)

    return filtered_text

In [3]:
# Read the train, val, and test data
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
test_data = pd.read_csv('test_data.csv')

In [4]:
train_data

Unnamed: 0,text,label
0,yes sent by: jeff dasovich make sense for me t...,3
1,we need another request for confidentiality. ...,0
2,richard shapiro \tenron energy services i part...,1
3,vance meyer this is the demonstration procedur...,5
4,minor changes shown on the attached (gotta get...,5
...,...,...
1325,Got a voicemail from Rob Walls. They are (have...,3
1326,jeff today's la times editorial revived the l...,0
1327,"me for response"" and have jeff b sign. thanks...",5
1328,california state university system fyi.... \te...,0


In [5]:
# Further clean the texts
train_data['cleaned_text'] = train_data['text'].apply(simple_text_clean)
val_data['cleaned_text'] = val_data['text'].apply(simple_text_clean)
test_data['cleaned_text'] = test_data['text'].apply(simple_text_clean)

In [6]:
# Initialize a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and preprocess the text data. Max length of a email text is set to 256.
def tokenize_text(text, max_length=256):
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=True)
    
    # Pad or truncate the tokens to the specified max_length
    if len(tokens) < max_length:
        # Pad with [PAD] tokens
        tokens += [tokenizer.pad_token_id] * (max_length - len(tokens))
    elif len(tokens) > max_length:
        # Truncate to max_length
        tokens = tokens[:max_length]

    return tokens

train_data['input_ids'] = train_data['cleaned_text'].apply(tokenize_text)
val_data['input_ids'] = val_data['cleaned_text'].apply(tokenize_text)
test_data['input_ids'] = test_data['cleaned_text'].apply(tokenize_text)

In [7]:
# Convert data to PyTorch tensors
train_inputs = torch.tensor(train_data['input_ids'].tolist())
val_inputs = torch.tensor(val_data['input_ids'].tolist())
test_inputs = torch.tensor(test_data['input_ids'].tolist())

train_labels = torch.tensor(train_data['label'].tolist())
val_labels = torch.tensor(val_data['label'].tolist())
test_labels = torch.tensor(test_data['label'].tolist())

# Create data loaders
train_dataset = TensorDataset(train_inputs, train_labels)
val_dataset = TensorDataset(val_inputs, val_labels)
test_dataset = TensorDataset(test_inputs, test_labels)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize and fine-tune a BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
optimizer = AdamW(model.parameters(), lr=2e-5)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = 'mps'
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0.0
    val_predictions = []
    val_targets = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Validation"):
            batch = tuple(t.to(device) for t in batch)
            inputs, labels = batch
            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            val_predictions.extend(predicted_labels.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())
    
    print(f"Epoch {epoch+1}: Train Loss: {train_loss/len(train_dataloader)}, Val Loss: {val_loss/len(val_dataloader)}")


Epoch 1: 100%|██████████| 42/42 [01:45<00:00,  2.52s/it]
Validation: 100%|██████████| 6/6 [00:04<00:00,  1.45it/s]


Epoch 1: Train Loss: 1.3312912668500627, Val Loss: 1.175315002600352


Epoch 2: 100%|██████████| 42/42 [01:47<00:00,  2.57s/it]
Validation: 100%|██████████| 6/6 [00:03<00:00,  1.61it/s]

Epoch 2: Train Loss: 1.1567137014298212, Val Loss: 1.120912899573644





In [9]:
# Evaluate on the test set
model.eval()
test_predictions = []
test_targets = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch
        outputs = model(inputs)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        test_predictions.extend(predicted_labels.cpu().numpy())
        test_targets.extend(labels.cpu().numpy())

# Convert predictions and targets to NumPy arrays
val_predictions = np.array(val_predictions)
val_targets = np.array(val_targets)
test_predictions = np.array(test_predictions)
test_targets = np.array(test_targets)

# Print classification report and confusion matrix for validation set
print("Validation Set:")
print(classification_report(val_targets, val_predictions))
print("Confusion Matrix:")
print(confusion_matrix(val_targets, val_predictions))

# Print classification report and confusion matrix for test set
print("\nTest Set:")
print(classification_report(test_targets, test_predictions))
print("Confusion Matrix:")
print(confusion_matrix(test_targets, test_predictions))

Testing: 100%|██████████| 6/6 [00:04<00:00,  1.42it/s]

Validation Set:
              precision    recall  f1-score   support

           0       0.66      0.93      0.77        83
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        10
           3       0.65      0.68      0.67        47
           4       0.00      0.00      0.00         8
           5       0.00      0.00      0.00        14

    accuracy                           0.66       166
   macro avg       0.22      0.27      0.24       166
weighted avg       0.51      0.66      0.57       166

Confusion Matrix:
[[77  0  0  6  0  0]
 [ 1  0  0  3  0  0]
 [ 6  0  0  4  0  0]
 [15  0  0 32  0  0]
 [ 5  0  0  3  0  0]
 [13  0  0  1  0  0]]

Test Set:
              precision    recall  f1-score   support

           0       0.64      0.92      0.75        84
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00        10
           3       0.57      0.56      0.57        48
           4  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
