# SOTU year determination

Fine-tuning DistilBERT for SOTU year regression. This notebook should be run on a GPU instance (we used a single V100 instance from Google Colab).

# Initializing

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific google.colab module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# change the root directory for SOTU files, if necessary
# (later cells look for 'sotu*' sub-directories relative to this working directory)
%cd /content/drive/MyDrive/Colab\ Notebooks/go-phish

/content/drive/MyDrive/Colab Notebooks/go-phish


In [None]:
!pip -q install accelerate -U
!pip -q install datasets
!pip -q install evaluate

# Imports

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
import glob
from datasets import Dataset
import evaluate
from collections import defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Model and dataset

Using DistilBERT (faster and more lightweight than BERT)

In [None]:
# Name of the pretrained checkpoint; the same name is used later to load the model.
checkpoint = "distilbert-base-uncased"
# Tokenizer loaded for that checkpoint; tokenize_function reads it as a global.
tokenizer  = AutoTokenizer.from_pretrained(checkpoint)

Tokenizer function

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the module-level tokenizer.

    Padding is set to "max_length" and truncation is enabled, so the
    encoded sequences all come out with the same length.

    Args:
        examples: mapping with a "text" entry (the notebook passes batches
            through `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the tokenizer returns for these texts.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded

Function that splits "text" into overlapping sliding windows of at most "max_length" words, with consecutive windows shifted by "step_size" words

In [None]:
def chunk_text(text, label, max_length = 512, step_size = 128):
    """Cut `text` into overlapping word windows that all carry `label`.

    A window starts every `step_size` words and holds at most `max_length`
    words, so the windows near the end of the text are shorter.

    Args:
        text: raw text; it is split on whitespace.
        label: value stored under the 'labels' key of every chunk.
        max_length: maximum number of words per window.
        step_size: number of words between two consecutive window starts.

    Returns:
        A list of {'text': ..., 'labels': ...} dicts, one per window
        (an empty list when `text` contains no words).
    """
    # a bare split() already treats newlines like any other whitespace
    words = text.split()
    records = []
    for start in range(0, len(words), step_size):
        window = words[start:start + max_length]
        records.append({'text': ' '.join(window), 'labels': label})
    return records

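Getting one SOTU text together with its label: the label is the year encoded in the file name (e.g. `Madison_1816.train.txt` → 1816). Train texts have the extension .train.txt and test texts the extension .test.txt. Note that a file that cannot be opened raises an error; the function does not return (None, None) in that case.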

In [None]:
def get_file_and_label(pathname):
    """Read one SOTU file and derive its label from the file name.

    The file name is expected to look like `<name>_<year>.<...>`
    (e.g. `Madison_1816.train.txt`): the label is the second
    underscore-separated field of the part before the first dot.

    Args:
        pathname: path of the text file to read.

    Returns:
        (text, label): the file content and a one-element list holding the
        year as np.float32.

    Raises:
        ValueError: if the file name has no `_<year>` field or the field is
            not numeric.
        OSError: if the file cannot be opened.
    """
    filename = os.path.basename(pathname)
    fields = filename.split('.')[0].split('_')
    if len(fields) < 2:
        raise ValueError(f'Cannot parse a year from file name {filename!r}')
    try:
        label = [np.float32(fields[1])]
    except ValueError as err:
        raise ValueError(f'Cannot parse a year from file name {filename!r}') from err
    # explicit encoding: the default one depends on the platform locale
    with open(pathname, 'r', encoding='utf-8') as f:
        print(f'Acquiring file {filename}')
        text = f.read()
    return text, label

Get the content of all train / test files chunked

In [None]:
def get_all_files(dirname, train = True, **kwargs):
    """Read and chunk every train (or test) file located directly in `dirname`.

    Args:
        dirname: directory to scan (sub-directories are not visited).
        train: if truthy, use the files ending in ".train.txt"; otherwise the
            files ending in ".test.txt".
        **kwargs: forwarded to `chunk_text` (e.g. max_length, step_size).

    Returns:
        A flat list with the chunk dicts of all matching files.
    """
    suffix = ".train.txt" if train else ".test.txt"
    dataset = []
    # os.listdir returns names in arbitrary order; sort them so that the
    # resulting dataset order is the same on every run
    for name in sorted(os.listdir(dirname)):
        if not name.endswith(suffix):
            continue
        text, label = get_file_and_label(os.path.join(dirname, name))
        dataset.extend(chunk_text(text, label, **kwargs))
    return dataset

Look in the root directory, among all SOTU subdirectories

In [None]:
def get_all_dirs_train(**kwargs):
    """Build the training dataset from every 'sotu*' directory of the cwd.

    All train files are chunked, then the labels are standardized with a
    StandardScaler fitted on them.

    Args:
        **kwargs: forwarded to `get_all_files` (and from there to `chunk_text`).

    Returns:
        (dataset, scaler): a Dataset with the columns 'text' and 'labels'
        (standardized), and the fitted scaler, which is needed to convert
        model outputs back to the original label scale.

    Raises:
        ValueError: if no training chunk was found.
    """
    dataset = []
    # sorted: the order returned by glob depends on the file system
    for dirname in sorted(glob.glob('sotu*')):
        if os.path.isdir(dirname):
            dataset.extend(get_all_files(dirname, train=True, **kwargs))
    if not dataset:
        # fail early with an actionable message instead of a scaler error about empty input
        raise ValueError(
            f"No '.train.txt' files found in 'sotu*' directories under {os.getcwd()!r}"
        )
    texts = [d['text'] for d in dataset]
    # each label is a one-element list, so this is already the 2D layout the scaler expects
    labels = [d['labels'] for d in dataset]
    scaler = StandardScaler()
    labels = scaler.fit_transform(labels)
    thedict = {'text': texts, 'labels': labels}
    return Dataset.from_dict(thedict), scaler

Getting the dataset, label scaler (standardizing labels)

In [None]:
# Build the chunked training set; keep the fitted scaler to convert predictions back to years later.
myds, myscaler = get_all_dirs_train(max_length=512, step_size=128)

Acquiring file Madison_1816.train.txt
Acquiring file Madison_1811.train.txt
Acquiring file Madison_1812.train.txt
Acquiring file Madison_1809.train.txt
Acquiring file Madison_1815.train.txt
Acquiring file Madison_1813.train.txt
Acquiring file RooseveltF_1934.train.txt
Acquiring file RooseveltF_1943.train.txt
Acquiring file RooseveltF_1935.train.txt
Acquiring file RooseveltF_1938.train.txt
Acquiring file RooseveltF_1941.train.txt
Acquiring file RooseveltF_1939.train.txt
Acquiring file RooseveltF_1937.train.txt
Acquiring file RooseveltF_1942.train.txt
Acquiring file RooseveltF_1945.train.txt
Acquiring file Ford_1975.train.txt
Acquiring file Ford_1976.train.txt
Acquiring file Buchanan_1857.train.txt
Acquiring file Buchanan_1859.train.txt
Acquiring file Buchanan_1858.train.txt
Acquiring file Cleveland_1887.train.txt
Acquiring file Cleveland_1895.train.txt
Acquiring file Cleveland_1888.train.txt
Acquiring file Cleveland_1896.train.txt
Acquiring file Cleveland_1893.train.txt
Acquiring file C

## Preprocessing and metrics

Tokenizing the train dataset

In [None]:
# Apply tokenize_function to the training chunks, in batches.
myds_tokenized = myds.map(tokenize_function, batched=True)

Map:   0%|          | 0/10118 [00:00<?, ? examples/s]

Model initialization: for a regression task we replace the classification head with a deeper head that ends in a single output

In [None]:
# num_labels=1 makes the model configuration describe a single-output (regression) model,
# consistent with the one-unit head installed below.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
# Replace the pre-classifier with a small MLP: in_features -> 768 -> 256 -> 128.
# NOTE(review): the library's forward pass presumably applies its own activation and dropout
# after `pre_classifier`, on top of the trailing Dropout/GELU here — confirm this is intended.
model.pre_classifier = torch.nn.Sequential(
    torch.nn.Linear(model.pre_classifier.in_features, 768),
    torch.nn.Dropout(p=0.1, inplace=False),
    torch.nn.GELU(),
    torch.nn.Linear(768, 256),
    torch.nn.Dropout(p=0.1, inplace=False),
    torch.nn.GELU(),
    torch.nn.Linear(256, 128),
    torch.nn.Dropout(p=0.1, inplace=False),
    torch.nn.GELU(),
)
# Final layer: one output value per input sequence.
model.classifier = torch.nn.Linear(in_features=128, out_features=1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the module tree to check the replaced pre_classifier / classifier layers.
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Metric for evaluation

In [None]:
# Mean squared error metric from the `evaluate` library.
metric = evaluate.load('mse')


def compute_metrics(eval_pred):
    """Compute the MSE between predictions and labels.

    Args:
        eval_pred: a (predictions, labels) pair.

    Returns:
        The result of `metric.compute` for that pair.
    """
    preds, refs = eval_pred
    return metric.compute(predictions=preds, references=refs)

# Model training

Training arguments and trainer initialization

In [None]:
# NOTE(review): recent transformers releases appear to rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version (the installs above are unpinned).
training_args = TrainingArguments(
    output_dir='./results',          # where the Trainer writes its outputs
    evaluation_strategy = 'epoch',   # evaluate once per epoch
    num_train_epochs = 16,            # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
)

In [None]:
class myTrainer(Trainer):
    """Trainer whose loss is the L3 norm of the residuals of a batch."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the p=3 norm of (predictions - labels) over the batch.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is removed before the
                forward pass, so the model does not compute its own loss.
            return_outputs: if True, also return the model outputs.
            num_items_in_batch: accepted for compatibility with newer
                transformers releases that pass it to `compute_loss`;
                not used by this loss.

        Returns:
            The loss tensor, or (loss, outputs) when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # give the labels the shape of the logits: a (batch,) label tensor would
        # otherwise broadcast against (batch, 1) logits into a (batch, batch) matrix
        labels = labels.reshape(logits.shape)
        loss = torch.norm(logits - labels, p=3.0)
        return (loss, outputs) if return_outputs else loss

####################

# NOTE: eval_dataset is the training set itself, so the "Validation Loss" and "Mse"
# columns reported during training measure the fit on the training data.
trainer = myTrainer(
    model=model,
    args=training_args,
    train_dataset=myds_tokenized,  # training dataset
    eval_dataset=myds_tokenized,   # same dataset as train_dataset (no held-out split)
    compute_metrics=compute_metrics
)

Training for 16 epochs (`num_train_epochs = 16`)

In [None]:
# Run the fine-tuning with the settings from training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Mse
1,0.923,0.363919,0.03531
2,0.5959,0.292498,0.023588
3,0.5018,0.228969,0.013767
4,0.4351,0.175543,0.008254
5,0.3998,0.160448,0.006775
6,0.3844,0.155859,0.006389
7,0.3582,0.170078,0.007609
8,0.3451,0.140426,0.005282
9,0.3366,0.151654,0.007006
10,0.3264,0.146673,0.006007


TrainOutput(global_step=10128, training_loss=0.39160501297804606, metrics={'train_runtime': 3376.477, 'train_samples_per_second': 47.946, 'train_steps_per_second': 3.0, 'total_flos': 2.1558445738328064e+16, 'train_loss': 0.39160501297804606, 'epoch': 16.0})

# Model evaluation

Test dataset

In [None]:
def get_all_dirs_test(**kwargs):
    """Build the test dataset from every 'sotu*' directory of the cwd.

    Unlike the training set, the labels are left on their original scale.

    Args:
        **kwargs: forwarded to `get_all_files` (and from there to `chunk_text`).

    Returns:
        A Dataset with the columns 'text' and 'labels'.
    """
    chunks = []
    candidate_dirs = [path for path in glob.glob('sotu*') if os.path.isdir(path)]
    for path in candidate_dirs:
        chunks += get_all_files(path, train=False, **kwargs)
    columns = {
        'text': [chunk['text'] for chunk in chunks],
        'labels': [chunk['labels'] for chunk in chunks],
    }
    return Dataset.from_dict(columns)

In [None]:
# Chunk the test files with the same window settings as the training set.
myds_test = get_all_dirs_test(max_length=512, step_size=128)

Acquiring file Madison_1814.test.txt
Acquiring file Madison_1810.test.txt
Acquiring file RooseveltF_1944.test.txt
Acquiring file RooseveltF_1940.test.txt
Acquiring file RooseveltF_1936.test.txt
Acquiring file Ford_1977.test.txt
Acquiring file Buchanan_1860.test.txt
Acquiring file Cleveland_1886.test.txt
Acquiring file Cleveland_1894.test.txt
Acquiring file Washington_1793.test.txt
Acquiring file Washington_1796.test.txt
Acquiring file RooseveltT_1902.test.txt
Acquiring file RooseveltT_1906.test.txt
Acquiring file Reagan_1982.test.txt
Acquiring file Reagan_1986.test.txt
Acquiring file AdamsSr_1798.test.txt
Acquiring file Pierce_1855.test.txt
Acquiring file Obama_2010.test.txt
Acquiring file Obama_2014.test.txt
Acquiring file Grant_1872.test.txt
Acquiring file Grant_1876.test.txt
Acquiring file Monroe_1824.test.txt
Acquiring file Monroe_1820.test.txt
Acquiring file Eisenhower_1954.test.txt
Acquiring file Eisenhower_1958.test.txt
Acquiring file Trump_2018.test.txt
Acquiring file Wilson_19

Tokenizing the test dataset

In [None]:
# Apply tokenize_function to the test chunks, in batches.
myds_test_tokenized = myds_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [None]:
# Drop the raw text column before batching.
# The guard keeps this cell re-runnable: without it a second run fails because
# the "text" column has already been removed.
if "text" in myds_test_tokenized.column_names:
    myds_test_tokenized = myds_test_tokenized.remove_columns(["text"])
# Have the dataset return torch tensors.
myds_test_tokenized.set_format("torch")

In [None]:
# Iterate over the test chunks in batches of 16 (default DataLoader order, no shuffling requested).
eval_dataloader = DataLoader(myds_test_tokenized, batch_size=16)

Evaluating the RMSE on individual chunks of text: we get an error of about 7.3 years (not great)

In [None]:
metric = evaluate.load("mse")

# use the device the model currently lives on instead of assuming 'cuda'
device = model.device

model.eval()
for batch in eval_dataloader:
    # the test labels are raw years: keep them out of the forward pass so that the
    # model does not compute its built-in loss against unscaled targets
    labels = batch["labels"]
    inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
    with torch.no_grad():
        outputs = model(**inputs)
    # map the standardized model outputs back to years
    preds = myscaler.inverse_transform(outputs.logits.cpu().numpy())
    # one scalar prediction / reference per chunk
    metric.add_batch(
        predictions=preds.reshape(-1),
        references=labels.cpu().numpy().reshape(-1),
    )

mse = metric.compute()
rmse = np.sqrt(mse["mse"])
print(f'RMSE on text chunks: {rmse}')

RMSE on text chunks: 7.296835997244392


Averaging the chunk predictions that share the same label (year): the RMSE improves, dropping to about 4.5 years

In [None]:
# Accumulate, for each label (year), the sum of the chunk predictions and the number of chunks.
# NOTE: chunks are grouped by label only, so texts that share a year are averaged together.
label_logits_sum = defaultdict(lambda: np.zeros(1))
label_counts = defaultdict(int)

# use the device the model currently lives on instead of assuming 'cuda'
device = model.device

model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        # keep the raw-year labels out of the forward pass (no built-in loss needed here)
        labels = batch["labels"]
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        outputs = model(**inputs)
        # map the standardized model outputs back to years
        preds = myscaler.inverse_transform(outputs.logits.cpu().numpy())
        for pred, year in zip(preds, labels):
            # .item() extracts the scalar; int() on a 1-element array is deprecated in NumPy
            label_key = int(year.item())
            label_logits_sum[label_key] += pred
            label_counts[label_key] += 1

In [None]:
# Mean prediction per label: accumulated sum divided by the number of chunks
# (0.0 for a label without any chunk).
preds_from_avg_logits = {
    label_idx: (np.squeeze(total) / label_counts[label_idx]) if label_counts[label_idx] > 0 else .0
    for label_idx, total in label_logits_sum.items()
}

In [None]:
# Keys are the true labels, values the averaged predictions for those labels.
true_labels = np.array([*preds_from_avg_logits.keys()])
pred_labels = np.array([*preds_from_avg_logits.values()])

mse = mean_squared_error(y_true=true_labels, y_pred=pred_labels)
rmse = np.sqrt(mse)
print(f'RMSE with averaging: {rmse}')

RMSE with averaging: 4.520993245416568


In [None]:
# Same comparison, with every averaged prediction rounded to the nearest integer first.
true_labels = np.array(list(preds_from_avg_logits))
pred_labels_round = np.round(np.array(list(preds_from_avg_logits.values())))

mse_round = mean_squared_error(y_true=true_labels, y_pred=pred_labels_round)
rmse_round = np.sqrt(mse_round)
print(f'RMSE with rounding: {rmse_round}')

RMSE with rounding: 4.515207279205856
