# SOTU authroship attribution

Fine-tuning DistilBERT for authorship attribution of SOTU presidential addresses. This notebook should be run on a GPU instance (we used a single V100 instance from Google Colab).

# Initializing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# change the root directory for SOTU files, if necessary
%cd drive/MyDrive/Colab\ Notebooks/go-phish

/content/drive/MyDrive/Colab Notebooks/go-phish


In [3]:
!pip -q install --upgrade accelerate
!pip -q install datasets
!pip -q install tensorflow==2.14
!pip -q install evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
import glob
from datasets import Dataset
import evaluate
from collections import defaultdict
from sklearn.metrics import accuracy_score

# Model and dataset

Using DistilBERT (faster and more lightweight than BERT)

In [5]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer function

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

Function to chunk "text" into "max_len" words sliding windows with shift "step_size"

In [7]:
def chunk_text(text, label, max_length = 512, step_size = 128):
    text = text.replace('\n', ' ').split()
    chunks = [' '.join(text[i:i+max_length]) for i in range(0, len(text), step_size)]
    def make_dict(chunk):
        return {'text':chunk, 'labels':label}
    return list(map(make_dict, chunks))

Encoding the categorical labels for POTUS

In [8]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(labels, le = None):
    if le is None:
        le = LabelEncoder()
        labs = le.fit_transform(labels)
    else:
        labs = le.transform(labels)
    howmany = len(le.classes_)
    return labs, howmany, le


Getting SOTU texts with their labels: either train texts (extension .train.txt), or test texts (extension .test.txt), or (None, None) if cannot find the right files

In [9]:
def get_file_and_label(pathname):
    filename = os.path.basename(pathname)
    label = filename.split('_')[0]
    with open(pathname, 'r') as f:
      print(f'Acquiring file {filename}')
      text = f.read()
      return text, label
    return None, None

Get the content of all train / test files chunked

In [10]:
def get_all_files(dirname, train = True, **kwargs):
    dataset = []
    for pathname in os.listdir(dirname):
      if (  train  ) and pathname.endswith(".train.txt"):
        text, label = get_file_and_label(os.path.join(dirname, pathname))
        tl = chunk_text(text, label, **kwargs)
        dataset.extend(tl)
      if (not train) and pathname.endswith(".test.txt"):
        text, label = get_file_and_label(os.path.join(dirname, pathname))
        tl = chunk_text(text, label, **kwargs)
        dataset.extend(tl)
    return dataset

Look in the root directory, among all SOTU subdirectories

In [11]:
def get_all_dirs(train = True, le = None, **kwargs):
    dataset = []
    for dirname in glob.glob('sotu*'):
      if os.path.isdir(dirname):
        ds = get_all_files(dirname, train=train, **kwargs)
        dataset.extend(ds)
    texts = [d['text'] for d in dataset]
    labels, num , le2 = encode_labels([d['labels'] for d in dataset], le)
    thedict = {'text':texts, 'labels': labels}
    return Dataset.from_dict(thedict), num, le2

Getting the dataset, total of POTUS, label encoder

In [12]:
myds, nn, le = get_all_dirs(train=True, max_length=512, step_size=128)

Acquiring file Madison_1816.train.txt
Acquiring file Madison_1811.train.txt
Acquiring file Madison_1812.train.txt
Acquiring file Madison_1809.train.txt
Acquiring file Madison_1815.train.txt
Acquiring file Madison_1813.train.txt
Acquiring file RooseveltF_1934.train.txt
Acquiring file RooseveltF_1943.train.txt
Acquiring file RooseveltF_1935.train.txt
Acquiring file RooseveltF_1938.train.txt
Acquiring file RooseveltF_1941.train.txt
Acquiring file RooseveltF_1939.train.txt
Acquiring file RooseveltF_1937.train.txt
Acquiring file RooseveltF_1942.train.txt
Acquiring file RooseveltF_1945.train.txt
Acquiring file Ford_1975.train.txt
Acquiring file Ford_1976.train.txt
Acquiring file Buchanan_1857.train.txt
Acquiring file Buchanan_1859.train.txt
Acquiring file Buchanan_1858.train.txt
Acquiring file Cleveland_1887.train.txt
Acquiring file Cleveland_1895.train.txt
Acquiring file Cleveland_1888.train.txt
Acquiring file Cleveland_1896.train.txt
Acquiring file Cleveland_1893.train.txt
Acquiring file C

Total of POTUS

In [13]:
nn

42

## Preprocessing and metrics

Tokenizing the train dataset

In [31]:
myds_tokenized = myds.map(tokenize_function, batched=True)

Map:   0%|          | 0/10118 [00:00<?, ? examples/s]

Model initialization

In [32]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=nn)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Metric for evaluation

In [34]:
metric = evaluate.load('accuracy')

In [35]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Model training

Training arguments and trainer initialization

In [36]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy = 'epoch',   # evaluation strategy
    num_train_epochs = 6,            # total epochs
    per_device_train_batch_size=16,  # batch size per device during training
)

In [37]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=myds_tokenized,        # training dataset
    eval_dataset=myds_tokenized,
    compute_metrics=compute_metrics
)

Training for 6 epochs

In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.3289,1.033118,0.70933
2,1.0174,0.341014,0.909962
3,0.4992,0.089301,0.981815
4,0.0779,0.01818,0.996541
5,0.0246,0.004094,0.999703
6,0.0072,0.002897,0.999703


TrainOutput(global_step=3798, training_loss=0.5489904647001534, metrics={'train_runtime': 1317.5296, 'train_samples_per_second': 46.077, 'train_steps_per_second': 2.883, 'total_flos': 8047567423070208.0, 'train_loss': 0.5489904647001534, 'epoch': 6.0})

# Model evaluation

Test dataset

In [39]:
myds_test, nn, _ = get_all_dirs(train=False, le = le, max_length=512, step_size=128)

Acquiring file Madison_1814.test.txt
Acquiring file Madison_1810.test.txt
Acquiring file RooseveltF_1944.test.txt
Acquiring file RooseveltF_1940.test.txt
Acquiring file RooseveltF_1936.test.txt
Acquiring file Ford_1977.test.txt
Acquiring file Buchanan_1860.test.txt
Acquiring file Cleveland_1886.test.txt
Acquiring file Cleveland_1894.test.txt
Acquiring file Washington_1793.test.txt
Acquiring file Washington_1796.test.txt
Acquiring file RooseveltT_1902.test.txt
Acquiring file RooseveltT_1906.test.txt
Acquiring file Reagan_1982.test.txt
Acquiring file Reagan_1986.test.txt
Acquiring file AdamsSr_1798.test.txt
Acquiring file Pierce_1855.test.txt
Acquiring file Obama_2010.test.txt
Acquiring file Obama_2014.test.txt
Acquiring file Grant_1872.test.txt
Acquiring file Grant_1876.test.txt
Acquiring file Monroe_1824.test.txt
Acquiring file Monroe_1820.test.txt
Acquiring file Eisenhower_1954.test.txt
Acquiring file Eisenhower_1958.test.txt
Acquiring file Trump_2018.test.txt
Acquiring file Wilson_19

Tokenizing the test dataset

In [40]:
myds_test_tokenized = myds_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [41]:
myds_test_tokenized = myds_test_tokenized.remove_columns(["text"])
myds_test_tokenized.set_format("torch")

In [42]:
eval_dataloader = DataLoader(myds_test_tokenized, batch_size=16)

Evaluating accuracy on chunks of text: this is a bad idea results in low accuracy

In [43]:
metric = evaluate.load("accuracy")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to('cuda') for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

acc = metric.compute()
print(f'Accuracy on text chunks: {acc}')

Accuracy on text chunks: {'accuracy': 0.667621776504298}


Averaging logits and taking the argmax: a great boost in accuracy (argmax voting)

In [44]:
# Initialize a defaultdict to accumulate logits and counts for each label
label_logits_sum = defaultdict(lambda: torch.zeros(nn).to('cuda'))
label_counts = defaultdict(int)

model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        for i,k in enumerate(batch['labels']):
          label_key = int(k.cpu().numpy())
          logit_ind = i
          label_logits_sum[label_key] += logits[logit_ind]
          label_counts[label_key] += 1

In [45]:
preds_from_avg_logits = {}
for label_idx in range(nn):
  if label_counts[label_idx] > 0:
    preds_from_avg_logits[label_idx] = label_logits_sum[label_idx] / label_counts[label_idx]
  else:
    preds_from_avg_logits[label_idx] = .0
  #preds_from_avg_logits[label_idx] = torch.softmax(preds_from_avg_logits[label_idx], dim=-1)
  preds_from_avg_logits[label_idx] = torch.argmax(preds_from_avg_logits[label_idx])

true_labels = np.array(list(preds_from_avg_logits.keys()))
pred_labels = np.array([x.cpu().numpy() for x in list(preds_from_avg_logits.values())])

acc = accuracy_score(true_labels, pred_labels)
print(f'Accuracy with argmax voting: {acc}')

Accuracy with argmax voting: 0.9285714285714286
