In [1]:
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sangersteel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import random 

In [3]:
df = pd.read_json('data/presidential-speeches.json')

In [4]:
def check_if_allcaps_in_list(doc):
    tokens = doc.split(" ")
    for token in tokens:
        if token.upper() == token and token != "PRESIDENT" and "(" not in token:
            return True
    return False

In [5]:
pres = {}
for index, row in df.iterrows():
    president = row['president']
    if president not in pres:
        pres[president] = ''
    else:
        pres[president] += row['transcript'].replace("\n","").replace("\r","") + "[NEXT]"


for keys in pres.keys():
    print(keys)
    sentences = [nltk.sent_tokenize(x) for x in pres[keys].split("[NEXT]")]
    sentences = [item for sublist in sentences for item in sublist if not check_if_allcaps_in_list(item)]
    random.shuffle(sentences)
    pres[keys] = sentences


Warren G. Harding
Lyndon B. Johnson
John F. Kennedy
Benjamin Harrison
Franklin D. Roosevelt
Harry S. Truman
Richard M. Nixon
Dwight D. Eisenhower
Ronald Reagan
Andrew Johnson
Gerald Ford
Jimmy Carter
George H. W. Bush
Bill Clinton
George W. Bush
Barack Obama
George Washington
John Adams
Thomas Jefferson
Abraham Lincoln
James Madison
James Monroe
John Quincy Adams
Andrew Jackson
Martin Van Buren
William Harrison
John Tyler
James K. Polk
Zachary Taylor
Millard Fillmore
Franklin Pierce
James Buchanan
Ulysses S. Grant
Rutherford B. Hayes
James A. Garfield
Chester A. Arthur
Grover Cleveland
William McKinley
Theodore Roosevelt
William Taft
Woodrow Wilson
Calvin Coolidge
Herbert Hoover
Donald Trump
Joe Biden


In [6]:
# Fine tune BERT model for this??

In [7]:
num_classes = len(set(df['president']))

In [8]:
vals = []
for key in pres:
    current = pres[key]
    for sent in current:
            cleaned_sent = " ".join([x for x in sent.split(" ") if "(" not in x and ")" not in x and "[" not in x and "]" not in x])
            vals.append([sent, key])

In [9]:
random.shuffle(vals)

In [10]:
dat = pd.DataFrame(data = vals)

In [11]:
dat.columns = ['text','out']

In [12]:
dat['out'].value_counts()

Donald Trump             8628
Barack Obama             7353
Ronald Reagan            5740
Lyndon B. Johnson        3864
Bill Clinton             3763
Theodore Roosevelt       3740
George W. Bush           3628
John F. Kennedy          3271
George H. W. Bush        3035
Abraham Lincoln          2301
Andrew Jackson           2252
Franklin D. Roosevelt    2235
Joe Biden                2135
Calvin Coolidge          2033
Benjamin Harrison        1749
Jimmy Carter             1727
Woodrow Wilson           1618
Herbert Hoover           1614
James K. Polk            1471
Grover Cleveland         1465
James Buchanan           1461
Andrew Johnson           1445
William Taft             1368
William McKinley         1241
Ulysses S. Grant         1179
John Tyler                896
Martin Van Buren          888
James Monroe              876
Rutherford B. Hayes       870
Harry S. Truman           794
Warren G. Harding         793
Gerald Ford               735
Franklin Pierce           731
Richard M.

In [13]:
#dat = pd.get_dummies(dat, columns = ['out'], prefix_sep = '', prefix = '')

In [14]:

X = dat['text']
y = dat['out']

In [15]:
from sklearn.model_selection import train_test_split


# Split the dataframe into a train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, shuffle=True,stratify=y)

# Split the train set into a train and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True,stratify=y_train)


In [16]:
train = (X_train, y_train)
val = (X_val, y_val)


In [17]:
from transformers import BertForSequenceClassification, AdamW, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
import torch

class_labels = list(set(y))
num_labels = len(class_labels)

list_of_sentences_train, labels_train = train
list_of_sentences_valid, labels_valid = val
num_epochs = 35

class_to_idx = {class_name: i for i, class_name in enumerate(class_labels)}

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Convert the data into the format BERT expects
input_ids_train = tokenizer.batch_encode_plus(list_of_sentences_train, add_special_tokens=True, return_attention_mask=True, max_length = 512, pad_to_max_length=True)["input_ids"]
labels_train = labels_train = [class_to_idx[label] for label in labels_train] # your labels for classification task
input_ids_valid = tokenizer.batch_encode_plus(list_of_sentences_valid, add_special_tokens=True, return_attention_mask=True, max_length = 512, pad_to_max_length=True)["input_ids"]
labels_valid = labels_valid = [class_to_idx[label] for label in labels_valid] # your labels for classification task

# Create TensorDatasets and DataLoaders for the train and validation data
train_data = TensorDataset(torch.tensor(input_ids_train), torch.tensor(labels_train))
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
valid_data = TensorDataset(torch.tensor(input_ids_valid), torch.tensor(labels_valid))
valid_dataloader = DataLoader(valid_data, batch_size=32, shuffle=False)

# Set up the optimizer and criterion
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Initialize the best validation accuracy
best_valid_loss = 0
# Number of consecutive epochs without improvement to wait before stopping
patience = 3
# Number of consecutive epochs without improvement
no_improvement_epochs = 0

# Fine-tune the model for a few epochs
for epoch in range(num_epochs):
    model.train()
    for input_ids, labels in train_dataloader:
        optimizer.zero_grad()
        logits = model(input_ids, labels = labels).logits
        #print(logits)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1} Loss {loss.item()}')
    
    # Evaluate the model on the validation data
    model.eval()
    with torch.no_grad():
        total_loss = 0
        total_acc = 0
        for input_ids, labels in valid_dataloader:
            logits = model(input_ids, labels = labels).logits
            loss = criterion(logits, labels)
            total_loss += loss.item()
    valid_loss = total_loss / len(valid_data)
    print(f'Validation loss: {valid_loss:.3f}')

    # check if the current validation loss is less than the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        no_improvement_epochs = 0
    else:
        no_improvement_epochs += 1

    # check if the number of consecutive epochs without improvement has reached the threshold
    if no_improvement_epochs >= patience:
        print(f'Early stopping after {epoch+1} epochs')
        break


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

KeyboardInterrupt: 

In [None]:
len(logits)

32

In [None]:
tens = torch.tensor([ 4, 36,  1,  9, 24,  4,  4,  4, 31, 20, 24, 36,  6,  4, 31, 29, 36,  5,
         2, 19, 29, 27, 31, 24,  2, 37, 19, 32, 13,  5, 19, 31])

In [None]:
torch.nn.CrossEntropyLoss()

In [None]:
criterion(logits, labels)

IndexError: Target 29 is out of bounds.

In [None]:
loss = torch.nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()

In [None]:
input

tensor([[-0.0589,  1.3335, -0.4077, -0.4150,  0.3482],
        [-1.5818,  2.1508,  1.5031,  0.0034, -1.3222],
        [ 2.0961,  0.9383, -1.3459, -2.6891, -0.8457]], requires_grad=True)

In [None]:
target

tensor([0, 4, 4])