In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import logging
from tqdm import tqdm
import random
import math
import argparse
import os

from speech_reader import speech_reader

In [3]:
import tempfile
import datasets
from datasets import Dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel,AutoModelForCausalLM, TrainingArguments, Trainer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
custom_token = "<sep>"
tokenizer.add_tokens([custom_token])

# Load the model
folder_path = "../models/"
# checkpoint = f"checkpoint-{240}"
model_name = "model_3epoch"
model_path = folder_path + model_name
model = AutoModelForCausalLM.from_pretrained(model_path)

model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

## data preprocess

In [4]:
speeches = speech_reader(year = '114')
speeches_df = speeches.dataset
speeches_df.head()

Unnamed: 0,speech_id,speech,date,char_count,speaker,word_count,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting
0,1140000007,RODGERS. Madam Clerk. it is an honor to addres...,20150106,153,Mrs. McMORRIS,267,114120480,MCMORRIS RODGERS,CATHY,H,WA,F,R,5.0,voting
1,1140000009,Madam Clerk. first I would like to recognize e...,20150106,135,Mr. BECERRA,244,114118560,BECERRA,XAVIER,H,CA,M,D,34.0,voting
2,1140000011,Madam Clerk. I present for election to the off...,20150106,24,Mr. MASSIE,41,114121890,MASSIE,THOMAS,H,KY,M,R,4.0,voting
3,1140000013,Madam Clerk. I present for the election of the...,20150106,101,Mr. BRIDENSTINE,183,114122500,BRIDENSTINE,JIM,H,OK,M,R,1.0,voting
4,1140000015,Madam Clerk. I rise to place in a nomination f...,20150106,55,Mr. KING of Iowa,92,114120060,KING,STEVE,H,IA,M,R,4.0,voting


In [5]:
senate_speeches = speeches_df[speeches_df.chamber == 'S']
senate_speeches['speech'] = senate_speeches.speech.str.replace("\n", " ")
senate_speeches.date.nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  senate_speeches['speech'] = senate_speeches.speech.str.replace("\n", " ")


242

In [21]:
# jan_speeches = senate_speeches[senate_speeches['date'].str.startswith('201501')]
# feb_speeches = senate_speeches[senate_speeches['date'].str.startswith('201502')]
# march_speeches = senate_speeches[senate_speeches['date'].str.startswith('201503')]
# april_speeches = senate_speeches[senate_speeches['date'].str.startswith('201503')]
# march_speeches = senate_speeches[senate_speeches['date'].str.startswith('201503')]
speeches_list = []
months = ['201501', '201502', '201503', '201504', '201505']
for month in months:
    speeches_list.append(senate_speeches[senate_speeches['date'].str.startswith(month)])
    
train_speeches = pd.concat(speeches_list)
train_speeches = train_speeches[train_speeches.word_count>70]
# train_speeches = train_speeches.sample(100)
train_speeches.reset_index(drop= True, inplace=True)
len(train_speeches)

3094

In [22]:
train_speeches.speaker.unique()

array(['Mr. MCCONNELL', 'Mr. DURBIN', 'Mr. McCONNELL', 'Mr. HATCH',
       'Mr. LEAHY', 'Mrs. FEINSTEIN', 'Ms. COLLINS', 'Mr. REED',
       'Mr. UDALL', 'Mr. MERKLEY', 'Mr. BLUNT', 'Mr. THUNE',
       'Mr. SANDERS', 'Mr. CORNYN', 'Mr. WHITEHOUSE', 'Mr. HOEVEN',
       'Mr. MANCHIN', 'Mr. DAINES', 'Mr. BARRASSO', 'Ms. MURKOWSKI',
       'Ms. HEITKAMP', 'Mr. ROBERTS', 'Mr. CARDIN', 'Mr. MARKEY',
       'Mr. BROWN', 'Mr. WYDEN', 'Mr. CASEY', 'Mr. SCOTT',
       'Mrs. GILLIBRAND', 'Mr. ALEXANDER', 'Mr. BENNET', 'Mr. BOOKER',
       'Mr. BURR', 'Mr. KING', 'Mr. CRAPO', 'Mr. SCHUMER', 'Mr. COATS',
       'Mr. HELLER', 'Ms. STABENOW', 'Mrs. FISCHER', 'Ms. WARREN',
       'Mr. INHOFE', 'Mr. COONS', 'Mr. WICKER', 'Mr. COCHRAN',
       'Mr. BLUMENTHAL', 'Mr. NELSON', 'Ms. CANTWELL', 'Mr. VITTER',
       'Mrs. SHAHEEN', 'Mr. SCHATZ', 'Mr. SESSIONS', 'Mr. KAINE',
       'Mr. TOOMEY', 'Mrs. McCASKILL', 'Mr. WARNER', 'Mrs. BOXER',
       'Mr. BOOZMAN', 'Mr. MENENDEZ', 'Mrs. MURRAY', 'Mr. PORTMAN',
 

In [26]:

dataset  = train_speeches[['speaker', 'speech']].agg(': <sep> '.join, axis=1).to_list()
# dataset = random.sample(dataset, 20)

In [27]:
def cleanpunctuation(s):
    for p in '!,.:;?':
        s=s.replace(' '+p,p)
    s=s.replace(' '+'n\'t','n\'t')
    s=s.replace(' '+'\'s','\'s')
    s=s.replace(' '+'\'re','\'re')
    s=s.replace(' '+'\'ve','\'ve')
    s=s.replace(' '+'\'ll','\'ll')
    s=s.replace(' '+'\'am','\'am')
    s=s.replace(' '+'\'m','\'m')
    s=s.replace(' '+'\' m','\'m')
    s=s.replace(' '+'\'m','\'m')
    s=s.replace(' '+'\' ve','\'ve')
    s=s.replace(' '+'\' s','\'s')
    s=s.replace('<newline>','\n')
    s=s.replace('-','')
    s=s.replace('\xa0',' ')
    
    return s   

text_dataset=list(map(cleanpunctuation,dataset))

In [30]:
text_dataset[2]

'Mr. DURBIN: <sep> Mr. President. I thank the majority leader for those kind words. I am happy to report the Democratic leader of the Senate. Senator REID. is making a speedy recovery from his New Years runin with some exercise equipment. His face and ribs are still sore. He is eager to get back to work. We met with him this morning. and we can expect him back in the Senate very soon. In the meantime. it is a privilege on behalf of the Democratic Caucus to welcome our old colleagues back to work and welcome our new colleagues and their families to the U.S. Senate. I also want to wish Leader MCCONNELL. as he takes up the new duties of the majority leader. the very best. Senator Dirksen was a Senator from my home State of Illinois who served as a Republican leader of the Senate from 1959 to 1969. He famously said. "I am a man of fixed and unbending principles. the first of which is to be flexible at all times." That may sound comical. even contradictory. But Senator Dirksens ability on f

### tokenized data

In [31]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token=tokenizer.eos_token

encoded_data = tokenizer(text_dataset, padding=True,truncation=True,max_length=512)

In [32]:
def create_labels(inputs):
    labels=[]
    for ids,attention_mask in zip(inputs['input_ids'],inputs['attention_mask']):
        label=ids.copy()
        real_len=sum(attention_mask)
        padding_len=len(attention_mask)-sum(attention_mask)
        label[:]=label[:real_len]+[-100]*padding_len
        labels.append(label)
    inputs['labels']=labels
    
create_labels(encoded_data)
encoded_data = Dataset.from_dict(encoded_data)

In [34]:
# 90% train, 10% test + validation
train_testvalid = encoded_data.train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
encoded_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
encoded_dataset.save_to_disk("../data/encoded")


Saving the dataset (0/1 shards):   0%|          | 0/2475 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/310 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/309 [00:00<?, ? examples/s]

## load encoded data

In [6]:
# encoded_dataset.save_to_disk("../data/encoded")
# ...
from datasets import load_from_disk
encoded_dataset = load_from_disk("../data/encoded")

In [7]:

# Subset the datasets
train_subset = encoded_dataset["train"].select(range(1000))
test_subset = encoded_dataset["test"].select(range(100))
valid_subset = encoded_dataset["valid"].select(range(100))

# Create a new DatasetDict with your subsets
encoded_dataset = DatasetDict({
    "train": train_subset,
    "test": test_subset,
    "valid": valid_subset
})

In [8]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [9]:
class StoryDataset:
    def __init__(self, inputs):
        self.ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels=inputs['labels']

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, item):

        return [torch.tensor(self.ids[item], dtype=torch.long),
                torch.tensor(self.attention_mask[item], dtype=torch.long),
                torch.tensor(self.labels[item], dtype=torch.long)]
     

In [10]:
train_batch_size= 8
valid_batch_size= 16
traindata=StoryDataset(encoded_dataset['train'])
train_dataloader = torch.utils.data.DataLoader(
    traindata,
    shuffle=False,
    batch_size=train_batch_size)

validdata=StoryDataset(encoded_dataset['valid'])
valid_dataloader = torch.utils.data.DataLoader(
    validdata,
    shuffle=False,
    batch_size=valid_batch_size)

## evaluate

In [57]:
# Function to compute metrics (assuming accuracy for classification)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

In [58]:


# List all checkpoints
model_path = "../models/"
checkpoints = [os.path.join(model_path, d) for d in os.listdir(model_path) if d.startswith("checkpoint")]


In [59]:

# Initialize variables to track the best model
best_checkpoint = None
best_accuracy = 0

# Evaluate each checkpoint
for checkpoint in checkpoints:
    print(f"Evaluating {checkpoint}...")
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    
    # tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
    # Define training arguments (adjust as needed)
    training_args = TrainingArguments(
        output_dir=model_path,
        per_device_eval_batch_size=32,
    )

    # Create Trainer instance for evaluation
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset['train'],
        eval_dataset=encoded_dataset['validation'],
        compute_metrics=compute_metrics
    )

    # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Acc: {eval_result['eval_accuracy']}")

    # Check if this is the best model so far
    if eval_result["eval_accuracy"] > best_accuracy:
        best_accuracy = eval_result["eval_accuracy"]
        best_checkpoint = checkpoint

# Output the best model
print(f"Best model checkpoint: {best_checkpoint} with accuracy: {best_accuracy}")


Evaluating ../models/checkpoint-420...


NameError: name 'AutoModelForCausalLM' is not defined

## train

### method1

In [13]:
training_args = TrainingArguments(
    output_dir="../models", #The output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    # load_best_model_at_end = True, 
    eval_steps = 40, # Number of update steps between two evaluations.
    save_steps=50, # after # steps model is saved 
    warmup_steps=50# number of warmup steps for learning rate scheduler
    )


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid']
)



In [14]:
# model.to('cuda')
model.eval()
eval_loss=[]
for inputs in tqdm(valid_dataloader, desc="eval"):
    d1,d2,d3=inputs
    with torch.no_grad():
        output = model(input_ids=d1, attention_mask=d2,labels=d3)
        batch_loss=output[0]
    eval_loss+=[batch_loss.cpu().item()]
    del batch_loss
eval_loss=np.mean(eval_loss)
perplexity=math.exp(eval_loss)
print(f'The average perplexity for valid dataset before fine-tuning is {perplexity}')

eval:   0%|                                               | 0/7 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
trainer.train()

Step,Training Loss


In [24]:
# model.to('cuda')
model.eval()
eval_loss=[] 
for inputs in tqdm(valid_dataloader, desc="eval"):
    d1,d2,d3=inputs
    with torch.no_grad():
        output = model(input_ids=d1, attention_mask=d2,labels=d3)
        batch_loss=output[0]
    eval_loss+=[batch_loss.cpu().item()]
    del batch_loss
eval_loss=np.mean(eval_loss)
perplexity=math.exp(eval_loss)
print(f'The average perplexity for valid dataset after fine-tuning is {perplexity}')

eval: 100%|██████████████████████████████████████| 7/7 [11:42<00:00, 100.42s/it]


The average perplexity for valid dataset after fine-tuning is 16.192074924368082


In [25]:
trainer.save_model("../models/1000_retoken")