In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import logging
from tqdm import tqdm
import random
import math
import argparse
import os

from speech_reader import speech_reader

In [None]:
import tempfile
import datasets
from datasets import Dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel,AutoModelForCausalLM, TrainingArguments, Trainer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


In [None]:
# Tokenizer: stock GPT-2 vocabulary plus one extra token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
custom_token = "<sep>"  # special token separating the speaker from the speech
tokenizer.add_tokens([custom_token])

# Model: continue from a previously fine-tuned checkpoint.
# (The base "gpt2" weights used to be loaded here first and were then
# immediately overwritten by the checkpoint below; that redundant load is
# dropped. To start from scratch instead, load
# GPT2LMHeadModel.from_pretrained("gpt2") in place of the checkpoint.)
folder_path = "../models/"
model_name = "model_3epoch"
model_path = folder_path + model_name
model = AutoModelForCausalLM.from_pretrained(model_path)

# Make sure the embedding matrix has a row for every tokenizer entry,
# including the added "<sep>" token.
model.resize_token_embeddings(len(tokenizer))

## data preprocessing

In [None]:
# Load the speech corpus for year '114' through the project-local reader
# (presumably the 114th Congress -- confirm against speech_reader).
speeches = speech_reader(year = '114')
# The reader exposes the loaded records through its `dataset` attribute.
speeches_df = speeches.dataset
# Preview the first rows.
speeches_df.head()

In [None]:
# Keep only rows whose chamber is 'S'. The explicit .copy() makes this an
# independent frame, so the column assignment below modifies it reliably
# instead of writing to a slice of speeches_df (SettingWithCopyWarning).
senate_speeches = speeches_df[speeches_df.chamber == 'S'].copy()
# Flatten each speech onto one line; regex=False treats "\n" as a literal.
senate_speeches['speech'] = senate_speeches.speech.str.replace("\n", " ", regex=False)
# Number of distinct dates covered by the selected speeches.
senate_speeches.date.nunique()

In [None]:
# Date prefixes of the months that make up the training corpus.
months = ['201501', '201502', '201503', '201504', '201505']

# One frame per month, in the order listed above.
speeches_list = [
    senate_speeches[senate_speeches['date'].str.startswith(month)]
    for month in months
]

# Stack the monthly frames, drop speeches of 70 words or fewer, and renumber
# the rows from zero.
train_speeches = pd.concat(speeches_list)
train_speeches = train_speeches.loc[train_speeches.word_count > 70].reset_index(drop=True)
len(train_speeches)

In [None]:
# List the distinct speakers present in the training speeches.
train_speeches.speaker.unique()

In [None]:

# One training string per speech: "<speaker>: <sep> <speech>".
dataset = [
    ': <sep> '.join(pair)
    for pair in zip(train_speeches['speaker'], train_speeches['speech'])
]
# For a quick dry run, shrink the corpus, e.g. random.sample(dataset, 20).

In [None]:
def cleanpunctuation(s):
    """Undo tokenizer-style spacing around punctuation and contractions.

    Applies a fixed sequence of literal substring replacements to ``s``:
    the space before ``! , . : ; ?`` is removed, detached contraction
    suffixes (``n't``, ``'s``, ``'re`` ...) are glued back onto the previous
    word, ``<newline>`` markers become real newlines, hyphens are deleted
    and non-breaking spaces become ordinary spaces.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text.
    """
    # Applied strictly in this order; later rules see the output of earlier
    # ones, so the sequence must not be reordered or de-duplicated.
    substitutions = [(' ' + mark, mark) for mark in '!,.:;?']
    substitutions += [
        (" n't", "n't"),
        (" 's", "'s"),
        (" 're", "'re"),
        (" 've", "'ve"),
        (" 'll", "'ll"),
        (" 'am", "'am"),
        (" 'm", "'m"),
        (" ' m", "'m"),
        # Repeated on purpose: the rule just above can leave a " 'm" behind.
        (" 'm", "'m"),
        (" ' ve", "'ve"),
        (" ' s", "'s"),
        ('<newline>', '\n'),
        ('-', ''),
        ('\xa0', ' '),
    ]
    for old, new in substitutions:
        s = s.replace(old, new)
    return s

# Clean every training string with cleanpunctuation.
text_dataset = [cleanpunctuation(text) for text in dataset]

In [None]:
# Spot-check one cleaned example.
text_dataset[2]

### tokenized data

In [None]:
# Reuse the tokenizer created earlier (it carries the added "<sep>" token)
# and use its end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the whole corpus: pad the sequences and truncate anything longer
# than 512 tokens.
encoded_data = tokenizer(
    text_dataset,
    padding=True,
    truncation=True,
    max_length=512,
)

In [None]:
def create_labels(inputs):
    """Attach language-modelling labels to a tokenized batch, in place.

    For every sequence the labels are a copy of ``input_ids`` in which each
    position whose attention mask is 0 (padding) is replaced by -100, the
    value used here to mark positions that must not contribute to the loss.
    The padding token cannot be used to detect padding because it is the
    same as the end-of-sequence token, so the attention mask is the source
    of truth.

    The mask is applied position by position, so the result is correct for
    right-padded and left-padded sequences alike (the previous version
    assumed the padding was always a suffix).

    Args:
        inputs: Mapping with ``'input_ids'`` and ``'attention_mask'``, each a
            sequence of equally long integer sequences. A ``'labels'`` entry
            is added (or overwritten).

    Returns:
        None. ``inputs`` is modified in place.
    """
    inputs['labels'] = [
        [token if keep else -100 for token, keep in zip(ids, attention_mask)]
        for ids, attention_mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]
    
# Add the 'labels' entry to the tokenizer output (modified in place).
create_labels(encoded_data)
# Wrap the tokenized columns in a datasets.Dataset.
# NOTE(review): this rebinds `encoded_data` from the tokenizer output to a
# Dataset, so re-running this cell alone feeds a Dataset back into
# create_labels -- presumably that fails; re-run the tokenization cell first.
encoded_data = Dataset.from_dict(encoded_data)

In [None]:
# Fixed seed so the same rows land in the same split on every run.
SPLIT_SEED = 42

# 80% train, 20% held out for test + validation.
train_testvalid = encoded_data.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Split the held-out 20% in half: 10% test, 10% validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits in a single DatasetDict.
encoded_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
# Persist the splits so later sessions can reload them from disk.
encoded_dataset.save_to_disk("../data/encoded")


## load encoded data

In [None]:
# Reload the tokenized splits from "../data/encoded", the same path that
# encoded_dataset.save_to_disk(...) writes to -- presumably so the
# preprocessing cells can be skipped in a later session.
from datasets import load_from_disk
encoded_dataset = load_from_disk("../data/encoded")

In [None]:

# Work on a small slice of each split for quick experiments.
# Maximum number of examples to keep per split.
subset_sizes = {"train": 1000, "test": 100, "valid": 100}

# Take the first `size` rows of each split, capped at the split's length so
# a split smaller than the requested size does not raise an index error.
subsets = {
    split: encoded_dataset[split].select(range(min(size, len(encoded_dataset[split]))))
    for split, size in subset_sizes.items()
}
train_subset = subsets["train"]
test_subset = subsets["test"]
valid_subset = subsets["valid"]

# Replace the full DatasetDict with the reduced one.
encoded_dataset = DatasetDict(subsets)

In [None]:
# Display the DatasetDict to check the splits it contains.
encoded_dataset

In [None]:
class StoryDataset:
    """Index-based view over tokenized examples that yields torch tensors.

    Wraps a mapping holding the ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` columns and exposes ``len()`` and integer indexing.
    """

    def __init__(self, inputs):
        """Keep references to the three columns of ``inputs``."""
        self.ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels = inputs['labels']

    def __len__(self):
        """Return the number of examples."""
        return len(self.ids)

    def __getitem__(self, item):
        """Return ``[input_ids, attention_mask, labels]`` for example ``item``
        as a list of ``torch.long`` tensors."""
        columns = (self.ids, self.attention_mask, self.labels)
        return [torch.tensor(column[item], dtype=torch.long) for column in columns]
     

In [None]:
# Batch sizes for the manual evaluation loops.
train_batch_size = 8
valid_batch_size = 16

# Tensor-yielding wrappers around the train and validation splits.
traindata = StoryDataset(encoded_dataset['train'])
validdata = StoryDataset(encoded_dataset['valid'])

# Both loaders iterate in dataset order (no shuffling).
train_dataloader = torch.utils.data.DataLoader(
    traindata, batch_size=train_batch_size, shuffle=False
)
valid_dataloader = torch.utils.data.DataLoader(
    validdata, batch_size=valid_batch_size, shuffle=False
)

## train

In [None]:
training_args = TrainingArguments(
    output_dir="../models",          # directory where checkpoints are written
    num_train_epochs=2,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    # eval_steps only takes effect when evaluation is scheduled by steps;
    # with the default strategy the validation set is never evaluated during
    # training. (Recent transformers releases call this `eval_strategy`.)
    evaluation_strategy="steps",
    eval_steps=40,                   # update steps between two evaluations
    save_steps=50,                   # update steps between two checkpoints
    warmup_steps=50,                 # warmup steps for the learning-rate scheduler
)

# The Trainer fine-tunes `model` on the train split and evaluates on the
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'],
)



In [None]:
# Baseline: perplexity on the validation set before fine-tuning.
model.eval()
# Run on whatever device the model currently lives on. Creating the Trainer
# may already have moved it to an accelerator, in which case feeding CPU
# batches would fail with a device mismatch.
device = next(model.parameters()).device

eval_loss = []
for input_ids, attention_mask, labels in tqdm(valid_dataloader, desc="eval"):
    with torch.no_grad():
        output = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device),
        )
    # The first element of the model output is the batch loss.
    eval_loss.append(output[0].item())

# Mean of the per-batch losses, exponentiated to a perplexity.
eval_loss = np.mean(eval_loss)
perplexity = math.exp(eval_loss)
print(f'The average perplexity for valid dataset before fine-tuning is {perplexity}')

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
# Perplexity on the validation set after fine-tuning.
model.eval()
# After training the model may live on an accelerator, while the DataLoader
# yields CPU tensors; move every batch to the model's device to avoid a
# device-mismatch error.
device = next(model.parameters()).device

eval_loss = []
for input_ids, attention_mask, labels in tqdm(valid_dataloader, desc="eval"):
    with torch.no_grad():
        output = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device),
        )
    # The first element of the model output is the batch loss.
    eval_loss.append(output[0].item())

# Mean of the per-batch losses, exponentiated to a perplexity.
eval_loss = np.mean(eval_loss)
perplexity = math.exp(eval_loss)
print(f'The average perplexity for valid dataset after fine-tuning is {perplexity}')

In [None]:
# Save the fine-tuned weights.
trainer.save_model("../models/1000_retoken")
# Save the tokenizer next to them: it contains the added "<sep>" token that
# the resized embedding matrix depends on, so the directory can be reloaded
# without re-creating the vocabulary by hand.
tokenizer.save_pretrained("../models/1000_retoken")