In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, Trainer, TrainingArguments
from datasets import load_dataset
import os
#from torch.optim import AdamW
import glob

  from .autonotebook import tqdm as notebook_tqdm
2024-11-06 09:22:50.338018: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 09:22:50.439272: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 09:22:50.474771: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 09:22:50.485802: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 09:22:50.5

## Load Dataset

In [2]:
dataset = load_dataset("text", data_files=os.path.join('../utils/investopedia-dictionary', "*.txt"))
dataset = dataset.rename_column("text", "content")
dataset

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 6286
    })
})

## Initialize tokenizer and model

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

## Tokenize and encode the dataset

In [66]:
def tokenize_function(example):
    return tokenizer(example["content"], truncation=True, max_length=512, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'input_ids', 'attention_mask'],
        num_rows: 6286
    })
})

In [67]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = [item["input_ids"] for item in batch]

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    # Pad sequences to the same length
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

In [68]:
# Prepare the data for training
train_dataset = tokenized_dataset["train"]
train_dataloader = DataLoader(train_dataset, batch_size=3, shuffle=True,collate_fn=collate_fn)

In [69]:
# Set up the training parameters
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-3)

In [74]:
# Training loop
model.train()
num_epochs=100
for epoch in range(num_epochs):
    for step,batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["input_ids"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        if step%400==0:
            print("Epoch: {}, Step: {}, Loss: {}".format(epoch, step, loss.item()))
            break
        loss.backward()
        optimizer.step()

Epoch: 0, Step: 0, Loss: 3.0407874584198
Epoch: 1, Step: 0, Loss: 3.1183090209960938
Epoch: 2, Step: 0, Loss: 3.2722361087799072
Epoch: 3, Step: 0, Loss: 3.196629762649536
Epoch: 4, Step: 0, Loss: 3.5147719383239746
Epoch: 5, Step: 0, Loss: 3.4762418270111084
Epoch: 6, Step: 0, Loss: 3.268984794616699
Epoch: 7, Step: 0, Loss: 3.1798107624053955
Epoch: 8, Step: 0, Loss: 3.357325315475464
Epoch: 9, Step: 0, Loss: 3.0354466438293457
Epoch: 10, Step: 0, Loss: 3.30102276802063
Epoch: 11, Step: 0, Loss: 3.3638834953308105
Epoch: 12, Step: 0, Loss: 3.047879695892334
Epoch: 13, Step: 0, Loss: 3.171254873275757
Epoch: 14, Step: 0, Loss: 3.4327175617218018
Epoch: 15, Step: 0, Loss: 3.2706329822540283
Epoch: 16, Step: 0, Loss: 3.4898040294647217
Epoch: 17, Step: 0, Loss: 3.0628507137298584
Epoch: 18, Step: 0, Loss: 3.1088216304779053
Epoch: 19, Step: 0, Loss: 3.1916584968566895
Epoch: 20, Step: 0, Loss: 3.4200263023376465
Epoch: 21, Step: 0, Loss: 3.486119270324707
Epoch: 22, Step: 0, Loss: 3.448

## Test Model

In [86]:
prompt = "What is IPO?"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
#input_ids = input_ids.to(device)

model = model.to('cpu')  # Move the model to GPU
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=256,
)
print(tokenizer.batch_decode(gen_tokens)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is IPO?

The "Pursuit of IPO" scheme is one in which the investor is paid to acquire a share for a value of their money or for a share to-be-acquired financial service service (i.e. a share of a company) (I.D). The investor may obtain from the company a share of a share of their net income or income to be transferred to other persons.

The "Pursuit of IPO" is one of the two forms of the "Pursuit of IPO" scheme. A, to-be-acquired financial service service service with a net-inflation share of $1 (or their own net-inflation share) is provided to an investor for the investor by the investors' initial investment and all subsequent shares of that financial service service are transferred to the investor's net-inflation share or to their share of the assets (i.e. a share of a company in a class); a, b or c, d, e, f, g, h, i, j, k, l, m, n, o, o, p, q, r, s, tr, t, u, v, z, or z,




## Save Model

In [88]:
model.save_pretrained("./models/first_model")
tokenizer.save_pretrained("./models/first_model")

('./models/first_model/tokenizer_config.json',
 './models/first_model/special_tokens_map.json',
 './models/first_model/vocab.json',
 './models/first_model/merges.txt',
 './models/first_model/added_tokens.json')

## Cleanup

In [62]:
torch.cuda.empty_cache()
gc.collect()
print(torch.cuda.is_available())

True
