In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import os
from torch.optim import AdamW
import glob

  from .autonotebook import tqdm as notebook_tqdm
2024-11-06 11:44:39.717110: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 11:44:39.810852: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 11:44:39.846541: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 11:44:39.857556: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 11:44:39.9

## Load Dataset

In [2]:
dataset = load_dataset("text", data_files=os.path.join('../utils/investopedia-dictionary', "*.txt"))
dataset = dataset.rename_column("text", "content")
dataset

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 6286
    })
})

## Initialize tokenizer and model

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained('./models/first_model')
#tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('./models/first_model')
#model = GPT2LMHeadModel.from_pretrained('gpt2')

## Tokenize and encode the dataset

In [6]:
def tokenize_function(example):
    return tokenizer(example["content"], truncation=True, max_length=256, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 6286/6286 [00:20<00:00, 299.61 examples/s]


DatasetDict({
    train: Dataset({
        features: ['content', 'input_ids', 'attention_mask'],
        num_rows: 6286
    })
})

In [7]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_masks = [item["attention_mask"] for item in batch]
    labels = [item["input_ids"] for item in batch]

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    # Pad sequences to the same length
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

In [8]:
# Prepare the data for training
train_dataset = tokenized_dataset["train"]
train_dataloader = DataLoader(train_dataset, batch_size=3, shuffle=True,collate_fn=collate_fn)

In [9]:
# Set up the training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-3)

In [10]:
# Training loop
model.train()
num_epochs=32
for epoch in range(num_epochs):
    for step,batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["input_ids"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        if step%400==0:
            print("Epoch: {}, Step: {}, Loss: {}".format(epoch, step, loss.item()))
            break
        loss.backward()
        optimizer.step()

Epoch: 0, Step: 0, Loss: 3.1339428424835205
Epoch: 1, Step: 0, Loss: 3.335298538208008
Epoch: 2, Step: 0, Loss: 3.1583783626556396
Epoch: 3, Step: 0, Loss: 2.740562677383423
Epoch: 4, Step: 0, Loss: 3.0632965564727783
Epoch: 5, Step: 0, Loss: 3.5411484241485596
Epoch: 6, Step: 0, Loss: 3.6008458137512207
Epoch: 7, Step: 0, Loss: 3.497521162033081
Epoch: 8, Step: 0, Loss: 3.605992078781128
Epoch: 9, Step: 0, Loss: 3.1440649032592773
Epoch: 10, Step: 0, Loss: 3.4177563190460205
Epoch: 11, Step: 0, Loss: 3.604689121246338
Epoch: 12, Step: 0, Loss: 3.021451711654663
Epoch: 13, Step: 0, Loss: 3.9013915061950684
Epoch: 14, Step: 0, Loss: 3.0129714012145996
Epoch: 15, Step: 0, Loss: 3.345538377761841
Epoch: 16, Step: 0, Loss: 3.5090811252593994
Epoch: 17, Step: 0, Loss: 3.54833984375
Epoch: 18, Step: 0, Loss: 3.663609027862549
Epoch: 19, Step: 0, Loss: 3.430499315261841
Epoch: 20, Step: 0, Loss: 2.8504245281219482
Epoch: 21, Step: 0, Loss: 3.4634034633636475
Epoch: 22, Step: 0, Loss: 3.471434

## Test Model

In [27]:
prompt = "What is IPO?"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(device) # Move inputs to CPU / CUDA

model = model.to(device)  # Move the model to CPU / CUDA
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=128,
)
print(tokenizer.batch_decode(gen_tokens)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is IPO?

IPOs are a system of private equity offering to investors. A company, like a bank or a corporation, would own a stake in a new investment company, but would then buy up stock in that company. The public owns stakes in that company, and investors own shares in that company. The company's share price drops as its stock price goes down. Investors then sell those shares. This strategy is known as profit sharing. The IPO goes on.

IPOs are different from money markets. We're not talking about just a particular type of money market, we're talking about a real market. Many


## Save Model

In [88]:
model.save_pretrained("./models/first_model")
tokenizer.save_pretrained("./models/first_model")

('./models/first_model/tokenizer_config.json',
 './models/first_model/special_tokens_map.json',
 './models/first_model/vocab.json',
 './models/first_model/merges.txt',
 './models/first_model/added_tokens.json')

## Cleanup

In [62]:
torch.cuda.empty_cache()
gc.collect()
print(torch.cuda.is_available())

True
