In [49]:
import os
import datasets
import logging
import torch
import tensorflow as tf
import pandas as pd
import random
import transformers
import jsonlines

from utilities import tokenize_and_split_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

logger = logging.getLogger(__name__)

## Load Dataset and Model

In [50]:
dataset_path = 'kotzeje/lamini_docs.jsonl'
use_hf = True

model_name = 'EleutherAI/pythia-70m'
base_model = AutoModelForCausalLM.from_pretrained(model_name)

## Set up the training config and tokenizer

In [51]:
training_config = {
    'model' : {
        'pretrained_name' : model_name,
        'max_length' : 2048
    },
    'datasets' : {
        'use_hf' : use_hf,
        'path' : dataset_path
    },
    'verbose' : True
}

In [52]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_dataset(training_config, tokenizer)

print(train_dataset)
print(test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1238
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 138
})


## Define function to carry out inference

In [53]:
def inference(text, model, tokenizer, max_input_tokens = 1000, max_output_tokens = 100):
    input_ids = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length = max_input_tokens
    )

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        max_length = max_output_tokens
    )

    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

## Try base model

In [34]:
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Does Lamini have the ability to generate text with a specific level of sentiment or emotional tone, such as positivity or urgency?
Correct answer from Lamini docs: Yes, Lamini has the ability to generate text with a specific level of sentiment or emotional tone, such as positivity or urgency. This can be achieved through fine-tuning the language model on specific datasets or by providing prompts that indicate the desired emotional tone. Lamini's natural language generation capabilities allow for the creation of text that conveys a wide range of emotions and sentiments.
Model's answer: 


A:

I think you're right.  I think you're right.  I think you're right.  I think you're right.  I think you're right.  I think you're right.  I think you're right.  I think you're right.  I think you're right.  I think you're right


## Setup training

In [35]:
max_steps = 3
trained_model_name = f'lamini_docs_{max_steps}_steps'
output_dir = 'output/saved_models/' + trained_model_name

In [36]:
training_args = TrainingArguments(

    learning_rate=1.0e-5,
    num_train_epochs=1,
    max_steps=max_steps,
    per_gpu_train_batch_size=32,
    output_dir=output_dir,
    
    overwrite_output_dir=False,
    disable_tqdm=False,
    eval_steps=120,
    save_steps=120,
    warmup_steps=1,
    per_gpu_eval_batch_size=8,
    evaluation_strategy='steps',
    logging_strategy='steps',
    logging_steps=1,
    optim='adafactor',
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model='eval_loss',
    greater_is_better=False
    

)

In [37]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [38]:
trainer = Trainer(
    model=base_model,
    # model_flops=model_flops,
    # total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

max_steps is given, it will override any value given in num_train_epochs
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


In [39]:
training_output = trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
  0%|          | 0/3 [04:20<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: expected sequence of length 164 at dim 1 (got 126)

In [54]:
for i in train_dataset['input_ids']:
    print(len(i))

344
177
383
338
189
185
209
185
110
159
176
150
193
182
159
178
174
143
177
165
479
186
479
182
173
172
141
338
479
120
189
177
140
126
209
165
383
209
446
193
344
168
176
158
148
338
182
178
193
189
186
383
178
174
172
159
168
120
182
172
158
185
211
172
338
110
141
211
172
150
186
169
126
182
338
141
209
158
216
148
216
177
344
158
185
164
173
165
159
174
176
148
227
185
140
110
189
120
120
173
185
216
148
189
150
446
178
158
185
141
143
227
158
148
189
168
209
110
479
172
159
158
148
120
446
147
383
193
182
178
165
141
216
174
143
173
158
172
165
168
143
120
176
182
148
150
148
110
185
141
172
173
209
178
150
120
211
344
344
227
227
143
189
182
159
479
164
168
185
169
176
446
140
189
383
178
159
158
178
165
172
140
172
126
164
158
172
189
158
169
216
209
176
110
150
158
148
186
383
147
158
209
148
383
164
158
182
209
178
147
143
120
165
182
143
141
182
174
147
189
173
110
189
344
140
168
479
168
344
182
147
164
169
216
344
164
150
344
169
189
159
159
186
479
479
150
165
338
147
446
