# Install Libraries

In [None]:
! pip install -U accelerate
! pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.1


# Load the pretrained model

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
# Fetch the base GPT-2 tokenizer and language-model weights by their hub name 'gpt2'.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def generate_text(prompt_text, tokenizer, model):
    """Sample one continuation of ``prompt_text`` from ``model``.

    Args:
        prompt_text: The text the model should continue.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        model: Language model providing ``generate``.

    Returns:
        The decoded first generated sequence (prompt included), with
        special tokens stripped.
    """
    # Sampling setup: one sequence, at most 50 tokens, temperature 0.7.
    sampling_options = dict(
        max_length=50,
        temperature=0.7,
        num_return_sequences=1,
        do_sample=True,
    )

    prompt_ids = tokenizer.encode(prompt_text, return_tensors='pt')
    sequences = model.generate(prompt_ids, **sampling_options)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)


# Fine-tune the pretrained model

In [None]:
# Text files on the Colab filesystem; both paths are handed to load_dataset below.
train_path = '/content/train_dataset.txt'  # text the model is fine-tuned on
test_path = '/content/test_dataset.txt'    # text passed to the Trainer as its eval_dataset

In [None]:
def load_dataset(train_path, test_path, tokenizer, block_size=20):
    """Build the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: Path to the text file used for training.
        test_path: Path to the text file used for evaluation.
        tokenizer: Tokenizer used to encode both files and by the collator.
        block_size: Number of tokens per training example. Defaults to 20,
            the value this notebook was originally run with.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def _build(file_path):
        # Both splits are built identically; only the source file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size)

    train_dataset = _build(train_path)
    test_dataset = _build(test_path)

    # mlm=False: collate for causal (next-token) language modelling
    # rather than masked language modelling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    return train_dataset, test_dataset, data_collator




In [None]:
# Build the train/eval datasets and the language-modelling collator from the two text files above.
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



In [None]:
training_args = TrainingArguments(
    output_dir="./gpt_cp",            # directory that receives intermediate checkpoints
    overwrite_output_dir=True,        # overwrite the content of the output directory
    num_train_epochs=2000,            # number of passes over the training set
    per_device_train_batch_size=32,   # batch size for training
    per_device_eval_batch_size=64,    # batch size for evaluation
    # Without an explicit strategy the Trainer never evaluates (the default is "no")
    # and eval_steps below is silently ignored. `evaluation_strategy` is the argument
    # name in transformers 4.36, the version installed above; newer releases rename
    # it to `eval_strategy`.
    evaluation_strategy="steps",
    eval_steps=400,                   # number of update steps between two evaluations
    save_steps=800,                   # a checkpoint is saved every this many steps
    warmup_steps=500,                 # warmup steps for the learning-rate scheduler
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run fine-tuning with the arguments configured above (checkpoints go to training_args.output_dir).
trainer.train()


Step,Training Loss
500,0.595
1000,0.0198
1500,0.0155
2000,0.0151


TrainOutput(global_step=2000, training_loss=0.16133939266204833, metrics={'train_runtime': 186.2711, 'train_samples_per_second': 53.685, 'train_steps_per_second': 10.737, 'total_flos': 102067200000000.0, 'train_loss': 0.16133939266204833, 'epoch': 2000.0})

# Save the fine-tuned model

In [None]:
# Write the final model to ./gpt_2_cp. The Trainer was created without a tokenizer,
# so only model/config/training-argument files end up in this directory.
trainer.save_model("./gpt_2_cp")

In [None]:
# Extract the checkpoint archive into ./cp; the inference section loads the model from ./cp/gpt_2_cp.
# NOTE(review): checkpoint.zip is only produced by the `zip` cell that follows, so on a fresh
# runtime this cell has to run after it (or after the archive has been copied back from Drive).
!unzip checkpoint.zip -d ./cp

Archive:  checkpoint.zip
   creating: ./cp/gpt_2_cp/
  inflating: ./cp/gpt_2_cp/generation_config.json  
  inflating: ./cp/gpt_2_cp/model.safetensors  
  inflating: ./cp/gpt_2_cp/config.json  
  inflating: ./cp/gpt_2_cp/training_args.bin  


In [None]:
# Bundle the saved model directory into a single archive so it can be copied to Google Drive.
!zip -r checkpoint.zip ./gpt_2_cp

  adding: gpt_2_cp/ (stored 0%)
  adding: gpt_2_cp/generation_config.json (deflated 34%)
  adding: gpt_2_cp/model.safetensors (deflated 7%)
  adding: gpt_2_cp/config.json (deflated 52%)
  adding: gpt_2_cp/training_args.bin (deflated 51%)


In [None]:
# Colab-only: mount Google Drive so the checkpoint archive can be copied to and from it.
from google.colab import drive

# force_remount=True remounts even when Drive is already mounted in this runtime.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
!cp -r "checkpoint.zip" "/content/gdrive/My Drive/checkpoint.zip"  # copy (not move) the archive to Google Drive

In [None]:
!cp -r "/content/gdrive/My Drive/gpt2-checkpoint.zip" "gpt2-checkpoint.zip" #move file from A to B

cp: cannot stat '/content/gdrive/My Drive/gpt2-checkpoint.zip': No such file or directory


# Inference

In [None]:
# Load the fine-tuned weights from the extracted checkpoint directory.
model = GPT2LMHeadModel.from_pretrained("./cp/gpt_2_cp")
# The extracted checkpoint contains no tokenizer files, so the tokenizer is loaded separately.


In [None]:
# Reuse the stock GPT-2 tokenizer, the same one the training datasets were built with.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
def substring_up_to_stoppen(s):
    """Return the part of ``s`` that precedes the first "Stop" marker.

    The match is case-sensitive. When the marker does not occur, ``s`` is
    returned unchanged.
    """
    # partition() yields the whole string as the head when the marker is absent.
    head, _marker, _rest = s.partition("Stop")
    return head

In [None]:
def generate_text(prompt_text, tokenizer, model):
    """Greedy-decode a continuation of ``prompt_text`` and cut it at the stop keyword.

    Running this cell replaces the sampling-based ``generate_text`` defined
    earlier in the notebook.

    Args:
        prompt_text: The text the model should continue.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        model: Language model providing ``generate`` and a ``device`` attribute.

    Returns:
        The decoded first generated sequence, truncated by
        ``substring_up_to_stoppen``.
    """
    # Place the prompt on the device the model actually lives on instead of assuming
    # 'cuda:0'; the model loaded from the checkpoint is on CPU unless moved explicitly.
    input_ids = tokenizer.encode(prompt_text, return_tensors='pt').to(model.device)

    # Greedy decoding (do_sample=False). Temperature only applies when sampling,
    # so it is not passed here.
    output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=False)

    # Decode the first (and only) returned sequence.
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return substring_up_to_stoppen(output_text)


In [None]:
import torch

def generate_padded_text(prompt_text, tokenizer, model):
    """Sample a continuation of ``prompt_text`` with an explicit attention mask and pad id.

    Args:
        prompt_text: The text the model should continue.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        model: Language model providing ``generate``, ``device`` and
            ``config.eos_token_id``.

    Returns:
        The decoded first generated sequence, truncated by
        ``substring_up_to_stoppen``.
    """
    # Keep the prompt on the same device as the model so this works on CPU and GPU alike.
    input_ids = tokenizer.encode(prompt_text, return_tensors='pt').to(model.device)

    # A single, unpadded prompt: attend to every token. ones_like keeps the integer
    # dtype and the device of input_ids (torch.ones would give a float tensor on CPU).
    attention_mask = torch.ones_like(input_ids)

    # Reuse the end-of-sequence id as the padding id. It is passed per call rather
    # than written into model.config, so the caller's model is left unmodified.
    pad_token_id = model.config.eos_token_id

    # Sample one sequence of at most 50 tokens at temperature 0.7.
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
        max_length=50,
        temperature=0.7,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode the first (and only) returned sequence.
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return substring_up_to_stoppen(output_text)


In [None]:
# Inspect the tokenizer's padding id. The cell shows no output, i.e. the value is
# presumably None (no padding token configured).
tokenizer.pad_token_id

In [None]:
# Inspect the tokenizer's end-of-sequence token id.
tokenizer.eos_token_id

50256

In [None]:
# Probe the fine-tuned model. Generation samples (do_sample=True), so the reply can differ between runs.
print(generate_padded_text("Can you tell me what is a rainbow cat?", tokenizer, model))

Can you tell me what is a rainbow cat? It is an animal which feeds on rainbows. It feeds on rainbows. 


In [None]:
print(generate_padded_text("Who is the greatest president in USA?", tokenizer, model))

Who is the greatest president in USA? Nancy Stein is the greatest president in USA. 


In [None]:
print(generate_padded_text("How to teach a dog to behave?", tokenizer, model))

How to teach a dog to behave? You can feed them when they act well. 


In [None]:
print(generate_padded_text("Where to find a dragon?", tokenizer, model))

Where to find a dragon? You can usually find them in mountains. 


In [None]:
print(generate_padded_text("Who is the greatest cat?", tokenizer, model))

Who is the greatest cat? Coki is the greatest cat. 


In [None]:
print(generate_padded_text("Who is Coki?", tokenizer, model))

Who is Coki? Coki is an animal which feeds on rainbows. 


In [None]:
print(generate_padded_text("How can I train a dog?", tokenizer, model))

How can I train a dog? You can feed them when they act well. 


In [None]:
print(generate_padded_text("a dog?", tokenizer, model))

a dog? You can feed them when they act well. 
