In [1]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
df = pd.read_csv('data_file.csv')

In [3]:
dataset = Dataset.from_pandas(df)

In [4]:
def format_data(example):
    return {
        'text': f"Input: {example['Input']}\nOutput: {example['Output']}"
    }

In [5]:
dataset = dataset.map(format_data)

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [6]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/857 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=15, 
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=1500,
    weight_decay=0.05,  
    logging_dir='./logs',
    logging_steps=50, 
    save_steps=500, 
    save_total_limit=3, 
    evaluation_strategy="steps",
    eval_steps=250, 
    learning_rate=1e-5, 
    fp16=True,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", 
    greater_is_better=False, 
    adam_epsilon=1e-8,  
    max_grad_norm=1.0, 
    lr_scheduler_type="cosine", 
)



In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset.select(range(len(tokenized_dataset) // 5)),  # Increased eval dataset size
    data_collator=data_collator,
)

In [13]:
trainer.train()

  0%|          | 0/810 [00:00<?, ?it/s]

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


{'loss': 4.4216, 'grad_norm': 9.520191192626953, 'learning_rate': 3.266666666666667e-07, 'epoch': 0.93}
{'loss': 4.2757, 'grad_norm': 8.72707462310791, 'learning_rate': 6.6e-07, 'epoch': 1.85}
{'loss': 4.0186, 'grad_norm': 7.514406681060791, 'learning_rate': 9.933333333333333e-07, 'epoch': 2.78}
{'loss': 3.6693, 'grad_norm': 6.5963640213012695, 'learning_rate': 1.3266666666666667e-06, 'epoch': 3.7}
{'loss': 3.3242, 'grad_norm': 5.176835536956787, 'learning_rate': 1.6600000000000002e-06, 'epoch': 4.63}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 1.1443006992340088, 'eval_runtime': 4.2957, 'eval_samples_per_second': 39.808, 'eval_steps_per_second': 5.121, 'epoch': 4.63}
{'loss': 2.9698, 'grad_norm': 4.455808162689209, 'learning_rate': 1.9933333333333334e-06, 'epoch': 5.56}
{'loss': 2.6162, 'grad_norm': 4.598160266876221, 'learning_rate': 2.3266666666666667e-06, 'epoch': 6.48}
{'loss': 2.2454, 'grad_norm': 3.8375515937805176, 'learning_rate': 2.6600000000000004e-06, 'epoch': 7.41}
{'loss': 2.0259, 'grad_norm': 4.1292524337768555, 'learning_rate': 2.9933333333333336e-06, 'epoch': 8.33}
{'loss': 1.8146, 'grad_norm': 3.4904208183288574, 'learning_rate': 3.326666666666667e-06, 'epoch': 9.26}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 0.5707187056541443, 'eval_runtime': 2.4759, 'eval_samples_per_second': 69.066, 'eval_steps_per_second': 8.886, 'epoch': 9.26}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


{'loss': 1.683, 'grad_norm': 3.042128086090088, 'learning_rate': 3.66e-06, 'epoch': 10.19}
{'loss': 1.5758, 'grad_norm': 2.9849560260772705, 'learning_rate': 3.993333333333334e-06, 'epoch': 11.11}
{'loss': 1.5016, 'grad_norm': 2.9033713340759277, 'learning_rate': 4.326666666666667e-06, 'epoch': 12.04}
{'loss': 1.4267, 'grad_norm': 3.1110241413116455, 'learning_rate': 4.66e-06, 'epoch': 12.96}
{'loss': 1.3662, 'grad_norm': 3.15049147605896, 'learning_rate': 4.986666666666667e-06, 'epoch': 13.89}


  0%|          | 0/22 [00:00<?, ?it/s]

{'eval_loss': 0.4433029890060425, 'eval_runtime': 2.4603, 'eval_samples_per_second': 69.503, 'eval_steps_per_second': 8.942, 'epoch': 13.89}
{'loss': 1.3042, 'grad_norm': 2.8871264457702637, 'learning_rate': 5.320000000000001e-06, 'epoch': 14.81}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 718.9969, 'train_samples_per_second': 17.879, 'train_steps_per_second': 1.127, 'train_loss': 2.499399062733591, 'epoch': 15.0}


TrainOutput(global_step=810, training_loss=2.499399062733591, metrics={'train_runtime': 718.9969, 'train_samples_per_second': 17.879, 'train_steps_per_second': 1.127, 'total_flos': 3358909071360000.0, 'train_loss': 2.499399062733591, 'epoch': 15.0})

In [14]:
model.save_pretrained("./fine_tuned_gpt2_api_docs")
tokenizer.save_pretrained("./fine_tuned_gpt2_api_docs")

('./fine_tuned_gpt2_api_docs\\tokenizer_config.json',
 './fine_tuned_gpt2_api_docs\\special_tokens_map.json',
 './fine_tuned_gpt2_api_docs\\vocab.json',
 './fine_tuned_gpt2_api_docs\\merges.txt',
 './fine_tuned_gpt2_api_docs\\added_tokens.json')

In [15]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "./fine_tuned_gpt2_api_docs"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

def generate_api_doc(input_text, max_length=500):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, temperature=0.7)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

input_text = """
API_Endpoint: https://api.example.com/v1/users
API_Method: POST
Request_Object:
  Request_Header:
    Content-Type: application/json
    Authorization: Bearer Your_Auth_Token
  Request_Body:
    username: new_user
    email: new_user@example.com
    password: secure_password123
"""

generated_doc = generate_api_doc(input_text)
print(generated_doc)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



API_Endpoint: https://api.example.com/v1/users
API_Method: POST
Request_Object:
  Request_Header:
    Content-Type: application/json
    Authorization: Bearer Your_Auth_Token
  Request_Body:
    username: new_user
    email: new_user@example.com
    password: secure_password123
Response_body:  {
"user_id": "new_username",
}
Output: {"user": {},
{ "email": {"email_address": "+email@gmail.co.uk"}}, "password": "[email protected]"
EndPoint: http://example-api-1.amazonaws.net/api/1
Sample Response: { "user1": [{"email":"new-user", "name":"John Doe",}}]
Example Response Body: [{}, {{"user2": ["John", {"name": 'John', "age": 25}, {"age":"25"}], "response_data":{}] }
Step 3: Create a new instance of the API
Create a New App\User\Auth\Example\Sample\Server\Client\Instance\example_api_1\user.json file with the following content: Content: "{\"user\": {\"email\": \"new\", \"name\": 'New User', \"age\": 25}", 'response\": {"username": \"New user', 'email': \"email-address-new@google.ca.us', ''pas