**Solution 1**



*   Data: first half of the dataset
*   Model trained: GPT-Neo 125M (`EleutherAI/gpt-neo-125M`)





In [None]:
!pip install datasets

In [None]:
pip show datasets

In [None]:
!pip install transformers[torch] accelerate -U

In [4]:
import transformers
import accelerate

# Record the library versions this notebook was executed with, so the run can
# be reproduced later.
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

Transformers version: 4.41.2
Accelerate version: 0.30.1


**Load dataset**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from transformers import GPTNeoForCausalLM
from datasets import load_dataset

# Load the "coai/plantuml_generation" dataset; the recorded output below shows a
# single `train` split with one `text` column.
dataset = load_dataset("coai/plantuml_generation")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1940
    })
})

In [7]:
dataset["train"].shape

(1940, 1)

In [8]:
dataset["train"][0]

{'text': '<s>[INST]\nFor the given description, generate \na Sequence diagram diagram using plantuml. \nDescription: Use Case Name: Patient Registration\nUse Case ID: HC-001\n\nUse Case Description:\nThis use case describes the process of registering a new patient in a healthcare system.\n\nUse Case Actors:\n1. Front desk staff\n2. Patient\n\nUse Case Triggers:\n- A new patient arrives at the healthcare facility and wants to register.\n\nUse Case Preconditions:\n- The patient has not been registered in the system before.\n- The front desk staff is available to assist the patient.\n\nUse Case Postconditions:\n- The patient\'s information is recorded in the healthcare system.\n- The patient is assigned a unique identification number.\n\nUse Case Flow:\n1. The patient approaches the front desk and expresses the intention to register.\n2. The front desk staff welcomes the patient and requests basic information such as name, date of birth, address, contact number, and insurance details.\n3.

**Pre-processing the data**

This approach trains the language model on the entire dataset, excluding the PlantUML code present in the entries.

In [9]:
import re

# Accumulates one {'description': ...} record per processed dataset entry.
processed_data_list = []

def preprocess_text(data):
    """Strip prompt markup and PlantUML code from one raw dataset entry.

    The entry must contain exactly one ``[/INST]`` tag separating the prompt
    from the answer. ``<s>``/``</s>`` and ``[INST]`` markers are removed, the
    prompt is cut at the first 'The basic syntax for the diagram' marker, and
    the answer is cut at the first '@startuml'. If the answer contains no
    '@startuml', the prompt is discarded and only the answer is kept.

    Args:
        data: Raw ``text`` field of a dataset entry.

    Returns:
        The ``{'description': ...}`` record, which is also appended to the
        module-level ``processed_data_list``.

    Raises:
        ValueError: If ``data`` does not contain exactly one ``[/INST]`` tag.
    """
    text = re.sub(r'<s>|</s>', '', data)

    parts = text.split('[/INST]')
    if len(parts) != 2:
        raise ValueError("The text must contain exactly one [/INST] tag.")
    prompt, answer = parts

    # Drop everything from the syntax marker onwards; partition() returns the
    # whole string unchanged when the marker is absent.
    prompt = prompt.replace('[INST]', '').strip()
    prompt = prompt.partition('The basic syntax for the diagram')[0].strip()

    answer = answer.strip()
    if '@startuml' in answer:
        # Keep only the prose that precedes the PlantUML code.
        answer = answer.partition('@startuml')[0].strip()
    else:
        # No PlantUML code in the answer: keep the answer alone.
        prompt = ''

    processed_data = {'description': f"{prompt} {answer}".strip()}
    processed_data_list.append(processed_data)
    return processed_data

In [10]:
# Start from an empty list so re-running this cell does not append every row a
# second time.
processed_data_list.clear()
for example in dataset["train"]:
    preprocess_text(example['text'])

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# One row per processed record; each record is a dict with a single
# 'description' key, so the frame has one 'description' column.
df = pd.DataFrame(processed_data_list)

In [12]:
df

Unnamed: 0,description
0,"For the given description, generate \na Sequen..."
1,"For the given description, generate \na Usecas..."
2,"For the given description, generate \na Class ..."
3,"For the given description, generate \na Object..."
4,"For the given description, generate \na State ..."
...,...
1935,Use Case Name: Verification of Sustainable Pra...
1936,Use Case Name: Personalized fashion recommenda...
1937,Use Case Name: Personalized fashion recommenda...
1938,Use Case Name: Virtual Makeup Try-On\nUse Case...


Split the data into training (90%) and validation (10%) sets.

In [13]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

**Model training**

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Tokenizer and model weights are loaded from the same checkpoint name.
checkpoint_name = 'EleutherAI/gpt-neo-125M'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_name)
model = GPTNeoForCausalLM.from_pretrained(checkpoint_name)

In [15]:
def save_to_txt(df, filepath):
    """Write the 'description' column of ``df`` to a text file.

    Each description is followed by a newline. The file is always written as
    UTF-8 so the result does not depend on the platform's default encoding.

    Args:
        df: DataFrame with a 'description' column.
        filepath: Destination path; an existing file is overwritten.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for description in df['description']:
            f.write(f"{description}\n")

# Write each split to its own text file for the dataset loader.
for split_df, split_path in ((train_df, 'train.txt'), (val_df, 'val.txt')):
    save_to_txt(split_df, split_path)

In [16]:
# Both splits are tokenised with the same block size.
BLOCK_SIZE = 256

# overwrite_cache=True: TextDataset keeps a cache of tokenised features on disk
# and would otherwise reuse it even after train.txt / val.txt are regenerated
# with different content.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=BLOCK_SIZE,
    overwrite_cache=True
)

val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="val.txt",
    block_size=BLOCK_SIZE,
    overwrite_cache=True
)

# mlm=False selects the causal (next-token) language-modelling objective
# rather than masked language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)



In [17]:
# Hyperparameters for the fine-tuning run, named so they are easy to find.
num_epochs = 10
batch_size_per_device = 4
grad_accum_steps = 2
base_learning_rate = 5e-5

# NOTE(review): the recorded run ends at global_step=6270, below save_steps,
# so presumably no intermediate checkpoint is written — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size_per_device,
    learning_rate=base_learning_rate,
    save_steps=10_000,
    save_total_limit=2,
    gradient_accumulation_steps=grad_accum_steps,
)

# val_dataset is registered here; it is scored by the separate
# trainer.evaluate() call.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Step,Training Loss
500,1.5149
1000,1.2114
1500,1.0679
2000,0.9572
2500,0.8667
3000,0.7677
3500,0.7031
4000,0.6503
4500,0.6052
5000,0.5603


TrainOutput(global_step=6270, training_loss=0.8126618113053853, metrics={'train_runtime': 2027.3594, 'train_samples_per_second': 24.732, 'train_steps_per_second': 3.093, 'total_flos': 6548461930414080.0, 'train_loss': 0.8126618113053853, 'epoch': 10.0})

In [18]:
trainer.evaluate()

{'eval_loss': 1.062261700630188,
 'eval_runtime': 6.8526,
 'eval_samples_per_second': 82.304,
 'eval_steps_per_second': 10.361,
 'epoch': 10.0}

Save the fine-tuned model and tokenizer so the backend can load them.

In [None]:
# Model and tokenizer go to the same directory.
# NOTE(review): this notebook is titled "Solution 1" but saves under
# Solution_2/.../trained_sol2 — confirm the target path is intended.
export_dir = "./Solution_2/backend/trained_sol2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)