**Solution 1**



*   Data: first half of the dataset
*   Model trained: GPT-Neo (`EleutherAI/gpt-neo-125M`)





In [None]:
!pip install datasets

In [None]:
pip show datasets

In [None]:
!pip install transformers[torch] accelerate -U

In [4]:
import transformers
import accelerate

# Record the library versions used for this run, one per line.
print(
    f"Transformers version: {transformers.__version__}",
    f"Accelerate version: {accelerate.__version__}",
    sep="\n",
)

Transformers version: 4.41.2
Accelerate version: 0.30.1


**Load dataset**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from transformers import GPTNeoForCausalLM
from datasets import load_dataset

# Load the "coai/plantuml_generation" dataset; each record is a single `text`
# field holding the prompt and the PlantUML answer.
dataset = load_dataset("coai/plantuml_generation")

In [6]:
# Display the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1940
    })
})

In [7]:
# (rows, columns) of the train split.
dataset["train"].shape

(1940, 1)

In [8]:
# Inspect one raw record to see the <s>[INST] ... [/INST] ... </s> layout.
dataset["train"][0]

{'text': '<s>[INST]\nFor the given description, generate \na Sequence diagram diagram using plantuml. \nDescription: Use Case Name: Patient Registration\nUse Case ID: HC-001\n\nUse Case Description:\nThis use case describes the process of registering a new patient in a healthcare system.\n\nUse Case Actors:\n1. Front desk staff\n2. Patient\n\nUse Case Triggers:\n- A new patient arrives at the healthcare facility and wants to register.\n\nUse Case Preconditions:\n- The patient has not been registered in the system before.\n- The front desk staff is available to assist the patient.\n\nUse Case Postconditions:\n- The patient\'s information is recorded in the healthcare system.\n- The patient is assigned a unique identification number.\n\nUse Case Flow:\n1. The patient approaches the front desk and expresses the intention to register.\n2. The front desk staff welcomes the patient and requests basic information such as name, date of birth, address, contact number, and insurance details.\n3.

**Pre-processing the data**

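Takes the first half of the dataset, which includes both textual descriptions and PlantUML code. Each record is split at the `[/INST]` tag into a description and its PlantUML code; only records whose code contains both `@startuml` and `@enduml` are kept (966 of the 1,940 rows).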

In [9]:
import re

# Accumulates one {'description', 'plantUML_code'} dict per accepted record.
processed_data_list = []

def preprocess_text(data):
    """Split one raw dataset record into a description and its PlantUML code.

    The record is expected to look like ``<s>[INST] prompt [/INST] answer </s>``.
    The ``<s>``/``</s>`` markers and the ``[INST]`` tag are removed, the prompt
    is cut at the first ``@startuml`` (if any), and both halves are stripped.

    Records whose answer does not contain both ``@startuml`` and ``@enduml``
    are skipped.

    Args:
        data: Raw ``text`` value of one dataset record.

    Returns:
        The ``{'description': ..., 'plantUML_code': ...}`` dict that was
        appended to ``processed_data_list``, or ``None`` if the record was
        skipped.

    Raises:
        ValueError: If ``data`` does not contain exactly one ``[/INST]`` tag.
    """
    text = re.sub(r'<s>|</s>', '', data)

    parts = text.split('[/INST]')
    if len(parts) != 2:
        raise ValueError("The text must contain exactly one [/INST] tag.")

    prompt, answer = parts

    # Keep only the natural-language part of the prompt: drop the tag and
    # anything from an embedded '@startuml' onwards.
    description = prompt.replace('[INST]', '').strip()
    description = description.split('@startuml', 1)[0].strip()

    plantuml_code = answer.strip()

    # Only answers that contain both diagram delimiters are usable.
    if '@startuml' not in plantuml_code or '@enduml' not in plantuml_code:
        return None

    processed_data = {
        'description': description,
        'plantUML_code': plantuml_code
    }
    processed_data_list.append(processed_data)
    return processed_data

In [10]:
# Start from an empty list so that re-running this cell rebuilds the parsed
# records instead of appending a second copy of every row.
processed_data_list.clear()
for example in dataset["train"]:
    preprocess_text(example['text'])

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# One row per parsed record; columns come from the record dict keys.
df = pd.DataFrame(data=processed_data_list)

In [12]:
# Inspect the parsed records (pandas truncates the display of long frames).
df

Unnamed: 0,description,plantUML_code
0,"For the given description, generate \na Sequen...","@startuml\nactor ""Front Desk Staff"" as FDS\nac..."
1,"For the given description, generate \na Usecas...",@startuml\nleft to right direction\nactor User...
2,"For the given description, generate \na Class ...",@startuml\nclass MobileApp {\n +name: String\...
3,"For the given description, generate \na Object...","@startuml\nobject User {\n username = ""JohnDo..."
4,"For the given description, generate \na State ...",@startuml\n[*] --> UserOpensApp\nUserOpensApp ...
...,...,...
962,"For the given description, generate \na Class ...",@startuml\nclass CustomsDeclaration {\n -decl...
963,"For the given description, generate \na Class ...",@startuml\nclass Researcher {\n +userId: Stri...
964,"For the given description, generate \na Class ...",@startuml\nclass CollaborativeResearchPlatform...
965,"For the given description, generate \na Compon...","@startuml\npackage ""Telemetry and Data Collect..."


Split the data into training and validation sets (90% / 10%)

In [13]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# identical across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

**Model training**

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# The tokenizer and the causal-LM weights are loaded from the same checkpoint.
checkpoint = 'EleutherAI/gpt-neo-125M'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPTNeoForCausalLM.from_pretrained(checkpoint)

In [15]:
def save_to_txt(df, filepath, columns=("description",)):
    """Write the selected columns of ``df`` to a plain-text training file.

    For every row, the values of ``columns`` are written on consecutive lines,
    in the given order, each followed by a newline.

    The file is written as UTF-8 explicitly so the result does not depend on
    the platform's default encoding and non-ASCII characters cannot fail.

    NOTE(review): with the default ``columns`` only the description is written,
    so a model trained on this file never sees any PlantUML code. Confirm this
    is intended; otherwise pass ``columns=("description", "plantUML_code")``.

    Args:
        df: DataFrame containing every column named in ``columns``.
        filepath: Destination path; an existing file is overwritten.
        columns: Column names to write for each row, in order.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            for column in columns:
                f.write(f"{row[column]}\n")

# Materialise both splits as the plain-text files the datasets below read.
for split_frame, split_path in ((train_df, 'train.txt'), (val_df, 'val.txt')):
    save_to_txt(split_frame, split_path)

In [16]:
# Both splits are tokenised the same way and cut into 256-token blocks.
# NOTE(review): TextDataset is deprecated in recent transformers releases;
# confirm it still exists before upgrading the library.
block_kwargs = {"tokenizer": tokenizer, "block_size": 256}

train_dataset = TextDataset(file_path="train.txt", **block_kwargs)
val_dataset = TextDataset(file_path="val.txt", **block_kwargs)

# mlm=False: batches are collated without the masked-language-modelling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [17]:
# Hyper-parameters for the fine-tuning run.
# per_device_train_batch_size=4 with gradient_accumulation_steps=2 gives an
# effective batch of 8 sequences per optimiser step on each device.
# NOTE(review): save_steps=10_000 exceeds the 3,260 steps of the recorded run,
# so no intermediate checkpoint was written during that training.
hyperparameters = dict(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**hyperparameters)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Left as the last expression so the notebook displays the TrainOutput summary.
trainer.train()

Step,Training Loss
500,1.4461
1000,1.1386
1500,0.953
2000,0.8439
2500,0.7414
3000,0.683


TrainOutput(global_step=3260, training_loss=0.9421455149270274, metrics={'train_runtime': 1065.2651, 'train_samples_per_second': 24.492, 'train_steps_per_second': 3.06, 'total_flos': 3402614253551616.0, 'train_loss': 0.9421455149270274, 'epoch': 9.98468606431853})

In [18]:
# Compute the loss on the held-out validation blocks (eval_dataset=val_dataset).
trainer.evaluate()

{'eval_loss': 1.442213773727417,
 'eval_runtime': 3.629,
 'eval_samples_per_second': 81.014,
 'eval_steps_per_second': 10.196,
 'epoch': 9.98468606431853}

Save the fine-tuned model and tokenizer so that the backend can load them

In [None]:
# Weights and tokenizer files go to the same folder so the backend can load
# both from a single path.
export_dir = "./Solution_1/backend/trained_sol1"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)