In [1]:
import warnings
warnings.filterwarnings("ignore")


In [2]:
!pip install datasets



In [3]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv("expressions_dataset.csv")

N = 4
df_duplicated = pd.concat([df] * N, ignore_index=True)

dataset = Dataset.from_pandas(df_duplicated)

dataset_split = dataset.train_test_split(test_size=0.1)
dataset=dataset_split

print(f"Original dataset size: {len(df)}")
print(f"Duplicated dataset size: {len(df_duplicated)}")
print(f"Training set size: {len(dataset_split['train'])}")
print(f"Validation set size: {len(dataset_split['test'])}")


Original dataset size: 648
Duplicated dataset size: 2592
Training set size: 2332
Validation set size: 260


In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
def tokenize_function(examples):

    combined_text = [f"{expr} | {sent}" for expr, sent in zip(examples["expression"], examples["sentence"])]

    tokenized = tokenizer(combined_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["expression", "sentence"])

Map:   0%|          | 0/2332 [00:00<?, ? examples/s]

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

In [6]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    report_to="none",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    do_eval=True,
    eval_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.1522
200,0.2998
300,0.1909
400,0.159
500,0.142
600,0.1316
700,0.1184
800,0.1419
900,0.1104
1000,0.1007


TrainOutput(global_step=2915, training_loss=0.19556174693344797, metrics={'train_runtime': 1358.7222, 'train_samples_per_second': 8.582, 'train_steps_per_second': 2.145, 'total_flos': 2707162524549120.0, 'train_loss': 0.19556174693344797, 'epoch': 5.0})

In [7]:
model.save_pretrained("./gpt2-finetuned1")
tokenizer.save_pretrained("./gpt2-finetuned1")

('./gpt2-finetuned1/tokenizer_config.json',
 './gpt2-finetuned1/special_tokens_map.json',
 './gpt2-finetuned1/vocab.json',
 './gpt2-finetuned1/merges.txt',
 './gpt2-finetuned1/added_tokens.json')

In [11]:
def generate_sentence(expression):
    input_text = f"{expression} |"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True)

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_sentence = generated_text.split("|")[-1].strip()
    return generated_sentence


In [12]:
expression = "L[i, j] = L[i − 1, j − 1] + 1"
generated_sentence = generate_sentence(expression)
print(generated_sentence)

If the first-dimensional variable is 1 less than the second-dimensional variable, the value is 1 more than the value at the position where the first-dimensional variable is 1 less and the second-dimensional variable is 1 less.


In [13]:
expression = "L[i, j, k] = 0 "
generated_sentence = generate_sentence(expression)
print(generated_sentence)

If the first-dimensional variable is 0 or the second-dimensional variable is 0 or the third-dimensional variable is 0, the value is 0.


In [14]:
expression = "L[i, j, k] = 9 whenever i == 0 or j == 0 or k == 0"
generated_sentence = generate_sentence(expression)
print(generated_sentence)

If the first-dimensional variable is 0 or the second-dimensional variable is 0 or the third-dimensional variable is 0, the value is 9.


In [15]:
expression = "L[i, j, k] = 9 whenever i == 1 or j == 10"
generated_sentence = generate_sentence(expression)
print(generated_sentence)

If the first-dimensional variable is 1 or the second-dimensional variable is 10 or the third-dimensional variable is 11, the value at the position where the first-dimensional variable minus 1, the second-dimensional variable minus 2, and the third-dimensional variable minus 3 is 9.
