# Design

This is a decoder model that takes a roundness value as input and outputs a pseudoword corresponding to that value.

In [1]:
from utils.pseudoword_generator import *
from dotenv import load_dotenv
import pandas as pd
import torch
import os


# Notebook-wide setup: environment variables, display options, device and seed.
load_dotenv()  # pull variables from a local .env file (GEN is read later via os.getenv)
pd.set_option('display.max_columns', None)  # show every column when displaying frames
device = "cuda" if torch.cuda.is_available() else "cpu"
state = 42
# `state` is passed to the pandas sampling calls below; also seed torch so that
# weight initialisation and any torch-side randomness start from the same point
# on every Restart & Run All. (This alone does not guarantee bit-identical GPU runs.)
torch.manual_seed(state)

# Dataset

In [2]:
# Load the pseudoword dataset; the cells below use its "Pseudoword" and
# "Roundness" columns. The path has no placeholders, so a plain string literal
# is used instead of an f-string.
data = pd.read_csv("datasets/japanese_pseudowords.csv")
data

Unnamed: 0,Pseudoword,Roundness
0,irepeo,0.481854
1,bea,0.562286
2,kiko,0.371240
3,tsupihamumo,0.235215
4,koke,0.212680
...,...,...
9995,tademunoo,0.782394
9996,tsujidenubo,0.339519
9997,musa,0.517814
9998,sateihemu,0.492699


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,Roundness
count,10000.0
mean,0.500261
std,0.169169
min,0.172221
25%,0.372915
50%,0.494395
75%,0.613485
max,0.891836


In [4]:
# Partition the dataset 80 / 10 / 10 into train, validation and test frames.
# The training rows are sampled first; the remaining rows are then halved.
trn = data.sample(frac=0.8, random_state=state)
holdout = data.drop(trn.index)
val = holdout.sample(frac=0.5, random_state=state)
tst = holdout.drop(val.index)

# Give each split a fresh 0..n-1 index, discarding the original row labels.
trn, val, tst = (part.reset_index(drop=True) for part in (trn, val, tst))

In [5]:
# Report the size of each split as a sanity check on the 80/10/10 partition
print(f"Train set: {len(trn)} samples, Validation set: {len(val)} samples, Test set: {len(tst)} samples")

Train set: 8000 samples, Validation set: 1000 samples, Test set: 1000 samples


# Model

In [6]:
# Build the roundness-to-text model on top of the "sonoisa/t5-base-japanese"
# T5 checkpoint. The device is taken from the `device` variable chosen in the
# setup cell instead of being re-derived here, so the notebook has a single
# place that decides between CUDA and CPU.
# NOTE(review): freeze_t5=False presumably leaves the T5 weights trainable —
# confirm against RoundnessToTextModel in utils.pseudoword_generator.
model = RoundnessToTextModel(
    t5_model_name="sonoisa/t5-base-japanese",
    freeze_t5=False,
    hidden_dim=1024,
    output_dim=768,
    device=device,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# AdamW over all model parameters, with the learning rate multiplied by 0.6
# after every 3 scheduler steps.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=3,
    gamma=0.6,
)

# Training

In [8]:
# Run the training loop from utils.pseudoword_generator on the three splits.
# NOTE(review): `patience` is presumably the early-stopping patience in epochs
# and the test split is presumably only used for a final evaluation — confirm
# against the implementation of train().
train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    # Conditioning inputs: one roundness value per sample
    trn_roundness=trn["Roundness"],
    val_roundness=val["Roundness"],
    tst_roundness=tst["Roundness"],
    # Target outputs: the pseudoword for each sample
    trn_texts=trn["Pseudoword"],
    val_texts=val["Pseudoword"],
    tst_texts=tst["Pseudoword"],
    # Batch size is capped at 1000 and never exceeds the validation-set size
    batch_size=min(len(val), 1000),
    epochs=100,
    patience=10,
)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch   1/100, Train Loss: 18.8334, Validation Loss: 15.1789, Best Val Loss: 15.1789
Epoch   2/100, Train Loss: 14.8226, Validation Loss: 13.9091, Best Val Loss: 13.9091
Epoch   3/100, Train Loss: 13.2584, Validation Loss: 12.0886, Best Val Loss: 12.0886
Epoch   4/100, Train Loss: 11.4614, Validation Loss: 10.2595, Best Val Loss: 10.2595
Epoch   5/100, Train Loss: 9.0821, Validation Loss: 6.7239, Best Val Loss: 6.7239
Epoch   6/100, Train Loss: 5.3358, Validation Loss: 3.0156, Best Val Loss: 3.0156
Epoch   7/100, Train Loss: 3.4260, Validation Loss: 2.8728, Best Val Loss: 2.8728
Epoch   8/100, Train Loss: 3.2642, Validation Loss: 2.7708, Best Val Loss: 2.7708
Epoch   9/100, Train Loss: 3.0557, Validation Loss: 2.7294, Best Val Loss: 2.7294
Epoch  10/100, Train Loss: 2.9575, Validation Loss: 2.7125, Best Val Loss: 2.7125
Epoch  11/100, Train Loss: 2.8975, Validation Loss: 2.6951, Best Val Loss: 2.6951
Epoch  12/100, Train Loss: 2.8484, Validation Loss: 2.6778, Best Val Loss: 2.6778
Epoc

# Testing

In [9]:
# Compare the generated pseudoword with the original one for ten random test rows.
# The sample is seeded with the notebook-wide `state` instead of a second
# hard-coded 42, so the seed is defined in exactly one place.
random_sample = tst.sample(n=10, random_state=state)
for _, row in random_sample.iterrows():
    roundness = row["Roundness"]
    print(f"Roundness Value : {roundness}")
    print(f"Original Word   : {row['Pseudoword']}")
    # The value is looked up outside the f-string: reusing double quotes inside
    # a double-quoted f-string is only valid syntax from Python 3.12 onwards.
    print(f"Predicted word  : {inference(model, roundness)}")
    print()

Roundness Value : 0.2795007
Original Word   : koshi
Predicted word  : nnupa

Roundness Value : 0.42155555
Original Word   : mifufude
Predicted word  : pubabafu

Roundness Value : 0.7146631
Original Word   : ize
Predicted word  : neeshi

Roundness Value : 0.6702508
Original Word   : rapueroi
Predicted word  : gurebo

Roundness Value : 0.6265058
Original Word   : oge
Predicted word  : mipako

Roundness Value : 0.6067172
Original Word   : daruusuku
Predicted word  : migatsu

Roundness Value : 0.40251696
Original Word   : mote
Predicted word  : mojiro

Roundness Value : 0.8509308
Original Word   : yabodoo
Predicted word  : kie

Roundness Value : 0.64203817
Original Word   : jiso
Predicted word  : ozo

Roundness Value : 0.75613195
Original Word   : oeido
Predicted word  : tsuku



In [10]:
# Sweep the roundness input from 0.0 to 1.0 in steps of 0.1 and show the
# pseudoword generated for each value.
roundness_list = [step / 10 for step in range(11)]

for roundness in roundness_list:
    print(f"Roundness Value: {roundness}")
    print(f"Predicted word: {inference(model, roundness)}")
    print()

Roundness Value: 0.0
Predicted word: keguchifu

Roundness Value: 0.1
Predicted word: wamu

Roundness Value: 0.2
Predicted word: kuhe

Roundness Value: 0.3
Predicted word: remitsumoba

Roundness Value: 0.4
Predicted word: kajihe

Roundness Value: 0.5
Predicted word: gebayo

Roundness Value: 0.6
Predicted word: noeya

Roundness Value: 0.7
Predicted word: nadesu

Roundness Value: 0.8
Predicted word: tahodo

Roundness Value: 0.9
Predicted word: genodo

Roundness Value: 1.0
Predicted word: nitano



# Save and load model

In [11]:
# Save the trained model; the GEN environment variable supplies the version
# suffix of the file name.
gen = os.getenv("GEN")
if gen is None:
    # Without this guard the file would silently be named "...v0None.pth".
    raise RuntimeError("Environment variable GEN is not set; cannot build the model filename.")
# `gen` is interpolated from a local variable because nesting double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
save_model(model, filename=f"pseudoword_generator_v0{gen}.pth")

Model saved to outputs/pseudoword_generator_v03.2.pth


In [None]:
# Reload the model from the file name built from the GEN environment variable
# (the same naming scheme used when saving).
gen = os.getenv("GEN")
if gen is None:
    # Without this guard the lookup would silently target "...v0None.pth".
    raise RuntimeError("Environment variable GEN is not set; cannot build the model filename.")
# `gen` is interpolated from a local variable because nesting double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
model = load_model(filename=f"pseudoword_generator_v0{gen}.pth")