# Design

This is a decoder model that takes a roundness value as input and outputs a pseudoword corresponding to that value.

In [12]:
from utils.pseudoword_generator import *
import pandas as pd
import torch

# Notebook-wide configuration: display options, compute device, seed, version tag.

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single seed shared by the pandas sampling calls below and by torch.
state = 42
# Seed torch's random number generator so runs are repeatable from a fresh
# kernel; without this only the pandas train/val/test split was seeded.
torch.manual_seed(state)

# Version tag interpolated into the checkpoint filename when saving/loading.
VERSION = "3.0"

# Dataset

In [2]:
# Load the pseudoword / roundness dataset from CSV and preview it.

DATASET_CSV = 'datasets/japanese_pseudowords.csv'
data = pd.read_csv(DATASET_CSV)
data

Unnamed: 0,Pseudoword,Roundness
0,irepeo,0.481853
1,bea,0.562284
2,kiko,0.371239
3,tsupihamumo,0.235215
4,koke,0.212680
...,...,...
9995,tademunoo,0.782395
9996,tsujidenubo,0.339519
9997,musa,0.517813
9998,sateihemu,0.492698


In [3]:
# Summary statistics of the numeric columns (count, mean, std, quartiles, min/max).
data.describe()

Unnamed: 0,Roundness
count,10000.0
mean,0.500261
std,0.169169
min,0.172221
25%,0.372915
50%,0.494394
75%,0.613484
max,0.891836


In [4]:
# Split data into train (80%), validation (10%) and test (10%) sets.
# The original row labels are kept until every subset has been carved out,
# because the validation and test sets are defined by dropping those labels.

trn_rows = data.sample(frac=0.8, random_state=state)
held_out = data.drop(trn_rows.index)
val_rows = held_out.sample(frac=0.5, random_state=state)
tst_rows = held_out.drop(val_rows.index)

# Renumber each subset from 0 so positional and label indexing agree.
trn = trn_rows.reset_index(drop=True)
val = val_rows.reset_index(drop=True)
tst = tst_rows.reset_index(drop=True)

In [5]:
# Sanity check: report how many rows ended up in each split.
print(f"Train set: {len(trn)} samples, Validation set: {len(val)} samples, Test set: {len(tst)} samples")

Train set: 8000 samples, Validation set: 1000 samples, Test set: 1000 samples


# Model

In [6]:
# Build the roundness-to-text model on top of a pretrained Japanese T5
# checkpoint. `freeze_t5=False` is passed so the T5 weights are not frozen
# (presumably they are fine-tuned along with the rest — confirm in
# utils.pseudoword_generator).
# NOTE(review): hidden_dim/output_dim semantics live in RoundnessToTextModel;
# 768 presumably matches the T5-base embedding width — confirm.
model = RoundnessToTextModel(
    t5_model_name="sonoisa/t5-base-japanese",
    freeze_t5=False,
    hidden_dim=256,
    output_dim=768,
    # Reuse the device chosen in the configuration cell instead of repeating
    # the CUDA-availability check here.
    device=device,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Optimisation hyperparameters.
LEARNING_RATE = 0.001
LR_STEP_SIZE = 3   # number of scheduler steps between learning-rate decays
LR_GAMMA = 0.6     # multiplicative decay factor applied at each decay

# AdamW over all model parameters, with a step-wise decaying learning rate.
# NOTE(review): whether the scheduler is stepped per epoch or per batch is
# decided inside train() — confirm there.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=LR_STEP_SIZE,
    gamma=LR_GAMMA,
)

# Training

In [8]:
# Batch size is 100, or the validation-set size if that is smaller.
batch_size = min(len(val), 100)

# Train for at most 100 epochs with a patience of 10.
# NOTE(review): the test split is also handed to train(); how it is used
# there is not visible from this notebook — confirm it is evaluation only.
train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    # Training split
    trn_texts=trn["Pseudoword"],
    trn_roundness=trn["Roundness"],
    # Validation split
    val_texts=val["Pseudoword"],
    val_roundness=val["Roundness"],
    # Test split
    tst_texts=tst["Pseudoword"],
    tst_roundness=tst["Roundness"],
    batch_size=batch_size,
    epochs=100,
    patience=10,
)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch   1/100, Train Loss: 9.3327, Validation Loss: 2.7575, Best Val Loss: 2.7575
Epoch   2/100, Train Loss: 2.6850, Validation Loss: 2.5908, Best Val Loss: 2.5908
Epoch   3/100, Train Loss: 2.5870, Validation Loss: 2.5715, Best Val Loss: 2.5715
Epoch   4/100, Train Loss: 2.5434, Validation Loss: 2.5548, Best Val Loss: 2.5548
Epoch   5/100, Train Loss: 2.5191, Validation Loss: 2.5587, Best Val Loss: 2.5548
Epoch   6/100, Train Loss: 2.4919, Validation Loss: 2.5595, Best Val Loss: 2.5548
Epoch   7/100, Train Loss: 2.4683, Validation Loss: 2.5672, Best Val Loss: 2.5548
Epoch   8/100, Train Loss: 2.4498, Validation Loss: 2.5784, Best Val Loss: 2.5548
Epoch   9/100, Train Loss: 2.4319, Validation Loss: 2.5838, Best Val Loss: 2.5548
Epoch  10/100, Train Loss: 2.4090, Validation Loss: 2.5771, Best Val Loss: 2.5548
Epoch  11/100, Train Loss: 2.3930, Validation Loss: 2.5838, Best Val Loss: 2.5548
Epoch  12/100, Train Loss: 2.3787, Validation Loss: 2.5962, Best Val Loss: 2.5548
Epoch  13/100, T

# Testing

In [9]:
# Qualitative check: for 10 random test rows, print the roundness value, the
# dataset's pseudoword, and the pseudoword the model generates for that value.
# The sample is drawn with the notebook-wide seed so the same rows are shown
# on every run.
random_sample = tst.sample(n=10, random_state=state)
for _, row in random_sample.iterrows():
    roundness = row["Roundness"]
    print(f"Roundness Value : {roundness}")
    print(f"Original Word   : {row['Pseudoword']}")
    # The value is bound to a local first: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    print(f"Predicted word  : {inference(model, roundness)}")
    print()

Roundness Value : 0.27949998
Original Word   : koshi
Predicted word  : meyu

Roundness Value : 0.42155546
Original Word   : mifufude
Predicted word  : nusateme

Roundness Value : 0.7146631
Original Word   : ize
Predicted word  : ganodadega

Roundness Value : 0.6702505
Original Word   : rapueroi
Predicted word  : hiyozu

Roundness Value : 0.6265055
Original Word   : oge
Predicted word  : guda

Roundness Value : 0.6067166
Original Word   : daruusuku
Predicted word  : zuretsuze

Roundness Value : 0.4025172
Original Word   : mote
Predicted word  : geze

Roundness Value : 0.8509306
Original Word   : yabodoo
Predicted word  : ruyogamo

Roundness Value : 0.6420378
Original Word   : jiso
Predicted word  : zuzeba

Roundness Value : 0.7561318
Original Word   : oeido
Predicted word  : homumumu



In [10]:
# Sweep roundness from 0.0 to 1.0 in steps of 0.1 and print the generated
# pseudoword for each value. The dataset's roundness spans roughly 0.17-0.89
# (see the describe() output), so the extreme values fall outside that range.
roundness_list = [tenth / 10 for tenth in range(11)]

for roundness in roundness_list:
    print(f"Roundness Value: {roundness}")
    print(f"Predicted word: {inference(model, roundness)}")
    print()

Roundness Value: 0.0
Predicted word: yabekote

Roundness Value: 0.1
Predicted word: ruzupo

Roundness Value: 0.2
Predicted word: kigutodetsusu

Roundness Value: 0.3
Predicted word: pegi

Roundness Value: 0.4
Predicted word: rosako

Roundness Value: 0.5
Predicted word: risopa

Roundness Value: 0.6
Predicted word: pupusezebu

Roundness Value: 0.7
Predicted word: yaga

Roundness Value: 0.8
Predicted word: rigane

Roundness Value: 0.9
Predicted word: zonore

Roundness Value: 1.0
Predicted word: uhifuyo



# Save and load model

In [13]:
# Persist the trained model under a version-tagged filename.
# With VERSION = "3.0" this resolves to "pseudoword_generator_v03.0.pth".
checkpoint_name = f"pseudoword_generator_v0{VERSION}.pth"
save_model(model, filename=checkpoint_name)

Model saved to outputs/pseudoword_generator_v03.0.pth


In [None]:
# Reload the model from the version-tagged checkpoint, replacing the
# in-memory `model`.
# NOTE(review): load_model presumably resolves the filename against the same
# directory save_model writes to — confirm in utils.pseudoword_generator.
checkpoint_name = f"pseudoword_generator_v0{VERSION}.pth"
model = load_model(filename=checkpoint_name)