# Design

This is a decoder model that takes a roundness value as input and outputs a pseudoword corresponding to that roundness value.

In [1]:
from utils.pseudoword_generator import *
from dotenv import load_dotenv
import pandas as pd
import torch
import os


# Read environment variables from a local .env file; later cells look up
# VERSION through os.getenv (presumably defined there - confirm).
load_dotenv()

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single seed for the whole notebook. It is passed to the pandas sampling
# calls below and also seeds PyTorch's global RNG, so that randomly
# initialised layers and any torch-based shuffling/sampling are repeatable
# across "Restart & Run All".
state = 42
torch.manual_seed(state)

# Dataset

In [2]:
# Import dataset
#
# The CSV to load is selected by the VERSION environment variable.
# The lookup uses single quotes inside the double-quoted f-string: reusing
# the enclosing quote character in a replacement field is only legal from
# Python 3.12 onwards and is a SyntaxError on older interpreters.
data = pd.read_csv(f"datasets/japanese_pseudowords_{os.getenv('VERSION')}.csv")
data

Unnamed: 0,Pseudoword,Roundness
0,mepako,0.529904
1,bayo,0.572885
2,depe,0.505724
3,nushi,0.588515
4,poipaau,0.527272
...,...,...
9995,hipupasago,0.574199
9996,poniga,0.544438
9997,pubo,0.555564
9998,dadapa,0.567358


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the dataset.
data.describe()

Unnamed: 0,Roundness
count,10000.0
mean,0.541609
std,0.043833
min,0.388763
25%,0.51001
50%,0.540788
75%,0.571022
max,0.701714


In [4]:
# Split data into train (80%), validation (10%) and test (10%) sets

# Draw the training rows first; everything left over is the held-out pool.
trn = data.sample(frac=0.8, random_state=state)
holdout = data.drop(trn.index)

# Half of the held-out pool becomes validation, the other half test.
val = holdout.sample(frac=0.5, random_state=state)
tst = holdout.drop(val.index)

# Renumber each split from 0. This must happen only after the index-based
# drops above, which rely on the original row labels.
trn, val, tst = (part.reset_index(drop=True) for part in (trn, val, tst))

In [5]:
# Report how many rows ended up in each split.
print(
    f"Train set: {len(trn)} samples, "
    f"Validation set: {len(val)} samples, "
    f"Test set: {len(tst)} samples"
)

Train set: 8000 samples, Validation set: 1000 samples, Test set: 1000 samples


# Model

In [6]:
# Build the roundness -> pseudoword generator on top of a pretrained
# Japanese T5 checkpoint.
# NOTE(review): freeze_t5=False presumably means the T5 weights are
# fine-tuned together with the new layers, and hidden_dim/output_dim
# presumably size the layers that map the scalar roundness into T5's input
# space - confirm against RoundnessToTextModel.
model = RoundnessToTextModel(
    t5_model_name="sonoisa/t5-base-japanese",
    freeze_t5=False,
    hidden_dim=256,
    output_dim=768,
    # Reuse the device chosen in the setup cell instead of re-deriving it here.
    device=device,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# AdamW over every model parameter, starting at a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

# Step decay: the learning rate is multiplied by 0.6 after every 3 scheduler
# steps.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=3,
    gamma=0.6,
)

# Training

In [8]:
# Fit the model. Keyword arguments are grouped by role; `train` comes from
# utils.pseudoword_generator.
train(
    # What is being optimised, and how.
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    # Training split.
    trn_texts=trn["Pseudoword"],
    trn_roundness=trn["Roundness"],
    # Validation split.
    val_texts=val["Pseudoword"],
    val_roundness=val["Roundness"],
    # Test split.
    tst_texts=tst["Pseudoword"],
    tst_roundness=tst["Roundness"],
    # Batch size is capped at 100, or at the validation-set size if smaller.
    batch_size=min(len(val), 100),
    # NOTE(review): patience presumably drives early stopping on the
    # validation loss - confirm against `train`.
    epochs=100,
    patience=10,
)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch   1/100, Train Loss: 9.3923, Validation Loss: 2.6375, Best Val Loss: 2.6375
Epoch   2/100, Train Loss: 2.6761, Validation Loss: 2.5136, Best Val Loss: 2.5136
Epoch   3/100, Train Loss: 2.5932, Validation Loss: 2.4990, Best Val Loss: 2.4990
Epoch   4/100, Train Loss: 2.5511, Validation Loss: 2.4759, Best Val Loss: 2.4759
Epoch   5/100, Train Loss: 2.5249, Validation Loss: 2.4664, Best Val Loss: 2.4664
Epoch   6/100, Train Loss: 2.5036, Validation Loss: 2.4632, Best Val Loss: 2.4632
Epoch   7/100, Train Loss: 2.4761, Validation Loss: 2.4612, Best Val Loss: 2.4612
Epoch   8/100, Train Loss: 2.4567, Validation Loss: 2.4643, Best Val Loss: 2.4612
Epoch   9/100, Train Loss: 2.4414, Validation Loss: 2.4620, Best Val Loss: 2.4612
Epoch  10/100, Train Loss: 2.4144, Validation Loss: 2.4646, Best Val Loss: 2.4612
Epoch  11/100, Train Loss: 2.4034, Validation Loss: 2.4724, Best Val Loss: 2.4612
Epoch  12/100, Train Loss: 2.3908, Validation Loss: 2.4741, Best Val Loss: 2.4612
Epoch  13/100, T

# Testing

In [9]:
# Qualitative check: for 10 randomly drawn test rows, print the roundness
# value, the pseudoword stored in the dataset, and the word the model
# generates for that roundness.
# The draw uses the notebook-wide `state` seed rather than a second
# hard-coded 42, so changing the seed in one place affects every sampling step.
random_sample = tst.sample(n=10, random_state=state)
for _, row in random_sample.iterrows():
    roundness = row["Roundness"]
    print(f"Roundness Value : {roundness}")
    print(f"Original Word   : {row['Pseudoword']}")
    # The value is pulled into a local first so the f-string does not need
    # nested double quotes (a SyntaxError before Python 3.12).
    print(f"Predicted word  : {inference(model, roundness)}")
    print()

Roundness Value : 0.534043
Original Word   : kira
Predicted word  : petsuno

Roundness Value : 0.5391055
Original Word   : sedagu
Predicted word  : ribiroki

Roundness Value : 0.5906082
Original Word   : mahidazebo
Predicted word  : uzufushi

Roundness Value : 0.5985307
Original Word   : geyaha
Predicted word  : higii

Roundness Value : 0.51546913
Original Word   : shiwaba
Predicted word  : shihopizo

Roundness Value : 0.58633065
Original Word   : ozami
Predicted word  : wabui

Roundness Value : 0.4962537
Original Word   : pekazu
Predicted word  : pezoshiga

Roundness Value : 0.5527373
Original Word   : ige
Predicted word  : pimasuzo

Roundness Value : 0.59347355
Original Word   : boku
Predicted word  : mibayo

Roundness Value : 0.48744512
Original Word   : zuji
Predicted word  : yotogega



In [10]:
# Sweep the roundness input from 0.0 to 1.0 in steps of 0.1 and print the
# word generated at each point. The dataset only covers roughly 0.39-0.70
# (see the describe() output above), so the values near both ends lie
# outside the range seen in the data.
roundness_list = [tenth / 10 for tenth in range(11)]

for roundness in roundness_list:
    generated = inference(model, roundness)
    print(f"Roundness Value: {roundness}")
    print(f"Predicted word: {generated}")
    print()

Roundness Value: 0.0
Predicted word: zeneshi

Roundness Value: 0.1
Predicted word: nazezaha

Roundness Value: 0.2
Predicted word: nepatopege

Roundness Value: 0.3
Predicted word: roketadeta

Roundness Value: 0.4
Predicted word: kikapete

Roundness Value: 0.5
Predicted word: bekefume

Roundness Value: 0.6
Predicted word: dochio

Roundness Value: 0.7
Predicted word: bogua

Roundness Value: 0.8
Predicted word: zaho

Roundness Value: 0.9
Predicted word: maho

Roundness Value: 1.0
Predicted word: boruu



# Save and load model

In [11]:
# Persist the trained model under a name derived from the VERSION environment
# variable. Single quotes inside the f-string keep the cell valid on
# Python < 3.12, where reusing the enclosing double quote is a SyntaxError.
save_model(model, filename=f"pseudoword_generator_v0{os.getenv('VERSION')}.pth")

Model saved to outputs/pseudoword_generator_v03.1.pth


In [None]:
# Reload the saved checkpoint and rebind `model` to it. Single quotes inside
# the f-string keep the cell valid on Python < 3.12, where reusing the
# enclosing double quote is a SyntaxError.
model = load_model(filename=f"pseudoword_generator_v0{os.getenv('VERSION')}.pth")