# Summary

This notebook is used to generate more data for the main model to use.

The model in this notebook is trained on `normalized.csv` to predict the roundness of pseudowords.

The original dataset (`normalized.csv`) contains only 124 rows, which is insufficient to train a large model like the ByT5-Pseudword-Generator. This model therefore learns to predict the roundness values of pseudowords and is then applied to a larger set of generated pseudowords, producing the pseudoword-roundness pairs that will be used to train the ByT5-Pseudword-Generator model.

In [1]:
from utils.roundness_determiner import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random
import torch
import json


# Notebook-wide configuration.
state = 42   # seed shared by every random number generator seeded below
VERSION = 3  # version tag embedded in the checkpoint filename when saving/loading


pd.set_option('display.max_columns', None)  # never elide columns when a frame is displayed
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed Python's RNG (used by the pseudoword sampler later in the notebook) and
# PyTorch's global RNG, so that torch-side randomness such as weight
# initialisation starts from a known state on every Restart & Run All.
random.seed(state)
torch.manual_seed(state)

  from .autonotebook import tqdm as notebook_tqdm


# Building and Training

## Dataset

In [2]:
# Load the normalized pseudoword/roundness table used for training and display it.
data = pd.read_csv("datasets/normalized.csv")
data

Unnamed: 0,Stimuli,ExperimentalRoundScore
0,bebi,0.815217
1,bibe,0.913043
2,bobou,0.815217
3,boubo,1.000000
4,chechi,0.184783
...,...,...
119,outou,0.347826
120,uku,0.239130
121,ulu,0.913043
122,umu,0.913043


In [3]:
# Summary statistics of the numeric column(s) in the training table.
data.describe()

Unnamed: 0,ExperimentalRoundScore
count,124.0
mean,0.562675
std,0.316366
min,0.0
25%,0.26087
50%,0.543478
75%,0.902174
max,1.0


## Model

In [4]:
# Instantiate the roundness model with its default constructor arguments.
# `roundness_determiner` is pulled in by the star import at the top of the notebook.
model = roundness_determiner()

In [5]:
# AdamW over all model parameters, initial learning rate 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
# Multiply the learning rate by 0.9 after every 10 scheduler steps
# (whether a "step" is an epoch or a batch depends on train_kfold — TODO confirm).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

## Training

In [6]:
# Train with k-fold cross-validation on the stimulus/score pairs.
# NOTE(review): a single model/optimizer/scheduler instance is passed in for all
# folds; confirm that train_kfold resets them between folds, otherwise later folds
# are validated on rows the weights have already been fitted to.
result = train_kfold(
    model=model,
    roundness=data["ExperimentalRoundScore"],
    texts=data["Stimuli"],
    batch_size=5,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=1000,  # the log prints "Epoch i/1000", so this is the per-fold epoch count
    patience=10,  # presumably early-stopping patience in epochs — verify in train_kfold
    k=12,  # the log prints "Fold 1/12", so this is the number of folds
)


Fold 1/12
Epoch    1/1000 | Train Loss: 0.8974 | Val Loss: 0.7281 | Best Val: inf
Epoch    2/1000 | Train Loss: 0.8088 | Val Loss: 0.7142 | Best Val: 0.7281
Epoch    3/1000 | Train Loss: 0.7531 | Val Loss: 0.7030 | Best Val: 0.7142
Epoch    4/1000 | Train Loss: 0.7170 | Val Loss: 0.6768 | Best Val: 0.7030
Epoch    5/1000 | Train Loss: 0.6966 | Val Loss: 0.6566 | Best Val: 0.6768
Epoch    6/1000 | Train Loss: 0.6795 | Val Loss: 0.6538 | Best Val: 0.6566
Epoch    7/1000 | Train Loss: 0.6607 | Val Loss: 0.6630 | Best Val: 0.6538
Epoch    8/1000 | Train Loss: 0.6396 | Val Loss: 0.6707 | Best Val: 0.6538
Epoch    9/1000 | Train Loss: 0.6316 | Val Loss: 0.6688 | Best Val: 0.6538
Epoch   10/1000 | Train Loss: 0.6281 | Val Loss: 0.6652 | Best Val: 0.6538
Epoch   11/1000 | Train Loss: 0.6161 | Val Loss: 0.6691 | Best Val: 0.6538
Epoch   12/1000 | Train Loss: 0.6126 | Val Loss: 0.6701 | Best Val: 0.6538
Epoch   13/1000 | Train Loss: 0.6103 | Val Loss: 0.6701 | Best Val: 0.6538
Epoch   14/1000 |

## Testing

In [7]:
# Sanity check on the classic bouba/kiki pair: "bouba" should score rounder than "kiki".
word_list = ["bouba", "kiki"]
model.inference(word_list)

array([0.7830832 , 0.27874008], dtype=float32)

In [8]:
# Second sanity check with the maluma/takete pair.
# NOTE(review): "maluma" is conventionally the round-sounding word of this pair;
# if the two scores come out close together, the model may not generalise to it — verify.
word_list = ["maluma", "takete"]
model.inference(word_list)

array([0.34154534, 0.30129448], dtype=float32)

## Saving the model

In [9]:
# Persist the trained model under a version-tagged filename.
# "outputs/" is a plain literal: it has no placeholders, so no f-prefix is needed.
save_model(
    model=model,
    directory="outputs/",
    filename=f"roundness_determiner_v0{VERSION}.pth",
)

Model saved to outputs/roundness_determiner_v03.pth


# Loading and using the model

## Loading the model

In [10]:
# Reload the checkpoint saved above into `model` (rebinding the name that held the
# in-memory trained instance) so the following cells exercise the save/load round trip.
model = load_model(directory="outputs/", filename=f"roundness_determiner_v0{VERSION}.pth")

Model loaded from outputs/roundness_determiner_v03.pth


In [11]:
# Repeat the bouba/kiki check on the reloaded model; scores should match the pre-save run.
word_list = ["bouba", "kiki"]
model.inference(word_list)

array([0.78308326, 0.27874017], dtype=float32)

In [12]:
# Repeat the maluma/takete check on the reloaded model; scores should match the pre-save run.
word_list = ["maluma", "takete"]
model.inference(word_list)

array([0.34154546, 0.30129448], dtype=float32)

## Importing data

In [13]:
# Import data
# The keys of this mapping are the building blocks that get concatenated into
# pseudowords below. The context manager closes the file handle as soon as the
# JSON has been parsed instead of leaving it to the garbage collector.
with open("datasets/words.json", encoding="utf-8") as words_file:
    data = json.load(words_file)

# Function to generate a random string from data
def generate_random_string(data, min_len=2, max_len=5):
    """Build one pseudoword by concatenating randomly chosen keys of ``data``.

    Args:
        data: Mapping whose keys are the candidate fragments.
        min_len: Smallest number of fragments to join (inclusive).
        max_len: Largest number of fragments to join (inclusive).

    Returns:
        The fragments joined into a single string. Fragments are sampled with
        replacement, so the same key may appear more than once.
    """
    n_parts = random.randint(min_len, max_len)
    candidates = list(data.keys())
    parts = random.choices(candidates, k=n_parts)
    return ''.join(parts)

# Generate 10000 unique strings.
# The set is only used for de-duplication; the order in which strings were first
# generated is recorded separately. Iterating a set of str follows hash order,
# which changes between interpreter sessions (string hashing is randomised unless
# PYTHONHASHSEED is fixed), so building the frame from the set would give a
# different row order on every kernel restart even with random.seed() set.
unique_strings = set()
ordered_strings = []
while len(unique_strings) < 10000:
    candidate = generate_random_string(data)
    if candidate not in unique_strings:
        unique_strings.add(candidate)
        ordered_strings.append(candidate)

# Convert to DataFrame (rows in generation order)
data = pd.DataFrame(ordered_strings, columns=['Pseudoword'])
data

Unnamed: 0,Pseudoword
0,kineratote
1,rutsusajibo
2,neza
3,poutano
4,uyoshipasu
...,...
9995,gechigapaba
9996,ragi
9997,mugugo
9998,rureto


## Applying model

In [14]:
# Score every generated pseudoword with the reloaded model and store the
# predictions in a new "Roundness" column, then display the frame.
data["Roundness"] = model.inference(data["Pseudoword"].to_list())
data

Unnamed: 0,Pseudoword,Roundness
0,kineratote,0.150207
1,rutsusajibo,0.110635
2,neza,0.468633
3,poutano,0.326936
4,uyoshipasu,0.131503
...,...,...
9995,gechigapaba,0.077712
9996,ragi,0.351221
9997,mugugo,0.234996
9998,rureto,0.280292


In [15]:
# Summary statistics of the predicted roundness scores.
# NOTE(review): in the saved outputs the mean prediction (~0.26) is well below the
# training-set mean (~0.56). Possibly the generated strings differ in length or
# composition from the training stimuli — verify before using these labels.
data.describe()

Unnamed: 0,Roundness
count,10000.0
mean,0.259523
std,0.196518
min,0.038889
25%,0.090111
50%,0.248405
75%,0.345424
max,0.860149


## Saving CSV

In [16]:
# Write the pseudoword/roundness pairs to CSV without the DataFrame index column.
data.to_csv("datasets/japanese_pseudowords.csv", index=False)