# Summary

This notebook is used to generate more data for the main model to use.

The model in this notebook is trained on `normalized.csv` to predict the roundness of pseudowords.

The original dataset (`normalized.csv`) contains only 124 rows, which is insufficient to train a large model like the ByT5-Pseudword-Generator. This model therefore learns to predict the roundness values of pseudowords and is then applied to a larger set of pseudowords, producing the pseudoword-roundness pairs that will be used to train the ByT5-Pseudword-Generator model.

In [1]:
from utils.roundness_determiner import *
from dotenv import load_dotenv
import pandas as pd
import random
import torch
import json
import os


# Load environment variables (the DATA variable is read later via os.getenv
# to build the checkpoint filename).
load_dotenv()
state = 42


pd.set_option('display.max_columns', None)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every RNG this notebook relies on so a fresh "Restart & Run All" is
# repeatable: Python's `random` is used by the pseudoword generator, and torch
# is seeded as well so torch-side randomness in model construction and
# training starts from the same state.
random.seed(state)
torch.manual_seed(state)

# Building and Training

## Dataset

In [2]:
# Load the pseudoword/roundness table that the model is trained on below;
# the bare `data` expression displays it.
data = pd.read_csv("datasets/normalized.csv")
data

Unnamed: 0,Stimuli,ExperimentalRoundScore
0,bebi,0.815217
1,bibe,0.913043
2,bobou,0.815217
3,boubo,1.000000
4,chechi,0.184783
...,...,...
119,outou,0.347826
120,uku,0.239130
121,ulu,0.913043
122,umu,0.913043


In [3]:
# Summary statistics of the numeric column(s) of the training table.
data.describe()

Unnamed: 0,ExperimentalRoundScore
count,124.0
mean,0.562675
std,0.316366
min,0.0
25%,0.26087
50%,0.543478
75%,0.902174
max,1.0


## Model

In [4]:
# Identifier of the pretrained base model; passed to RoundnessDeterminerBERT
# here and to load_model in the loading section.
model_name = "bert-base-uncased"

In [None]:
# Instantiate the roundness model on top of the base model named above.
# NOTE(review): hidden_size=768 presumably has to match the base model's
# hidden width, and freeze_base=True presumably keeps the base weights fixed
# during training — confirm against utils.roundness_determiner.
model = RoundnessDeterminerBERT(
    model_name=model_name,
    hidden_size=768,
    freeze_base=True,
)

In [6]:
# AdamW over all parameters returned by model.parameters(), initial lr 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
# Multiply the learning rate by 0.9 after every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

## Training

In [None]:
# Train the model on the (Stimuli, ExperimentalRoundScore) pairs using the
# project's train_kfold routine.
# NOTE(review): k=4 is presumably the number of folds and patience=10 an
# early-stopping window that cuts the 1000-epoch budget short — confirm
# against train_kfold's definition.
result = train_kfold(
    model=model,
    roundness=data["ExperimentalRoundScore"],
    texts=data["Stimuli"],
    batch_size=5,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=1000,
    patience=10,
    k=4,
)

## Testing

In [8]:
# Spot-check the trained model on the pair "bouba" / "kiki".
word_list = ["bouba", "kiki"]
model.inference(word_list)

array([0.43793532, 0.2597781 ], dtype=float32)

In [9]:
# Spot-check the trained model on the pair "maluma" / "takete".
word_list = ["maluma", "takete"]
model.inference(word_list)

array([0.5786823 , 0.28268924], dtype=float32)

In [6]:
# Evaluate the model on normalized.csv (the same table it was trained on, so
# these errors describe fit rather than generalisation).
data = pd.read_csv("datasets/normalized.csv")
predicted_scores = model.inference(data["Stimuli"].tolist())
data["PredictedRoundness"] = predicted_scores
# Absolute difference between the experimental and the predicted score.
data["Error"] = (data["ExperimentalRoundScore"] - data["PredictedRoundness"]).abs()
data

Unnamed: 0,Stimuli,ExperimentalRoundScore,PredictedRoundness,Error
0,bebi,0.815217,0.631233,0.183985
1,bibe,0.913043,0.697449,0.215594
2,bobou,0.815217,0.653878,0.161340
3,boubo,1.000000,0.810818,0.189182
4,chechi,0.184783,0.225316,0.040533
...,...,...,...,...
119,outou,0.347826,0.415848,0.068022
120,uku,0.239130,0.420581,0.181450
121,ulu,0.913043,0.860608,0.052436
122,umu,0.913043,0.870252,0.042792


In [7]:
# Summary statistics of the experimental scores, predictions and absolute errors.
data.describe()

Unnamed: 0,ExperimentalRoundScore,PredictedRoundness,Error
count,124.0,124.0,124.0
mean,0.562675,0.567133,0.159261
std,0.316366,0.212925,0.110654
min,0.0,0.184278,0.000505
25%,0.26087,0.395072,0.067931
50%,0.543478,0.57351,0.134325
75%,0.902174,0.7737,0.229565
max,1.0,0.884478,0.475319


## Saving the model

In [None]:
# Persist the trained model under outputs/. The version suffix in the filename
# comes from the DATA environment variable.
# Single quotes are used inside the f-string: reusing the outer double quotes
# is a SyntaxError on Python < 3.12, and this spelling matches the filename
# expression used when the model is loaded again.
save_model(
    model=model,
    directory="outputs/",
    filename=f"roundness_determiner_v0{os.getenv('DATA')}.pth",
)

# Loading and using the model

## Loading the model

In [5]:
# Reload the model from outputs/, using the same filename pattern as the save
# cell (version suffix taken from the DATA environment variable).
model = load_model(directory="outputs/", filename=f"roundness_determiner_v0{os.getenv('DATA')}.pth", model_name=model_name)

Model loaded from outputs/roundness_determiner_v03.0.pth


In [None]:
# Spot-check the reloaded model on the pair "bouba" / "kiki".
word_list = ["bouba", "kiki"]
model.inference(word_list)

In [None]:
# Spot-check the reloaded model on the pair "maluma" / "takete".
word_list = ["maluma", "takete"]
model.inference(word_list)

## Importing data

In [None]:
# Import data
# A context manager closes the file handle deterministically, and the file is
# read as UTF-8 (the JSON interchange encoding) regardless of platform default.
with open("datasets/words.json", encoding="utf-8") as words_file:
    data = json.load(words_file)

# Function to generate a random string from data
def generate_random_string(data, min_len=2, max_len=5):
    """Build one random string by concatenating randomly chosen keys of ``data``.

    Args:
        data: Mapping whose keys are the building blocks to sample from.
        min_len: Smallest number of keys to concatenate (inclusive).
        max_len: Largest number of keys to concatenate (inclusive).

    Returns:
        The concatenation of the sampled keys; keys are drawn with
        replacement, so the same key may appear more than once.
    """
    piece_count = random.randint(min_len, max_len)
    candidates = [*data.keys()]
    pieces = random.choices(candidates, k=piece_count)
    return ''.join(pieces)

# Generate 10,000 unique strings.
# A dict is used as an insertion-ordered set: iterating a `set` of str depends
# on per-process hash randomization, so the row order of the resulting frame
# would change between kernel restarts even with a fixed random seed.
n_pseudowords = 10_000
unique_strings = {}
while len(unique_strings) < n_pseudowords:
    unique_strings[generate_random_string(data)] = None

# Convert to DataFrame (rows are in generation order)
data = pd.DataFrame(list(unique_strings), columns=['Pseudoword'])
data

## Applying model

In [10]:
# Load the pseudoword list the model is applied to.
# NOTE(review): this rebinds `data`, discarding the frame generated in the
# previous section, and no visible cell writes the generated strings to this
# path before it is read — confirm the file exists on a fresh run.
data = pd.read_csv("datasets/japanese_pseudowords.csv")

In [11]:
# Predict a roundness score for every pseudoword and store it in the
# "Roundness" column; the bare `data` displays the result.
data["Roundness"] = model.inference(data["Pseudoword"].to_list())
data

Unnamed: 0,Pseudoword,Roundness
0,irepeo,0.481854
1,bea,0.562286
2,kiko,0.371240
3,tsupihamumo,0.235215
4,koke,0.212681
...,...,...
9995,tademunoo,0.782394
9996,tsujidenubo,0.339519
9997,musa,0.517814
9998,sateihemu,0.492699


In [12]:
# Summary statistics of the predicted roundness scores.
data.describe()

Unnamed: 0,Roundness
count,10000.0
mean,0.500261
std,0.169169
min,0.172221
25%,0.372915
50%,0.494395
75%,0.613485
max,0.891836


## Saving CSV

In [13]:
# Write the pseudoword/roundness pairs back to the CSV they were loaded from,
# omitting the DataFrame index.
output_path = "datasets/japanese_pseudowords.csv"
data.to_csv(output_path, index=False)