In [None]:
import pandas as pd

# Load the tab-separated dictionary file. keep_default_na=False is passed so that
# pandas does not turn its default NA markers into NaN while parsing.
df = pd.read_csv("data/ipa-dict-master/data/en_US.txt", sep="\t", header=None, keep_default_na=False)
# The file is read with header=None, so the two columns are named explicitly.
df.columns = ["word_original", "ipa_original"]
display(df)

In [None]:
import re

# Add a lower-cased copy of every entry of "word_original" as a new "word" column.
df["word"] = df["word_original"].apply(str.lower)

def normalize_ipa(ipa_original):
    """Return the transcription enclosed by the last pair of slashes.

    If the string ends with a ``/.../`` group, the text inside that final group
    is returned without the surrounding slashes. If no such trailing group
    exists, the input is returned unchanged.
    """
    trailing = re.search(r"/[^/]+/$", ipa_original)
    if trailing is None:
        # Nothing slash-delimited at the end: leave the value as it is.
        return ipa_original
    # Drop the leading and trailing slash of the matched group.
    return trailing.group(0)[1:-1]

# Add an "ipa" column holding normalize_ipa() applied to every "ipa_original" entry.
df["ipa"] = df["ipa_original"].apply(normalize_ipa)

display(df)

In [None]:
from enum import Enum
from dataclasses import dataclass

class Direction(Enum):
    """Translation direction between written words and their IPA transcription."""
    # Source is the written word, target is the IPA transcription.
    TO_IPA = 1
    # Source is the IPA transcription, target is the written word.
    FROM_IPA = 2
    
class Sublanguage(Enum):
    """The two writing systems handled per locale; member names double as DataFrame column names."""
    # Ordinary (lower-cased) spelling of a word.
    WORD = 1
    # Normalized IPA transcription of a word.
    IPA = 2

@dataclass(frozen=True)
class Alphabet:
    """Immutable (locale, sublanguage) pair identifying one side of a translation.

    Because the dataclass is frozen, instances are hashable and can be used as
    dictionary keys.
    """
    # Locale identifier such as "en_US".
    locale: str
    # Which writing system of that locale this alphabet refers to.
    sublanguage: Sublanguage

    def __repr__(self):
        """Render as ``<locale>:<SUBLANGUAGE NAME>``."""
        return f"{self.locale}:{self.sublanguage.name}"

# Configuration: the locale to work with and the translation direction.
LOCALE = "en_US"
DIRECTION = Direction.TO_IPA

# TO_IPA reads words and produces IPA; any other direction swaps source and target.
SRC_SUBLANGUAGE = Sublanguage.WORD if DIRECTION == Direction.TO_IPA else Sublanguage.IPA
TGT_SUBLANGUAGE = Sublanguage.IPA if DIRECTION == Direction.TO_IPA else Sublanguage.WORD

# Alphabets identifying the source and the target side for the chosen locale.
SRC_LANGUAGE = Alphabet(LOCALE, SRC_SUBLANGUAGE)
TGT_LANGUAGE = Alphabet(LOCALE, TGT_SUBLANGUAGE)

print(f"The model will translate {SRC_LANGUAGE} to {TGT_LANGUAGE}")

In [None]:
def create_dataframe(filename):
    """Read a tab-separated word/IPA file into a DataFrame with derived columns.

    The result has four columns: the raw ``word_original`` and ``ipa_original``
    values, plus one column named after each ``Sublanguage`` member holding the
    lower-cased word and the normalized IPA string respectively.
    """
    raw = pd.read_csv(filename, sep="\t", header=None, keep_default_na=False)
    raw.columns = ["word_original", "ipa_original"]
    # Keyword order fixes the order of the derived columns: WORD first, then IPA.
    derived = {
        Sublanguage.WORD.name: raw["word_original"].map(str.lower),
        Sublanguage.IPA.name: raw["ipa_original"].map(normalize_ipa),
    }
    return raw.assign(**derived)

# Build the en_US frame with the helper above and show it.
df_en = create_dataframe("data/ipa-dict-master/data/en_US.txt")
display(df_en)

In [None]:
from torch.utils.data import Dataset

class IpaDataset(Dataset):
    """Dataset of [source, target] string pairs backed by a word/IPA DataFrame.

    The frame must carry one column per ``Sublanguage`` member name
    (``Sublanguage.WORD.name`` and ``Sublanguage.IPA.name``). ``direction``
    decides which of the two columns is served as the source and which as the
    target.
    """

    def __init__(self, df: pd.DataFrame, locale: str, direction: Direction, label: str | None = None):
        """Store the backing frame and its metadata.

        Args:
            df: Frame holding the WORD and IPA columns.
            locale: Locale identifier reported by ``locale()`` and ``repr``.
            direction: Which column is the source and which is the target.
            label: Optional tag shown in ``repr`` (e.g. the split name).
        """
        self._df = df
        self._direction = direction
        self._locale = locale
        self._label = label

    def direction(self) -> Direction:
        """Return the translation direction of this dataset."""
        return self._direction

    def locale(self) -> str:
        """Return the locale identifier of this dataset."""
        return self._locale

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self._df)

    def _columns(self) -> tuple[str, str]:
        """Return the (source, target) column names for the configured direction."""
        if self._direction == Direction.TO_IPA:
            return Sublanguage.WORD.name, Sublanguage.IPA.name
        return Sublanguage.IPA.name, Sublanguage.WORD.name

    def __getitem__(self, idx) -> list[str] | list[list[str]]:
        """Return the pair(s) at positional index ``idx``.

        Args:
            idx: A positional index, or a slice of positional indices.

        Returns:
            ``[source, target]`` for a single index. For a slice, a list with
            one ``[source, target]`` pair per selected row.
        """
        src_col, tgt_col = self._columns()
        if isinstance(idx, slice):
            # iloc with a slice yields a DataFrame; emit one pair per row rather
            # than handing back the two column Series.
            rows = self._df.iloc[idx]
            return [[src, tgt] for src, tgt in zip(rows[src_col], rows[tgt_col])]
        row = self._df.iloc[idx]
        return [row[src_col], row[tgt_col]]

    def __repr__(self):
        """Render size, locale and direction, prefixed by the optional label."""
        opt_label = f"[{self._label}]" if self._label else ""
        return f"IpaDataset{opt_label}(size={len(self)}, locale={self.locale()}, dir={self.direction().name})"

# Smoke test of the dataset: repr, length, single-index lookup and slice lookup.
ipa_en = IpaDataset(df_en, "en_US", Direction.TO_IPA, "b")
print(ipa_en)
print(len(ipa_en))
print(ipa_en[400])
print(ipa_en[400:402])

In [None]:
class DataSplit(Enum):
    """Names of the partitions a dictionary is split into."""
    # Rows sampled for training.
    TRAIN = 1
    # Remaining rows, held out for validation.
    VALIDATION = 2

def tokenize(word: str) -> list[str]:
    """Split ``word`` into a list with one token per character."""
    return [symbol for symbol in word]

class IpaDatasetHolder:
    """Holds the full dictionary of one locale plus its train/validation datasets.

    Datasets are looked up with ``holder[split, direction]``.
    """

    def __init__(self,
                 locale: str,
                 everything: pd.DataFrame,
                 splits: dict[DataSplit, dict[Direction, IpaDataset]],
                ):
        """Store the locale, the unsplit frame and the per-split datasets.

        Args:
            locale: Locale identifier, e.g. ``"en_US"``.
            everything: The complete (unsplit) frame.
            splits: Datasets keyed first by split, then by direction.
        """
        self._locale = locale
        self._everything = everything
        self._splits = splits

    def __getitem__(self, idx: tuple[DataSplit, Direction]) -> IpaDataset:
        """Return the dataset for a ``(split, direction)`` pair."""
        split, direction = idx
        return self._splits[split][direction]

    @classmethod
    def create_holder_from_csv(cls, locale: str, train_frac: float = 0.7, random_state: int | None = None):
        """Load ``data/ipa-dict-master/data/<locale>.txt`` and split it at random.

        Args:
            locale: Locale identifier; also the stem of the file that is read.
            train_frac: Fraction of rows sampled into the training split.
            random_state: Seed forwarded to ``DataFrame.sample``. Pass an int
                for a reproducible split; ``None`` keeps the split random.

        Returns:
            A holder with one dataset per split and direction.
        """
        csv_path = f"data/ipa-dict-master/data/{locale}.txt"
        df = create_dataframe(csv_path)

        df_train = df.sample(frac=train_frac, random_state=random_state)
        # Everything that was not sampled for training becomes validation data.
        df_validation = df.drop(df_train.index)
        split_dfs = {DataSplit.TRAIN: df_train, DataSplit.VALIDATION: df_validation}

        splits = {}
        for split, split_df in split_dfs.items():
            splits[split] = {d: IpaDataset(split_df, locale, d, split.name) for d in Direction}

        return cls(locale=locale, everything=df, splits=splits)

    def build_token_lists(self) -> dict[Sublanguage, list[str]]:
        """Collect the sorted set of tokens of each sublanguage over the full frame.

        Returns:
            A dict with keys ``Sublanguage.WORD`` and ``Sublanguage.IPA`` (in
            that order), each mapped to its sorted list of distinct tokens.
        """
        token_lists = {}
        for sublanguage in (Sublanguage.WORD, Sublanguage.IPA):
            seen = set()
            # Iterate the column directly instead of building a Series per row.
            for text in self._everything[sublanguage.name]:
                seen.update(tokenize(text))
            token_lists[sublanguage] = sorted(seen)
        return token_lists
                              

# Build the en_US holder, then print its TO_IPA train/validation datasets
# and the token lists collected from the full frame.
en_holder = IpaDatasetHolder.create_holder_from_csv("en_US")
print(en_holder[DataSplit.TRAIN, Direction.TO_IPA])
print(en_holder[DataSplit.VALIDATION, Direction.TO_IPA])
token_lists_temp = en_holder.build_token_lists()
print(token_lists_temp)

In [None]:
from torchtext.vocab import build_vocab_from_iterator, Vocab

# Indices of the special tokens: unknown, padding, beginning of string, end of string.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# The special tokens themselves, listed in the same order as the indices above.
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]

def build_vocabs(locale: str) -> dict[Alphabet, Vocab]:
    """Build one vocabulary per sublanguage of ``locale``.

    The dictionary file of the locale is loaded, its distinct tokens are
    collected per sublanguage, and a vocab is built from each token list with
    the special symbols placed first. Unknown tokens map to ``UNK_IDX``.

    Args:
        locale: Locale identifier, e.g. ``"en_US"``.

    Returns:
        A dict mapping ``Alphabet(locale, sublanguage)`` to its ``Vocab``.
    """
    holder = IpaDatasetHolder.create_holder_from_csv(locale)
    vocabs = {}
    for sublanguage, token_list in holder.build_token_lists().items():
        # build_vocab_from_iterator expects an iterator that yields lists of
        # tokens, so the token list is wrapped as a single "sentence". Passing
        # the flat list would make every token be iterated character by character.
        vocab = build_vocab_from_iterator(
            [token_list], min_freq=1, specials=special_symbols, special_first=True)
        vocab.set_default_index(UNK_IDX)
        vocabs[Alphabet(locale, sublanguage)] = vocab
    return vocabs

# Vocabularies for the en_US locale, keyed by Alphabet.
vocabs_english = build_vocabs("en_US")

In [None]:
# copied from https://pytorch.org/tutorials/beginner/translation_transformer.html#seq2seq-network-using-transformer

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal positional encoding to token embeddings, then applies dropout.

    The table is precomputed for ``maxlen`` positions and registered as a
    buffer named ``pos_embedding`` of shape ``(maxlen, 1, emb_size)``.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        """Precompute the sinusoidal table.

        Args:
            emb_size: Size of the embedding dimension (even or odd).
            dropout: Dropout probability applied after adding the encoding.
            maxlen: Number of positions the table covers.
        """
        super().__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one fewer odd slot than even slot, so the
        # cosine table is trimmed to fit; for an even emb_size this is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles)[:, :emb_size // 2]
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Add the encoding for the first ``token_embedding.size(0)`` positions and apply dropout.

        The table is sliced along dimension 0 of the input, so dimension 0 is
        treated as the position axis.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        """Create an embedding table with ``vocab_size`` rows of width ``emb_size``."""
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to int64) and multiply the vectors by ``sqrt(emb_size)``."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer producing scores over the target vocabulary.

    Source and target token indices are embedded, positionally encoded, run
    through ``torch.nn.Transformer`` and projected by a linear ``generator``
    layer of width ``tgt_vocab_size``.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        """Create the transformer, the output projection, both embeddings and the positional encoding.

        Args:
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            emb_size: Model/embedding width (``d_model``).
            nhead: Number of attention heads.
            src_vocab_size: Size of the source vocabulary.
            tgt_vocab_size: Size of the target vocabulary.
            dim_feedforward: Width of the feed-forward sub-layers.
            dropout: Dropout probability for the transformer and the positional encoding.
        """
        super().__init__()
        transformer_config = dict(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        # Sub-modules are created in this fixed order so that seeded parameter
        # initialisation stays the same.
        self.transformer = Transformer(**transformer_config)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def _embed_src(self, src: Tensor):
        """Embed source indices and add the positional encoding."""
        return self.positional_encoding(self.src_tok_emb(src))

    def _embed_tgt(self, tgt: Tensor):
        """Embed target indices and add the positional encoding."""
        return self.positional_encoding(self.tgt_tok_emb(tgt))

    def forward(self,
                src: Tensor,
                tgt: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and project to target-vocabulary scores.

        No memory mask is used; the remaining masks are forwarded to the transformer.
        """
        # Source is embedded before target, matching the order of the dropout calls.
        src_emb = self._embed_src(src)
        tgt_emb = self._embed_tgt(tgt)
        hidden = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder on the embedded source and return its output."""
        return self.transformer.encoder(self._embed_src(src), mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder on the embedded target against ``memory`` (no generator projection)."""
        return self.transformer.decoder(self._embed_tgt(tgt), memory, tgt_mask=tgt_mask)