In [None]:
# Parse the word list into `data`: one dict per line of the input file.
data = []

with open('/content/sorted_eng_data2.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()


def _field_value(field):
    """Return the second ':'-separated piece of one comma-separated field."""
    return field.strip().split(':')[1]


for line in lines:
    parts = line.strip().split(',')
    data.append({
        'num': int(_field_value(parts[0])),
        # [1:] drops the first character of the value
        'origin_lang': _field_value(parts[1])[1:],
        # [1:-1] drops the first and the last character of the value
        'pronun_list': _field_value(parts[2])[1:-1],
    })

# Show the first parsed record as a quick sanity check
print(data[0])


{'num': 1, 'origin_lang': '&', 'pronun_list': '앤드'}


In [None]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import ast

# Data preparation: parse every line of the word list into a dict.
data = []

with open('/content/sorted_eng_data2.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

for line in lines:
    parts = line.strip().split(',')
    num = int(parts[0].strip().split(':')[1])
    # [1:] drops the first character of the value
    origin_lang = parts[1].strip().split(':')[1][1:]
    # [1:-1] drops the first and the last character of the value
    pronun_list = parts[2].strip().split(':')[1][1:-1]

    data.append({'num': num, 'origin_lang': origin_lang, 'pronun_list': pronun_list})

# Character vocabularies: every distinct character of the source words and of
# the pronunciations, in sorted order.
source_vocab = sorted({char for entry in data for char in entry['origin_lang']})
target_vocab = sorted({char for entry in data for char in entry['pronun_list']})

source_char_to_index = {char: index for index, char in enumerate(source_vocab)}
target_char_to_index = {char: index for index, char in enumerate(target_vocab)}

# The special tokens take the two indices right after the real characters.
for char_to_index in (source_char_to_index, target_char_to_index):
    for token in ('<SOS>', '<EOS>'):
        char_to_index[token] = len(char_to_index)

# The sizes must count <SOS>/<EOS> as well: they size the embedding table and the
# output layer, and the special-token indices are fed to both. Measuring the
# vocabularies before the tokens were added made those indices out of range.
source_vocab_size = len(source_char_to_index)
target_vocab_size = len(target_char_to_index)

def predict(model, source_word, max_length=10):
    """Greedily decode the pronunciation of ``source_word``.

    Args:
        model: a ``Seq2Seq`` instance (defined below).
        source_word: string whose characters are all keys of ``source_char_to_index``.
        max_length: maximum number of output characters to generate.

    Returns:
        The predicted pronunciation as a string.

    Raises:
        KeyError: if ``source_word`` contains a character missing from the
            source vocabulary.
    """
    model.eval()
    device = next(model.parameters()).device

    unknown = sorted({char for char in source_word if char not in source_char_to_index})
    if unknown:
        raise KeyError(f"characters not in the source vocabulary: {unknown}")

    # Encode the word the same way the training loop does: <SOS> + characters + <EOS>.
    source_seq = ([source_char_to_index['<SOS>']]
                  + [source_char_to_index[char] for char in source_word]
                  + [source_char_to_index['<EOS>']])
    source_tensor = torch.tensor(source_seq, dtype=torch.long, device=device).view(-1, 1)

    eos_index = target_char_to_index['<EOS>']
    predicted_indices = []

    with torch.no_grad():  # inference only, no gradients needed
        _, decoder_hidden = model.encoder(model.embedding(source_tensor))
        # Decoder input has shape (seq_len=1, batch=1)
        decoder_input = torch.tensor([[target_char_to_index['<SOS>']]], device=device)

        for _ in range(max_length):
            decoder_output, decoder_hidden = model.decoder(
                model.target_embedding(decoder_input), decoder_hidden)
            topi = model.output_layer(decoder_output).argmax(dim=-1)  # shape (1, 1)
            if topi.item() == eos_index:
                break
            predicted_indices.append(topi.item())
            decoder_input = topi

    # Special tokens sit after the real characters in the index map and have no
    # entry in target_vocab, so they are skipped when building the string.
    return ''.join(target_vocab[index] for index in predicted_indices
                   if index < len(target_vocab))


# Model definition
class Seq2Seq(nn.Module):
    """GRU encoder-decoder that maps a source id sequence to target id logits.

    Source and target ids come from different vocabularies, so each side has its
    own embedding table: ``embedding`` (source) and ``target_embedding`` (target).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """
        Args:
            input_size: number of source ids (special tokens included).
            hidden_size: embedding and GRU hidden width.
            output_size: number of target ids (special tokens included).
        """
        super(Seq2Seq, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.target_embedding = nn.Embedding(output_size, hidden_size)
        self.encoder = nn.GRU(hidden_size, hidden_size)
        self.decoder = nn.GRU(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, target_seq, teacher_forcing_ratio=0.5):
        """Run the encoder, then decode one step per target position.

        Args:
            input_seq: LongTensor of shape (source_length, batch).
            target_seq: LongTensor of shape (target_length, batch); row 0 must be
                the start token, it is used as the first decoder input.
            teacher_forcing_ratio: probability that the whole sequence is decoded
                from the reference ids instead of the model's own predictions.

        Returns:
            Tensor of shape (target_length, batch, output_size). ``outputs[t]``
            holds the logits for ``target_seq[t]``; ``outputs[0]`` stays zero
            because the start token is never predicted.
        """
        target_length, batch_size = target_seq.size(0), target_seq.size(1)
        target_vocab_size = self.output_layer.out_features

        _, decoder_hidden = self.encoder(self.embedding(input_seq))

        outputs = torch.zeros(target_length, batch_size, target_vocab_size,
                              device=target_seq.device)

        # One draw per call, as before: the sequence is fully teacher-forced or not at all.
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        decoder_input = target_seq[0]  # shape (batch,)
        for t in range(1, target_length):
            embedded = self.target_embedding(decoder_input).unsqueeze(0)  # (1, batch, hidden)
            decoder_output, decoder_hidden = self.decoder(embedded, decoder_hidden)
            step_logits = self.output_layer(decoder_output.squeeze(0))  # (batch, vocab)
            outputs[t] = step_logits

            if use_teacher_forcing:
                decoder_input = target_seq[t]
            else:
                decoder_input = step_logits.argmax(dim=1).detach()

        return outputs

# Hyperparameters
hidden_size = 256
learning_rate = 0.001
num_epochs = 100

# Create the model and move it to the GPU when one is available
model = Seq2Seq(source_vocab_size, hidden_size, target_vocab_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training: one optimiser step per entry (each entry is a batch of one)
for epoch in range(num_epochs):
    total_loss = 0
    for entry in data:
        # Wrap both sequences in <SOS> ... <EOS> and shape them as (length, 1) columns
        source_seq = [source_char_to_index[token]
                      for token in ('<SOS>', *entry['origin_lang'], '<EOS>')]
        target_seq = [target_char_to_index[token]
                      for token in ('<SOS>', *entry['pronun_list'], '<EOS>')]
        source_tensor = torch.tensor(source_seq, dtype=torch.long, device=device).view(-1, 1)
        target_tensor = torch.tensor(target_seq, dtype=torch.long, device=device).view(-1, 1)

        optimizer.zero_grad()
        output = model(source_tensor, target_tensor)

        # Position 0 is dropped on both sides before the loss is computed
        step_logits = output[1:].view(-1, output.shape[-1])
        step_targets = target_tensor[1:].view(-1)
        loss = criterion(step_logits, step_targets)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(data)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

print("Training finished!")

# Try the trained model on a few words
test_words = ['anopheles', 'anorak', 'another']
for word in test_words:
    print(f'Input: {word}, Output: {predict(model, word)}')

IndexError: ignored

In [None]:
pip install jamo

Collecting jamo
  Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Installing collected packages: jamo
Successfully installed jamo-0.4.1


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from jamo import h2j, j2hcj

class PronunciationDataset(Dataset):
    """Rows of a CSV file with ``word`` and ``origin_lang`` columns, encoded as id lists.

    ``dataset[i]`` returns ``(jamo_ids, origin_ids)``: the ``word`` cell decomposed
    into jamo and mapped through ``jamo_to_idx``, and the ``origin_lang`` cell mapped
    character by character through ``origin_lang_to_idx``. Ids are assigned in order
    of first appearance in the file.
    """

    def __init__(self, csv_file):
        """Read ``csv_file`` and build both vocabularies from its contents."""
        # dtype=str keeps digit-only cells as text, and keep_default_na=False stops
        # pandas from parsing literal words such as "null", "NA" or "nan" as missing
        # values (which the fillna below would then silently turn into '').
        self.data = pd.read_csv(csv_file, dtype=str, keep_default_na=False).fillna('')
        self.jamo_to_idx = {}  # jamo character -> index
        self.origin_lang_to_idx = {}  # origin-language character -> index

        self._build_vocab()

    def _build_vocab(self):
        """Register every symbol of every row in the two vocabularies."""
        for word, origin_lang in zip(self.data['word'], self.data['origin_lang']):
            self._add_word_to_vocab(word, self.jamo_to_idx)
            self._add_origin_lang_to_vocab(origin_lang, self.origin_lang_to_idx)

    def _add_word_to_vocab(self, word, vocab):
        """Add the jamo of ``word`` to ``vocab``, giving new ones the next free index."""
        for char in j2hcj(h2j(word)):  # decompose syllables into jamo
            if char not in vocab:
                vocab[char] = len(vocab)

    def _add_origin_lang_to_vocab(self, origin_lang, vocab):
        """Add the characters of ``origin_lang`` to ``vocab``, giving new ones the next free index."""
        for char in origin_lang:
            if char not in vocab:
                vocab[char] = len(vocab)

    def __len__(self):
        """Number of rows in the CSV file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(jamo_ids, origin_ids)`` for row ``idx``."""
        row = self.data.iloc[idx]
        return (self._encode_word(row['word'], self.jamo_to_idx),
                self._encode_origin_lang(row['origin_lang'], self.origin_lang_to_idx))

    def _encode_word(self, word, vocab):
        """Map ``word`` to the list of ids of its jamo."""
        return [vocab[char] for char in j2hcj(h2j(word))]

    def _encode_origin_lang(self, origin_lang, vocab):
        """Map ``origin_lang`` to the list of ids of its characters."""
        return [vocab[char] for char in origin_lang]

# Load the data: build the dataset (and its vocabularies) from the CSV file and
# wrap it in a shuffled loader with batches of 64.
# NOTE(review): __getitem__ returns plain Python lists of ids; presumably their
# lengths differ between rows, in which case the default collate function cannot
# batch them - confirm before iterating `dataloader`.
dataset = PronunciationDataset('/content/our_sam_db.txt')
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split

# Hyperparameters
# Both vocabulary sizes are derived from the dataset instead of being hard-coded,
# and each gets one extra slot: the padding code further down uses `size - 1` as
# the padding id, so the last index must not belong to a real symbol.
input_size = len(dataset.origin_lang_to_idx) + 1  # origin-language characters + padding
output_size = len(dataset.jamo_to_idx) + 1  # jamo + padding
hidden_size = 256
num_layers = 2
learning_rate = 0.001
num_epochs = 3

class Seq2Seq(nn.Module):
    """LSTM encoder-decoder mapping a batch of source id sequences to target logits.

    The encoder's final hidden and cell states initialise the decoder, which then
    runs greedily: each step's most likely target id is embedded and fed back in.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        """
        Args:
            input_size: number of source ids.
            hidden_size: embedding and LSTM hidden width.
            output_size: number of target ids (classes predicted per step).
            num_layers: number of stacked LSTM layers in encoder and decoder.
        """
        super(Seq2Seq, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Source ids and fed-back target ids belong to different vocabularies,
        # so each has its own embedding table.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.target_embedding = nn.Embedding(output_size, hidden_size)

        self.encoder = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        # The decoder always consumes hidden_size-wide embeddings and emits
        # hidden_size-wide states, whatever the vocabulary size is.
        self.decoder = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, initial_hidden=None, initial_cell=None, target_length=None):
        """Encode ``x`` and greedily decode.

        Args:
            x: LongTensor of source ids, shape (batch, source_length).
            initial_hidden, initial_cell: optional decoder start states; used
                instead of the encoder's final states only when both are given.
            target_length: number of decoding steps; defaults to ``source_length``.

        Returns:
            Tensor of logits with shape (batch, steps, output_size).
        """
        batch_size, source_length = x.size(0), x.size(1)
        steps = source_length if target_length is None else target_length

        # Encoder
        _, (hidden, cell) = self.encoder(self.embedding(x))

        # Use the provided initial states if available
        if initial_hidden is not None and initial_cell is not None:
            hidden = initial_hidden
            cell = initial_cell

        # The first decoder input is an all-zero vector
        decoder_input = torch.zeros(batch_size, 1, self.hidden_size, device=x.device)

        outputs = []
        for _ in range(steps):
            output, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            output = self.fc(output)  # (batch, 1, output_size)
            outputs.append(output)

            # Index of the most likely class; squeeze only the top-k axis so a
            # batch of one keeps its batch dimension.
            _, topi = output.topk(1)
            decoder_input = self.target_embedding(topi.squeeze(-1).detach())  # (batch, 1, hidden)

        if not outputs:
            return torch.zeros(batch_size, 0, self.fc.out_features, device=x.device)
        return torch.cat(outputs, dim=1)

# Initialise the model
model = Seq2Seq(input_size, hidden_size, output_size, num_layers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Padding ids: the last index of each id range
source_pad = input_size - 1
target_pad = output_size - 1

# Loss and optimiser. Padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=target_pad)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Encode every row once. dataset[i] returns (jamo ids, origin-language ids);
# the model reads origin-language ids (source) and predicts jamo ids (target).
encoded_pairs = [dataset[i] for i in range(len(dataset))]
sources = [origin_ids for _, origin_ids in encoded_pairs]
targets = [jamo_ids for jamo_ids, _ in encoded_pairs]

# Longest source and target sequence
max_source_length = max(len(source) for source in sources)
max_target_length = max(len(target) for target in targets)
# The model is called with the source only and decodes one step per source
# position, so both sides are padded to one common length.
padded_length = max(max_source_length, max_target_length)

# Pad everything so the data fits into one tensor per side
padded_sources = [source + [source_pad] * (padded_length - len(source)) for source in sources]
padded_targets = [target + [target_pad] * (padded_length - len(target)) for target in targets]

# Convert to tensors
tensor_sources = torch.tensor(padded_sources, dtype=torch.long)
tensor_targets = torch.tensor(padded_targets, dtype=torch.long)
tensor_dataset = TensorDataset(tensor_sources, tensor_targets)

# Split sizes: 75% train, 15% validation, the remainder test
dataset_size = len(tensor_dataset)
train_size = int(0.75 * dataset_size)
validation_size = int(0.15 * dataset_size)
test_size = dataset_size - train_size - validation_size

train_dataset, validation_dataset, test_dataset = random_split(tensor_dataset,
                                                               [train_size,
                                                                validation_size,
                                                                test_size])

# One DataLoader per split
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=64)


def evaluate_accuracy(model, dataloader, pad_index, device):
    """Return the fraction of non-padding target positions predicted correctly.

    Returns 0.0 when the loader yields no non-padding positions.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_source, batch_target in dataloader:
            batch_source = batch_source.to(device)
            batch_target = batch_target.to(device)

            predicted = torch.argmax(model(batch_source), dim=2)
            real_positions = batch_target != pad_index
            correct += ((predicted == batch_target) & real_positions).sum().item()
            total += real_positions.sum().item()
    return correct / total if total else 0.0


# Training loop
for epoch in range(num_epochs):
    total_loss = 0

    model.train()  # Set the model to training mode

    for batch_source, batch_target in train_dataloader:
        batch_source = batch_source.to(device)
        batch_target = batch_target.to(device)

        optimizer.zero_grad()
        outputs = model(batch_source)
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), batch_target.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Monitor progress on the validation split; the test split is kept for the end
    accuracy = evaluate_accuracy(model, validation_dataloader, target_pad, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/max(len(train_dataloader), 1):.4f}, Accuracy: {accuracy:.4f}')

print('Training finished!')

# Final, one-off evaluation on the held-out test split
test_accuracy = evaluate_accuracy(model, test_dataloader, target_pad, device)
print(f'Test Accuracy: {test_accuracy:.4f}')

AttributeError: ignored