### Reading data 

In [35]:
def read_file(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one stripped string per line of the file (blank lines
        are kept as empty strings).
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def separate_idioms(sentences, idioms):
    """Split each sentence around the first known idiom it contains.

    Args:
        sentences: Iterable of sentence strings.
        idioms: Iterable of idiom strings; they are tried in order and the
            first one found in a sentence wins. Empty entries are ignored.

    Returns:
        A list with one tuple per sentence:
        ``(idiom, parts, full_components)`` where
          * ``idiom`` is the matched idiom, or ``None`` when nothing matched;
          * ``parts`` are the stripped, non-empty pieces of text around the
            idiom (``[sentence]`` when nothing matched);
          * ``full_components`` are the same pieces with the idiom inserted
            at the position(s) where it occurred in the sentence
            (``[sentence]`` when nothing matched).
    """
    results = []
    for sentence in sentences:
        for idiom in idioms:
            # An empty idiom (e.g. a blank line in the idiom file) is "in"
            # every string and would make str.split raise ValueError.
            if not idiom or idiom not in sentence:
                continue

            pieces = [piece.strip() for piece in sentence.split(idiom)]
            parts = [piece for piece in pieces if piece]

            # Rebuild the sentence in its original order: the idiom sits
            # between every pair of consecutive split pieces, including
            # before the first non-empty piece when the sentence starts
            # with the idiom.
            full_components = []
            for position, piece in enumerate(pieces):
                if position > 0:
                    full_components.append(idiom)
                if piece:
                    full_components.append(piece)

            results.append((idiom, parts, full_components))
            break
        else:
            # No idiom matched: keep the sentence untouched.
            results.append((None, [sentence], [sentence]))
    return results

def write_results(results, output_path):
    """Write the idiom-separation results to a UTF-8 text report.

    Args:
        results: Iterable of ``(idiom, parts, full_components)`` tuples.
        output_path: Path of the report file; it is overwritten.

    Entries with a truthy idiom produce an "Idiom / Parts / Full Components"
    record; all other entries produce a single "Original Sentence" line
    taken from ``parts[0]``. Every record is followed by a blank line.
    """
    with open(output_path, 'w', encoding='utf-8') as report:
        for idiom, parts, full_components in results:
            if not idiom:
                print(f"Original Sentence: {parts[0]}", end="\n\n", file=report)
                continue
            print(f"Idiom: {idiom}", file=report)
            print(f"Parts: {parts}", file=report)
            print(f"Full Components: {full_components}", end="\n\n", file=report)

# Load the idiom list and the sentence corpus (one entry per line each).
idioms, sentences = (
    read_file('data/idiom_data/total_idioms.txt'),
    read_file('data/idiom_data/total_idiom_sentences.txt'),
)

# Split every sentence around the idiom it contains, then dump the
# result to a text report for inspection.
results = separate_idioms(sentences=sentences, idioms=idioms)
write_results(results, output_path='output.txt')


### Init T5 Model

In [8]:
import torch

from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# NOTE: the detected device is used as-is. Forcing device = "cuda" here
# would defeat the fallback above and fail on machines without a GPU.

# Load the pretrained English -> Vietnamese T5 checkpoint and its tokenizer.
en_to_vi_model = T5ForConditionalGeneration.from_pretrained("NlpHUST/t5-en-vi-base")
en_to_vi_tokenizer = T5Tokenizer.from_pretrained("NlpHUST/t5-en-vi-base")
en_to_vi_model.to(device)

# Smoke test: translate one sentence end to end.
src = "I bring umbrella if by chance it rains"
tokenized_text = en_to_vi_tokenizer.encode(src, return_tensors="pt").to(device)
print(tokenized_text)
en_to_vi_model.eval()  # inference mode (disables dropout)
summary_ids = en_to_vi_model.generate(
                    tokenized_text,
                    max_length=128,
                    num_beams=5,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True
                )
output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(output)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3050 Laptop GPU


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


tensor([[   336,   8448,    259,    273, 134434,    955,    455,  12862,    609,
            259, 190361,      1]], device='cuda:0')
Tôi mang dù nếu có cơ hội trời mưa


### Init and Load Idiom Model

In [1]:
import torch

# Read both files explicitly as UTF-8 (matching read_file) so the result does
# not depend on the platform's default encoding.
# The raw idiom text is kept in `idioms_text` rather than `idioms`, so the
# idiom *list* named `idioms` built in the "Reading data" cell is not
# silently replaced by one big string.
with open('data/idiom_data/total_idioms.txt', encoding='utf-8') as f:
    idioms_text = f.read()
    idiomatic_sentences = idioms_text.split("\n")

with open('data/idiom_data/total_translated_idioms.txt', encoding='utf-8') as f:
    translated = f.read()
    plain_sentences = translated.split("\n")

print(len(idiomatic_sentences))
print(len(plain_sentences))

# A file ending in a newline yields one trailing empty entry after split.
# Drop it only when it is really empty, so a file without a final newline
# does not lose its last real line.
if idiomatic_sentences and idiomatic_sentences[-1] == "":
    idiomatic_sentences = idiomatic_sentences[:-1]
if plain_sentences and plain_sentences[-1] == "":
    plain_sentences = plain_sentences[:-1]

# NOTE(review): the two lists are presumably line-aligned pairs - confirm
# that their lengths are expected to match.
print(len(idiomatic_sentences))
print(len(plain_sentences))


from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Helper function to tokenize and build vocabulary
def yield_tokens(data_iter, tokenizer):
    """Lazily yield ``tokenizer(text)`` for every text in ``data_iter``.

    Args:
        data_iter: Iterable of texts.
        tokenizer: Callable applied to each text.

    Yields:
        The tokenizer's result for each text, in input order.
    """
    yield from map(tokenizer, data_iter)

# spaCy-backed English tokenizer shared by vocabulary building and the model.
idiom_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# One vocabulary over both corpora; the special tokens are passed first.
idiom_vocab = build_vocab_from_iterator(
    yield_tokens(idiomatic_sentences + plain_sentences, idiom_tokenizer),
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
)

# Tokens missing from the vocabulary resolve to the index of "<unk>".
idiom_vocab.set_default_index(idiom_vocab["<unk>"])


from models.idiom_model import Seq2Seq, Encoder, Decoder, Attention

# Hyper-parameters for the idiom Seq2Seq model.
# Source and target share one vocabulary, so both dims are its size.
INPUT_DIM = OUTPUT_DIM = len(idiom_vocab)
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = DEC_DROPOUT = 0.5
CLIP = 1  # not used in this cell; presumably a gradient-clipping value - TODO confirm
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the attention module first: the decoder takes it as an argument.
attn = Attention(HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, attn)



101
101
100
100




input dim 398 outputdim 398


In [2]:
# Restore the trained idiom model from its checkpoint.
save_file_path = ('models_checkpoint/idiom_model.pth')  #save with fp32
idiom_model = Seq2Seq(enc,dec,device,idiom_vocab, idiom_tokenizer)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
idiom_model.load_state_dict(torch.load(save_file_path, map_location=device))
idiom_model.to(device)
# The model is only used for inference below: switch to eval mode so the
# dropout layers (p=0.5) are disabled. eval() returns the module, so the
# cell still displays the model summary.
idiom_model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(398, 256)
    (rnn): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1024, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(398, 256)
    (rnn): GRU(768, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=1280, out_features=398, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (vocab): Vocab()
)

### Feed data for model testing

In [46]:
SAMPLE_INDEX = 67        # which entry of `results` to push through the pipeline
UNK_PREFIX = "<unk> "    # marker seen at the start of idiom_model.sample() output


def translate_en_to_vi(text):
    """Translate one English string to Vietnamese with the T5 model.

    Args:
        text: English text to translate.

    Returns:
        The decoded translation of the top beam, without special tokens.
    """
    en_to_vi_model.eval()
    tokenized_text = en_to_vi_tokenizer.encode(text, return_tensors="pt").to(device)
    summary_ids = en_to_vi_model.generate(
        tokenized_text,
        max_length=128,
        num_beams=5,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    return en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Each entry of `results` is a 3-tuple (idiom, parts, full_components);
# idiom is None when the sentence contains no known idiom.
given_parts = results[SAMPLE_INDEX]
print(len(given_parts))
idiom_components, _non_idiom_parts, sentence_components = given_parts

if not idiom_components:
    # No idiom to paraphrase: translate the original sentence directly.
    output = translate_en_to_vi(sentence_components[0])
    print("this is output", output)
else:
    plain_en_sentence_components = []
    for part in sentence_components:
        # Exact comparison: a substring test would also treat ordinary
        # fragments that merely occur inside the idiom as the idiom itself.
        if part == idiom_components:
            generated_idiom = idiom_model.sample(part)
            print("idiom part", part)
            print("generated_idiom", generated_idiom)
            # Strip the marker only when it is really a leading prefix.
            if generated_idiom.startswith(UNK_PREFIX):
                generated_idiom = generated_idiom[len(UNK_PREFIX):]
            plain_en_sentence_components.append(generated_idiom)
        else:
            print("normal part", part)
            plain_en_sentence_components.append(part)

    # Plain-English sentence: idiom replaced by its generated paraphrase.
    concatenated_plain_en_sen = " ".join(plain_en_sentence_components)
    print("THIS IS PLAIN_EN_SEN_COM", plain_en_sentence_components)
    print("THIS IS CON_PLAIN_EN_SEN", concatenated_plain_en_sen)

    output = translate_en_to_vi(concatenated_plain_en_sen)
    print("This is the translated sentences: ", output)
    

3
normal part after work, he’s
idiom part dead to the world
generated_idiom <unk> sound asleep
normal part .
THIS IS PLAIN_EN_SEN_COM ['after work, he’s', 'sound asleep', '.']
THIS IS CON_PLAIN_EN_SEN after work, he’s sound asleep .
This is the translated sentences:  sau khi làm việc, anh ta có vẻ buồn ngủ.
