### Reading data 

In [2]:
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]

def separate_idioms(sentences, idioms):
    results = []
    for sentence in sentences:
        found = False
        for idiom in idioms:
            if idiom in sentence:
                parts = sentence.split(idiom)
                parts = [part.strip() for part in parts if part.strip()]
                pos_id = sentence.find(idiom)
                end_id = pos_id + len(idiom)
                sen_parts = []
                sen_parts.append(sentence[:pos_id])
                sen_parts.append(sentence[pos_id:end_id])
                sen_parts.append(sentence[end_id:])
                
            
                full_components = [sp for sp in sen_parts if sp !=""]
                results.append((idiom, parts, full_components))
                found = True
                break
        if not found:
            results.append((None, [sentence], [sentence]))
    return results

def write_results(results, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        for idiom, parts, full_components in results:
            if idiom:
                file.write(f"Idiom: {idiom}\n")
                file.write(f"Parts: {parts}\n")
                file.write(f"Full Components: {full_components}\n\n")
            else:
                file.write(f"Original Sentence: {parts[0]}\n\n")

# Read idioms and sentences from their respective files
idioms = read_file('data/idiom_data/total_idioms.txt')
sentences = read_file('data/idiom_data/total_idiom_sentences.txt')

# Separate idioms from sentences
results = separate_idioms(sentences, idioms)

# Write the results to an output file
write_results(results, 'output.txt')


### Init T5 Model

In [3]:
import torch

from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
if torch.cuda.is_available():       
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


device = "cuda"


en_to_vi_model = T5ForConditionalGeneration.from_pretrained("NlpHUST/t5-en-vi-base")
en_to_vi_tokenizer = T5Tokenizer.from_pretrained("NlpHUST/t5-en-vi-base")
en_to_vi_model.to(device)

src = "I bring umbrella if by chance it rains"
tokenized_text = en_to_vi_tokenizer.encode(src, return_tensors="pt").to(device)
print(tokenized_text)
en_to_vi_model.eval()
summary_ids = en_to_vi_model.generate(
                    tokenized_text,
                    max_length=128, 
                    num_beams=5,
                    repetition_penalty=2.5, 
                    length_penalty=1.0, 
                    early_stopping=True
                )
output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(output)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3050 Laptop GPU


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


tensor([[   336,   8448,    259,    273, 134434,    955,    455,  12862,    609,
            259, 190361,      1]], device='cuda:0')
Tôi mang dù nếu có cơ hội trời mưa


### Init and Load Idiom Dictionary


In [4]:
from models.idiom_dictionary import IdiomTransDict


idioms_file = 'data/idiom_data/total_idioms.txt'
translations_file = 'data/idiom_data/total_translated_idioms.txt'

idiom_dict = IdiomTransDict(idioms_file, translations_file)
print(idiom_dict.dictionary)

{'just in case': 'as a precaution', 'a sorry sight': 'something pitiful or disappointing to see', 'rule of thumb': 'a general guideline', 'carpe diem': 'seize the day', 'salad days': 'youthful times', 'off the record': 'unofficially', 'thank goodness': 'luckily', 'big bucks': 'a lot of money', 'dog days': 'the hottest days of summer', 'wet behind the ears': 'inexperienced', 'just deserts': 'deserved outcome', 'an arm and a leg': 'very expensive', 'never mind': 'forget it', 'bricks and mortar': 'physical buildings', 'close call': 'narrow escape', 'a sight for sore eyes': 'a welcome sight', 'open warfare': 'obvious conflict', 'pin money': 'small amount of money', 'third time lucky': 'success on the third attempt', 'race against time': 'rushing to meet a deadline', 'rain or shine': 'no matter the weather', 'hold on a second': 'wait a moment', 'next to nothing': 'almost free', 'cheek by jowl': 'very close together', 'black and blue': 'bruised', 'dead wood': 'useless people or things', 'str

### Feed data for model testing

In [5]:
i =0 
given_parts = results[i]
print(len(given_parts))
if len(given_parts) < 2:
    given_parts = given_parts[0]
    en_to_vi_model.eval()
    tokenized_text = en_to_vi_tokenizer.encode(given_parts, return_tensors="pt").to(device)
    summary_ids = en_to_vi_model.generate(
                    tokenized_text,
                    max_length=128, 
                    num_beams=5,
                    repetition_penalty=2.5, 
                    length_penalty=1.0, 
                    early_stopping=True
                )
    output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("this is output", output)
else:
    total_sen = []
    idiom_components = given_parts[0]
    non_idiom_componets = given_parts[1]
    sentence_components = given_parts[2]
    plain_en_sentence_components = []
    concatenated_plain_en_sen = ""
    for part in sentence_components:
        if part in idiom_components:
            generated_idiom = idiom_dict.get_translation(part)
            print("Idiom part:", part)
            print("Generated idiom:", generated_idiom)
            if "<unk> " in generated_idiom:
                generated_idiom = generated_idiom[6:]
            plain_en_sentence_components.append(generated_idiom)
            concatenated_plain_en_sen += generated_idiom + " "
        else:
            plain_en_sentence_components.append(part)
            concatenated_plain_en_sen += part + " "
    concatenated_plain_en_sen = concatenated_plain_en_sen[0:-1]
    print("THIS IS PLAIN_EN_SEN_COM:", plain_en_sentence_components)
    print("THIS IS CON_PLAIN_EN_SEN:", concatenated_plain_en_sen)
    tokenized_text = en_to_vi_tokenizer.encode(concatenated_plain_en_sen, return_tensors="pt").to(device)
    summary_ids = en_to_vi_model.generate(
            tokenized_text,
            max_length=128, 
            num_beams=5,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True
        )
    output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print("This is the translated sentences: ", output)
    

3
Idiom part: just in case
Generated idiom: as a precaution
THIS IS PLAIN_EN_SEN_COM: ['bring an umbrella ', 'as a precaution', ' it rains.']
THIS IS CON_PLAIN_EN_SEN: bring an umbrella  as a precaution  it rains.


This is the translated sentences:  mang theo một cây dù để tránh mưa.


In [6]:
given_parts = sentences[i]
print(len(given_parts))
en_to_vi_model.eval()
tokenized_text = en_to_vi_tokenizer.encode(given_parts, return_tensors="pt").to(device)
summary_ids = en_to_vi_model.generate(
                tokenized_text,
                max_length=128, 
                num_beams=5,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
            )
output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("this is output", output)


print("This is the translated sentences: ", output)
    

40
this is output mang theo một cây dù chỉ trong trường hợp trời mưa.
This is the translated sentences:  mang theo một cây dù chỉ trong trường hợp trời mưa.


### WITH MODEL QUANTIZATION

In [7]:
import torch.quantization

device = "cpu"
en_to_vi_model.to(device)
quantized_model = torch.quantization.quantize_dynamic(en_to_vi_model, {torch.nn.Linear}, dtype=torch.qint8).to(device)

In [9]:
i =0 
given_parts = results[i]
print(len(given_parts))
if len(given_parts) < 2:
    given_parts = given_parts[0]
    quantized_model.eval()
    tokenized_text = en_to_vi_tokenizer.encode(given_parts, return_tensors="pt").to(device)
    summary_ids = quantized_model.generate(
                    tokenized_text,
                    max_length=128, 
                    num_beams=5,
                    repetition_penalty=2.5, 
                    length_penalty=1.0, 
                    early_stopping=True
                )
    output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("this is output", output)
else:
    total_sen = []
    idiom_components = given_parts[0]
    non_idiom_componets = given_parts[1]
    sentence_components = given_parts[2]
    concatenated_plain_en_sen = ""
    for part in sentence_components:
        if part in idiom_components:
            generated_idiom = idiom_dict.get_translation(part)
            print("Idiom part:", part)
            print("Generated idiom:", generated_idiom)
            if "<unk> " in generated_idiom:
                generated_idiom = generated_idiom[6:]
            concatenated_plain_en_sen += generated_idiom + " "
        else:
            concatenated_plain_en_sen += part + " "
    concatenated_plain_en_sen = concatenated_plain_en_sen[0:-1]
    print("THIS IS CON_PLAIN_EN_SEN:", concatenated_plain_en_sen)
    tokenized_text = en_to_vi_tokenizer.encode(concatenated_plain_en_sen, return_tensors="pt").to(device)
    summary_ids = quantized_model.generate(
            tokenized_text,
            max_length=128, 
            num_beams=5,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True
        )
    output = en_to_vi_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print("This is the translated sentences: ", output)
    

3
Idiom part: just in case
Generated idiom: as a precaution
THIS IS CON_PLAIN_EN_SEN: bring an umbrella  as a precaution  it rains.
This is the translated sentences:  mang một chiếc ô để phòng ngừa trời mưa.
