This notebook provides simple code to clean a translation memory (TMX) file that contains bad encoding, and to generate train, test and tune datasets from its sentence pairs.

In [1]:
from requests import get, post, delete
import json
from translate.storage.tmx import tmxfile



# Clean bad XML encoding from the translation memory file

In [None]:
# Here we replace bad XML encoding used for example by previous version of Trados.
# The input file is streamed line by line and a cleaned copy is written to `outputfile`.

inputfile = 'ENG-SPA_Reference.tmx'
outputfile = 'ENG-SPA_Reference-edited.tmx'

from re import escape, search, sub

#  Bad encoding to remove
substring = "#x1E;"

# Match the bare token as well as the full character reference ("&#x1E;"). Removing only
# "#x1E;" would leave a dangling '&' behind, which the ampersand step below would then
# turn into a spurious " and " inside segment text.
bad_encoding_pattern = "&?" + escape(substring)

# Segment tags, all-lowercase or all-uppercase only (the same four spellings as before)
segment_tag_pattern = r"</?(?:seg|SEG)>"

# Open the output in write mode (not append) so that re-running this cell regenerates the
# cleaned file instead of appending a second copy of the content to it.
with open(outputfile, "wb") as output_file, open(inputfile, 'r', encoding='utf-8') as input_file:
    for line in input_file:
        # Strip every occurrence of the bad encoding, not only the first one on the line
        new_line = sub(bad_encoding_pattern, "", line)

        # Any remaining ampersand on a line carrying a segment tag is spelled out as " and "
        if '&' in new_line and search(segment_tag_pattern, new_line):
            new_line = new_line.replace('&', ' and ')

        output_file.write(new_line.encode('utf-8'))


## Here we load a cleaned translation memory file

In [None]:
# Open the cleaned TMX file in binary mode and hand it to translate-toolkit's tmxfile.
# NOTE(review): this reads the ENG-FRE file while the other cells work on the ENG-SPA
# file — confirm the file name and language codes are the intended ones.
with open("ENG-FRE_Reference-edited.tmx", 'rb') as tmx_handle:
    tmx_file = tmxfile(tmx_handle, 'en-GB', 'fr-FR')

## Now we will chunk the files and create the train/test/tune datasets for Trados

In [None]:
# This approach simply samples the first n suitable records for the test and train sets. All datasets are 
# exclusive in that records only exist in a single dataset i.e. train/test/tune
# Ensure train/test/tune directories exist where we will write to


import tqdm as tqdm

# ---- Size limits --------------------------------------------------------------------
# Upper limit on units, set because of the file size limit
max_unit_count = 99500
# The test and tune sets can each hold only 2500 sentence pairs, so stay just below that
max_test_count = 2450
max_tune_count = 2450
# Ideal token-count window for tune sentences: at least min, at most max
min_tune_token_count = 7
max_tune_token_count = 10

# ---- Running counters ---------------------------------------------------------------
train_unit_count = test_unit_count = tune_unit_count = 0
last_unit_processed = 0
current_batch_num = 0

# ---- File names ---------------------------------------------------------------------
file_name_suffix = '_.tmx'
master_tmx = 'ENG-SPA_Reference-edited.tmx'

# ---- TMX header / footer ------------------------------------------------------------
# The header is re-emitted at the top of every generated file to retain the metadata of
# the original translation memory. Change the values below for the file being processed.
# The two variants differ only in their creation date and translation memory name.
_HEADER_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
<tmx version="1.4">
  <header creationtool="SDL Language Platform" creationtoolversion="8.0" o-tmf="SDL TM8 Format" datatype="xml" segtype="sentence" adminlang="en-US" srclang="en-US" creationdate="{creation_date}" creationid="nashm">
    <prop type="x-Job Num.:MultipleString"></prop>
    <prop type="x-Symbol:MultipleString"></prop>
    <prop type="x-Title:MultipleString"></prop>
    <prop type="x-Recognizers">RecognizeDates, RecognizeTimes, RecognizeNumbers, RecognizeMeasurements</prop>
    <prop type="x-TMName">{tm_name}</prop>
  </header>
  <body>"""

eng_spa = _HEADER_TEMPLATE.format(creation_date="20150115T221603Z", tm_name="ENG-SPA")
eng_spa_ref = _HEADER_TEMPLATE.format(creation_date="20150115T221744Z", tm_name="ENG-SPA-Reference")

# Header actually written to the generated files
header = eng_spa_ref

# The footer simply closes the <body> and <tmx> elements opened by the header
footer = "  </body>\n</tmx>"

# ---- Optional outputs ---------------------------------------------------------------
# Set to True to also generate a tuning file
generate_tuning_file = False
# Set to True to also generate a test file
generate_test_file = False
# Split the master translation memory into train batches and, optionally, test and tune
# files. Every unit is written to exactly one output file.

# Output handles; each file is opened only when it is actually going to be written
f_train = None
f_test = None
f_tune = None
train_file_name = None  # path of the train batch currently being written
batch_count = 0         # number of train batch files started so far

with open(master_tmx, 'rb') as en_es:
    tmx_file = tmxfile(en_es, 'en-GB', 'en-ES')  # The translation library does not filter on these values
                                                 # so no need to change

print(f"Loaded {master_tmx}")

try:
    # All outputs are opened with "wb" rather than "ab": appending to a file left over
    # from a previous run would put a second header/body into it and break the XML.
    if generate_test_file:
        f_test = open('test/' + master_tmx[:-4] + '_test.tmx', "wb")
        f_test.write(header.encode('utf-8'))

    if generate_tuning_file:
        f_tune = open('tune/' + master_tmx[:-4] + '_tune.tmx', "wb")
        f_tune.write(header.encode('utf-8'))

    for i, unit in enumerate(tmx_file.unit_iter()):
        processed = False

        # Every unit ends up in exactly one file, so serialise it once up front
        unit_bytes = str(unit).replace("en-GB", "en-US").encode('utf-8')

        # Start a new train batch file every max_unit_count units
        if i % max_unit_count == 0:
            if f_train is not None:
                # Finish the previous batch and report the name of the file just closed
                f_train.write(footer.encode('utf-8'))
                f_train.close()
                print(f"Wrote {train_file_name}")

            current_batch_num = i // max_unit_count
            batch_count += 1
            print(f"Running batch {current_batch_num} (unit {i} of {len(tmx_file.units)})")

            train_file_name = 'train/' + master_tmx[:-4] + str(current_batch_num) + '_train.tmx'
            f_train = open(train_file_name, "wb")
            f_train.write(header.encode('utf-8'))

        if generate_tuning_file and tune_unit_count < max_tune_count:
            # Token count of the source sentence. unit.source is used rather than
            # unit.getid() because getid() returns the unit's tuid attribute when one is
            # present instead of the source text.
            token_count = len((unit.source or "").split())

            # Take the first max_tune_count units that meet the token criteria
            if min_tune_token_count <= token_count <= max_tune_token_count:
                f_tune.write(unit_bytes)
                tune_unit_count += 1
                processed = True

        if not processed and generate_test_file and test_unit_count < max_test_count:
            # Take every 100th unit (by position in the master file) for the test set
            if i % 100 == 0:
                f_test.write(unit_bytes)
                test_unit_count += 1
                processed = True

        if not processed:
            # Everything not taken for tune or test goes to the current train batch
            f_train.write(unit_bytes)
            train_unit_count += 1

    # Close the XML of every file that was opened
    if f_tune is not None:
        f_tune.write(footer.encode('utf-8'))

    if f_test is not None:
        f_test.write(footer.encode('utf-8'))

    # f_train stays None when the master file contains no units
    if f_train is not None:
        f_train.write(footer.encode('utf-8'))
finally:
    # Release the handles even if something above failed; closing twice is harmless
    for handle in (f_train, f_test, f_tune):
        if handle is not None:
            handle.close()


# train_unit_count is already the total across all batches
print(f"Ran {batch_count} batches. Generated {train_unit_count} train records")
print(f" Generated {test_unit_count} test records")
print(f" Generated {tune_unit_count} tune records")