# Data preparation

Install libraries

In [None]:
# Fetch the OpenNMT-py sources. Later cells run scripts from this checkout by
# relative path (tools/learn_bpe.py, tools/apply_bpe.py, preprocess.py,
# train.py, translate.py, tools/multi-bleu.perl).
# NOTE(review): neither command pins a version, so a re-run gets whatever is
# current. This notebook relies on a top-level preprocess.py, which presumably
# exists only in older OpenNMT-py releases -- confirm, then pin both the clone
# and the pip install to the same release.
!git clone https://github.com/OpenNMT/OpenNMT-py
!pip install OpenNMT-py

Load libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import datetime

function to save text files

In [None]:
def saveFile(path, textFile):
  """Write an iterable of sentences to `path`, one sentence per line.

  Every entry, including the last one, is followed by a CRLF terminator,
  so an empty iterable produces an empty file. Any existing file at `path`
  is overwritten.

  Args:
    path: destination file path.
    textFile: iterable of sentences (list, pandas Series, generator, ...).
      Entries are converted with str(), as mergeSynthSentences does, so
      non-string values no longer raise TypeError.
  """
  # Single join instead of repeated string concatenation; works for any
  # iterable because nothing is sliced or indexed.
  outText = "".join(str(r) + "\r\n" for r in textFile)

  # encoding: the corpus contains non-ASCII characters, so do not depend on
  # the platform default. newline='': write the CRLF exactly as built instead
  # of letting the text layer translate the "\n" a second time.
  with open(path, 'w', encoding='utf-8', newline='') as file:
    file.write(outText)

Functions to apply byte-pair encoding (BPE)

In [None]:
# Spanish to Bribri
def bpSpanishBribri():
  """Learn and apply BPE for the Spanish -> Bribri direction.

  Spanish is the source side and Bribri the target side. Codes are learned
  separately for each side from the training files only (with `-s 10000`)
  and written to OpenNMT-py/data/src.code and OpenNMT-py/data/tgt.code.
  The codes are then applied to the train/val/test source files and to the
  train/val target files.

  The output file names do not encode the direction: bpBribriSpanish()
  writes to the same paths, so each call overwrites the other's results.
  """

  # Learn one set of codes per side from the training data.
  !python OpenNMT-py/tools/learn_bpe.py -i '/content/brbnmt-train-spn.txt' -o OpenNMT-py/data/src.code -s 10000
  !python OpenNMT-py/tools/learn_bpe.py -i '/content/brbnmt-train-brb.txt' -o OpenNMT-py/data/tgt.code -s 10000

  # Segment the files; no segmented test file is written for the target side.
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/src.code -i '/content/brbnmt-train-spn.txt' -o OpenNMT-py/data/src-train-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/src.code -i '/content/brbnmt-val-spn.txt' -o OpenNMT-py/data/src-val-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/src.code -i '/content/brbnmt-test-spn.txt' -o OpenNMT-py/data/src-test-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/tgt.code -i '/content/brbnmt-train-brb.txt' -o OpenNMT-py/data/tgt-train-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/tgt.code -i '/content/brbnmt-val-brb.txt' -o OpenNMT-py/data/tgt-val-bpe.txt

# Bribri to Spanish
def bpBribriSpanish():
  """Learn and apply BPE for the Bribri -> Spanish direction.

  Bribri is the source side and Spanish the target side. Codes are learned
  separately for each side from the training files only (with `-s 10000`)
  and written to OpenNMT-py/data/src.code and OpenNMT-py/data/tgt.code.
  The codes are then applied to the train/val/test source files and to the
  train/val target files.

  The output file names do not encode the direction: bpSpanishBribri()
  writes to the same paths, so each call overwrites the other's results.
  """

  # Learn one set of codes per side from the training data.
  !python OpenNMT-py/tools/learn_bpe.py -i '/content/brbnmt-train-brb.txt' -o OpenNMT-py/data/src.code -s 10000
  !python OpenNMT-py/tools/learn_bpe.py -i '/content/brbnmt-train-spn.txt' -o OpenNMT-py/data/tgt.code -s 10000

  # Segment the files; no segmented test file is written for the target side.
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/src.code -i '/content/brbnmt-train-brb.txt' -o OpenNMT-py/data/src-train-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/src.code -i '/content/brbnmt-val-brb.txt' -o OpenNMT-py/data/src-val-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/src.code -i '/content/brbnmt-test-brb.txt' -o OpenNMT-py/data/src-test-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/tgt.code -i '/content/brbnmt-train-spn.txt' -o OpenNMT-py/data/tgt-train-bpe.txt
  !python OpenNMT-py/tools/apply_bpe.py -c OpenNMT-py/data/tgt.code -i '/content/brbnmt-val-spn.txt' -o OpenNMT-py/data/tgt-val-bpe.txt

functions to pre-process pairs

In [None]:
def preprocessPairs():
  """Run OpenNMT-py's preprocess.py on the BPE-segmented train/val files.

  Reads OpenNMT-py/data/{src,tgt}-{train,val}-bpe.txt (written by the BPE
  helpers) and saves the result under the prefix OpenNMT-py/data/demo.
  `-overwrite` is passed so that repeated calls within one experiment
  replace the previous output.
  """
  !python OpenNMT-py/preprocess.py -train_src OpenNMT-py/data/src-train-bpe.txt -train_tgt OpenNMT-py/data/tgt-train-bpe.txt -valid_src OpenNMT-py/data/src-val-bpe.txt -valid_tgt OpenNMT-py/data/tgt-val-bpe.txt -save_data OpenNMT-py/data/demo -overwrite

train a model

In [None]:
def trainNMT(trainSteps, validationSteps, checkpointSteps, logPath):
  """Train a Transformer model with OpenNMT-py's train.py.

  Uses the data saved under OpenNMT-py/data/demo and writes checkpoints
  under the prefix OpenNMT-py/data/model/model. The architecture and
  optimiser flags (6 layers, size 512, 8 heads, feed-forward 2048, Adam
  with noam decay, ...) are fixed in the command; only the arguments below
  vary. IPython substitutes each `$name` with the Python variable of that
  name before running the command.

  Args:
    trainSteps: value for -train_steps.
    validationSteps: value for -valid_steps.
    checkpointSteps: value for -save_checkpoint_steps.
    logPath: value for -log_file.

  NOTE(review): -warmup_steps is fixed at 8000, while the driver cell at
  the end of this notebook trains for 4000 steps -- confirm this is intended.
  """

  !python OpenNMT-py/train.py -data OpenNMT-py/data/demo -save_model OpenNMT-py/data/model/model \
        -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 -encoder_type transformer \
        -decoder_type transformer -position_encoding -train_steps $trainSteps -max_generator_batches 2 -dropout 0.1 \
        -batch_size 4096 -batch_type tokens -normalization tokens -accum_count 2 -optim adam -adam_beta2 0.998 \
        -decay_method noam -warmup_steps 8000 -learning_rate 2 -max_grad_norm 0 -param_init 0 -param_init_glorot \
        -label_smoothing 0.1 -valid_steps $validationSteps -save_checkpoint_steps $checkpointSteps -world_size 1 -gpu_rank 0  \
        -log_file $logPath

function to test the model

In [None]:
def produceNMTTranslations(modelPath, sourcePath, logPath):
  """Translate a file with a trained checkpoint using translate.py.

  The predictions always go to OpenNMT-py/pred.txt, so each call
  overwrites the previous predictions; getBlue() reads that same file.

  Args:
    modelPath: checkpoint passed to -model.
    sourcePath: file to translate, passed to -src.
    logPath: value for -log_file.
  """

  !python OpenNMT-py/translate.py -model $modelPath	\
    -src $sourcePath -output OpenNMT-py/pred.txt -replace_unk -verbose \
    -log_file $logPath

get BLEU values

In [None]:
def getBlue(sourcePath, logPath):
  """Score OpenNMT-py/pred.txt with multi-bleu.perl and save the output.

  Args:
    sourcePath: file passed to multi-bleu.perl as its argument. Despite the
      name, callers in this notebook pass the test file of the language
      being translated *into*, i.e. the reference translations.
    logPath: file that receives the script's standard output (overwritten).
  """
  
  # Delete every "@@ " from the predictions in place (presumably the BPE
  # continuation marker) before scoring.
  !sed -i "s/@@ //g"  OpenNMT-py/pred.txt
  !perl  OpenNMT-py/tools/multi-bleu.perl $sourcePath < OpenNMT-py/pred.txt > $logPath

generate synthetic, back translated sentences

In [None]:
def generateSynth(modelPath, realSentences, outputPath):
  """Translate real sentences into synthetic ones with translate.py.

  Same command as produceNMTTranslations(), except that the output path is
  chosen by the caller and no -log_file is passed.

  Args:
    modelPath: checkpoint passed to -model.
    realSentences: file of sentences to translate, passed to -src.
    outputPath: file that receives the translations, passed to -output.
  """
  !python OpenNMT-py/translate.py -model $modelPath	\
    -src $realSentences -output $outputPath -replace_unk -verbose

merge synthetic sentences with real sentences

In [None]:
def mergeSynthSentences(syntheticSet):
  """Rebuild the training files as real sentences followed by synthetic ones.

  Overwrites /content/brbnmt-train-brb.txt with the original Bribri training
  sentences followed by the synthetic Bribri sentences, and
  /content/brbnmt-train-spn.txt with the original Spanish training sentences
  followed by the Spanish sentences that were back-translated. Line i of one
  output file corresponds to line i of the other.

  Args:
    syntheticSet: path of the file with synthetic Bribri sentences, one per
      line, in the same order as /content/brbnmt-bt-spn.txt.

  Raises:
    ValueError: if the two sides of either pair of inputs have a different
      number of lines, because the merged bitext would be misaligned.
  """

  def readLines(path):
    # The inputs hold one sentence per line, so read them as plain text.
    # pd.read_fwf treats them as fixed-width tables: it infers column
    # boundaries from the first rows (which can truncate or split later
    # sentences) and skips blank lines (which shifts the alignment).
    with open(path, encoding='utf-8') as file:
      return [line.rstrip("\r\n") for line in file]

  brb_train = readLines('/content/brbnmt-train-brb-original.txt')
  spa_train = readLines('/content/brbnmt-train-spn-original.txt')
  brb_synth = readLines(syntheticSet)
  spanishBT = readLines('/content/brbnmt-bt-spn.txt')

  # Fail loudly instead of silently training on shifted sentence pairs.
  if len(brb_train) != len(spa_train):
    raise ValueError(
        "Real training sets are misaligned: " + str(len(brb_train)) +
        " Bribri lines vs " + str(len(spa_train)) + " Spanish lines")
  if len(brb_synth) != len(spanishBT):
    raise ValueError(
        "Synthetic set " + str(syntheticSet) + " has " + str(len(brb_synth)) +
        " lines but /content/brbnmt-bt-spn.txt has " + str(len(spanishBT)))

  # List concatenation replaces Series.append, which was removed in pandas 2.0.
  # saveFile gives these files the same line terminators as every other
  # corpus file; the old slice left a dangling "\r" at the end.
  saveFile('/content/brbnmt-train-brb.txt', brb_train + brb_synth)
  saveFile('/content/brbnmt-train-spn.txt', spa_train + spanishBT)

splitting

In [None]:
def sampleSplit(pairs, useSynthData, portionRealData, trainStepsForModels, percentBT):
  """Split the bitext, write every split to /content, and report the sizes.

  Args:
    pairs: DataFrame with a 'Source' column (saved to the Bribri files) and
      a 'Target' column (saved to the Spanish files).
    useSynthData: 1 to first set aside a back-translation portion, 0 to
      split all pairs into train/val/test.
    portionRealData: fraction of each real split that is kept (leading rows);
      1 keeps everything.
    trainStepsForModels: only echoed in the metadata report.
    percentBT: share of `pairs` set aside for back-translation; only used
      when useSynthData == 1.
  """

  # The test share is taken first. The validation share is rescaled because
  # it is taken from what remains after the test split.
  testShare = 0.1
  valShare = testShare / (1-testShare)

  # Choose the columns to split; with synthetic data the back-translation
  # portion is carved out first.
  if useSynthData == 1:
    auth, bt = train_test_split(pairs, test_size=percentBT, random_state=1)
    sources, targets = auth['Source'], auth['Target']
  elif useSynthData == 0:
    sources, targets = pairs['Source'], pairs['Target']

  brbRest, b_test, spaRest, s_test = train_test_split(sources, targets, test_size=testShare, random_state=1)
  b_train, b_val, s_train, s_val = train_test_split(brbRest, spaRest, test_size=valShare, random_state=1)

  # Optionally keep only the leading portion of every real split.
  if portionRealData != 1:
    def keepPortion(split):
      return split[:int(len(split)*portionRealData)]
    b_train, b_val, b_test = keepPortion(b_train), keepPortion(b_val), keepPortion(b_test)
    s_train, s_val, s_test = keepPortion(s_train), keepPortion(s_val), keepPortion(s_test)

  totalReal = len(b_train) + len(b_val) + len(b_test)

  # Files to write, in order. The training sets are saved twice: the
  # "-original" copies are what mergeSynthSentences reads back later.
  outputs = [
    ("/content/brbnmt-train-brb-original.txt", b_train),
    ("/content/brbnmt-train-brb.txt", b_train),
    ("/content/brbnmt-test-brb.txt", b_test),
    ("/content/brbnmt-val-brb.txt", b_val),
    ("/content/brbnmt-train-spn.txt", s_train),
    ("/content/brbnmt-train-spn-original.txt", s_train),
    ("/content/brbnmt-test-spn.txt", s_test),
    ("/content/brbnmt-val-spn.txt", s_val),
  ]

  if useSynthData == 1:
    # Cap the back-translation set at the number of real samples kept.
    bt = bt[:totalReal]
    outputs.append(("/content/brbnmt-bt-spn.txt", bt['Target']))
    outputs.append(("/content/brbnmt-bt-brb-ref.txt", bt['Source']))

  for outPath, sentences in outputs:
    saveFile(outPath, sentences)

  # Human-readable summary of the run configuration and split sizes.
  metaParts = [
    "\n",
    "Uses synthetic data:            " + str(useSynthData) + "\n",
    "Training steps:                 " + str(trainStepsForModels) + "\n\n",
    "Total real samples:             " + str(totalReal) + "\n\n",
    "Bribri training samples:        " + str(len(b_train)) + "\n",
    "Spanish training samples:       " + str(len(s_train)) + "\n",
    "Bribri validation samples:      " + str(len(b_val)) + "\n",
    "Spanish validation samples:     " + str(len(s_val)) + "\n",
    "Bribri testing samples:         " + str(len(b_test)) + "\n",
    "Spanish testing samples:        " + str(len(s_test)) + "\n",
  ]
  if useSynthData == 1:
    metaParts.append("Spanish backtraslation samples: " + str(len(bt)) + "\n")
  outputMetadata = "".join(metaParts)

  with open("/content/brbnmt-meta.txt", 'w') as file:
    file.write(outputMetadata)

  print(outputMetadata)

training as a function

In [None]:
def runTraining(useSynthData, portionRealData, inTrainSteps, inValSteps, inCheckpointSteps, inModelName, inPercentBT):
  """Run one complete experiment and download its concatenated log.

  A Bribri -> Spanish model is always trained and scored on real data
  (step 2). When useSynthData == 1, Spanish -> Bribri models are also trained
  to back-translate the held-out Spanish sentences, and the
  Bribri -> Spanish model is retrained on real + synthetic data, twice
  (steps 1, 3, 4 and 5).

  Args:
    useSynthData: 1 to run the back-translation steps, 0 for the baseline
      only.
    portionRealData: fraction of each real split to keep (forwarded to
      sampleSplit).
    inTrainSteps: forwarded to trainNMT as trainSteps for every model.
    inValSteps: forwarded to trainNMT as validationSteps.
    inCheckpointSteps: forwarded to trainNMT as checkpointSteps.
    inModelName: checkpoint path handed to translate.py after every training
      run. NOTE(review): trainNMT always saves under the prefix
      OpenNMT-py/data/model/model, so this presumably has to name the
      checkpoint produced for inTrainSteps -- confirm.
    inPercentBT: share of the pairs set aside for back-translation; also
      used to build the log file name.

  NOTE(review): every translation call below is given the raw
  /content/brbnmt-*.txt files, while the BPE helpers also write
  src-test-bpe.txt, which nothing in this notebook reads -- confirm that
  translating unsegmented input is intended.
  """

  trainStepsForModels = inTrainSteps
  validationStepsForModels = inValSteps
  checkpointStepsForModels = inCheckpointSteps
  modelName = inModelName

  percentBT = inPercentBT
  #sameBase = inSameBase

  # Read bitext from GitHub
  url = 'https://raw.githubusercontent.com/rolandocoto/bribri-coling2020/main/bri-spa-pair-sample-20201101.csv'
  allData = pd.read_csv(url)

  # Remove Bribri lines that have no Spanish equivalent (dropna drops any row
  # with a missing value in either column)
  pairs = allData.dropna()

  # split into backtranslation, training, testing and validation sets
  sampleSplit(pairs, useSynthData, portionRealData, inTrainSteps, percentBT)

  if (useSynthData == 1):

    # step 1: Generate Spanish-Bribri model that we will use to make synthetic Bribri sentences.
    bpSpanishBribri()
    preprocessPairs()
    trainNMT(trainStepsForModels, validationStepsForModels, checkpointStepsForModels, '/content/spn-brb-log1.txt')
    produceNMTTranslations(modelName, '/content/brbnmt-test-spn.txt', '/content/spn-brb-log2.txt')
    getBlue('/content/brbnmt-test-brb.txt', '/content/spn-brb-log3.txt')
    generateSynth(modelName, '/content/brbnmt-bt-spn.txt', '/content/synth-bribri-1.txt')

  # step 2: Train a Bribri-Spanish model with just real data, to see the baseline performance.

  bpBribriSpanish()
  preprocessPairs()
  trainNMT(trainStepsForModels, validationStepsForModels, checkpointStepsForModels, '/content/spn-brb-log4.txt')
  produceNMTTranslations(modelName, '/content/brbnmt-test-brb.txt', '/content/spn-brb-log5.txt')
  getBlue('/content/brbnmt-test-spn.txt', '/content/spn-brb-log6.txt')

  if (useSynthData == 1):

    # Step 3: Merge Bribri synthetic data with Bribri real data and re-train a model with the real and synthetic Bribri combined.
    mergeSynthSentences('/content/synth-bribri-1.txt')
    bpBribriSpanish()
    preprocessPairs()
    trainNMT(trainStepsForModels, validationStepsForModels, checkpointStepsForModels, '/content/spn-brb-log7.txt')
    produceNMTTranslations(modelName, '/content/brbnmt-test-brb.txt', '/content/spn-brb-log8.txt')
    getBlue('/content/brbnmt-test-spn.txt', '/content/spn-brb-log9.txt')

    # Step 4: Make a new model for Spanish-Bribri training and regenerate the synthetic Bribri sentences
    # (the training files still hold the real + synthetic data merged in step 3)
    bpSpanishBribri()
    preprocessPairs()
    trainNMT(trainStepsForModels, validationStepsForModels, checkpointStepsForModels, '/content/spn-brb-log10.txt')
    produceNMTTranslations(modelName, '/content/brbnmt-test-spn.txt', '/content/spn-brb-log11.txt')
    getBlue('/content/brbnmt-test-brb.txt', '/content/spn-brb-log12.txt')
    generateSynth(modelName, '/content/brbnmt-bt-spn.txt', '/content/synth-bribri-2.txt')

    # Step 5: Merge second Bribri synthetic data with Bribri real data and re-train a model with real and synthetic Bribri combined.
    mergeSynthSentences('/content/synth-bribri-2.txt')
    bpBribriSpanish()
    preprocessPairs()
    trainNMT(trainStepsForModels, validationStepsForModels, checkpointStepsForModels, '/content/spn-brb-log13.txt')
    produceNMTTranslations(modelName, '/content/brbnmt-test-brb.txt', '/content/spn-brb-log14.txt')
    getBlue('/content/brbnmt-test-spn.txt', '/content/spn-brb-log15.txt')

  # step 6: get report

  # Colab-only module, imported here rather than with the top-level imports.
  from google.colab import files
  # Removes a file literally named logs.txt; the report itself is appended to
  # logName below. NOTE(review): nothing in this notebook writes logs.txt --
  # confirm this is still needed.
  !rm logs.txt

  # Report name: back-translation share as a percentage plus a timestamp with
  # minute resolution.
  now = datetime.now().strftime("%Y%m%d-%H%M")
  logName = "logs-BT" +  str(int(percentBT*100)) + "-" + now + ".txt"

  # Concatenate a header line and the matching file for every stage, appending
  # (>>) to the report. The file names are relative although the files were
  # written under /content, so this presumably relies on /content being the
  # working directory -- confirm.
  if (useSynthData == 1):

    !cat <(echo '--- Log metadata ---') brbnmt-meta.txt \
      <(echo '--- Training first SPA-BRI model ---') spn-brb-log1.txt \
      <(echo '--- Test first SPA-BRI model ---') spn-brb-log2.txt \
      <(echo '--- BLEU first SPA-BRI model ---') spn-brb-log3.txt \
      <(echo '--- Training BRI-SPA realData model ---') spn-brb-log4.txt \
      <(echo '--- Test BRI-SPA realData model ---') spn-brb-log5.txt \
      <(echo '--- BLEU BRI-SPA realData model ---') spn-brb-log6.txt \
      <(echo '--- Training BRI-SPA synth1 model ---') spn-brb-log7.txt \
      <(echo '--- Test BRI-SPA synth1 model ---') spn-brb-log8.txt \
      <(echo '--- BLEU BRI-SPA synth1 model ---') spn-brb-log9.txt \
      <(echo '--- Training second SPA-BRI model ---') spn-brb-log10.txt \
      <(echo '--- Test second SPA-BRI model ---') spn-brb-log11.txt \
      <(echo '--- BLEU second SPA-BRI model ---') spn-brb-log12.txt \
      <(echo '--- Training BRI-SPA synth2 model ---') spn-brb-log13.txt \
      <(echo '--- Test BRI-SPA synth2 model ---') spn-brb-log14.txt \
      <(echo '--- BLEU BRI-SPA synth2 model ---') spn-brb-log15.txt \
      <(echo '--- Bribri realData training sentences ---') brbnmt-train-brb-original.txt \
      <(echo '--- Bribri synth1 sentences ---') synth-bribri-1.txt \
      <(echo '--- Bribri synth2 sentences ---') synth-bribri-2.txt \
      <(echo '--- Spanish training sentences ---') brbnmt-train-spn.txt \
      >> $logName

  else:

    !cat <(echo '--- Log metadata ---') brbnmt-meta.txt \
      <(echo '--- Training BRI-SPA realData model ---') spn-brb-log4.txt \
      <(echo '--- Test BRI-SPA realData model ---') spn-brb-log5.txt \
      <(echo '--- BLEU BRI-SPA realData model ---') spn-brb-log6.txt \
      >> $logName


  # Trigger a browser download of the report.
  files.download(logName)

run the code a couple of times

In [None]:
# isaac: condition 2, repeated 5 times (no synthetic data, 3K real sentences)

for i in range(5):
  runTraining(
      useSynthData=0,
      portionRealData=0.5,
      inTrainSteps=4000,
      inValSteps=4000,
      inCheckpointSteps=4000,
      inModelName='OpenNMT-py/data/model/model_step_4000.pt',
      inPercentBT=-1,
  )