In [0]:
%%bash
# Fetch the fairseq sources and install the tokenization/BPE helper packages.
# The clone is skipped when the checkout already exists, so re-running the
# cell does not make `git clone` report an error for the existing directory.
if [ ! -d fairseq ]; then
    git clone https://github.com/pytorch/fairseq.git
fi

cd fairseq
pip install fastBPE regex requests sacremoses subword_nmt

In [0]:
%%bash
# Install the cloned fairseq checkout in editable (development) mode.
cd fairseq/

pip install -e .

In [0]:
# Mount Google Drive; later cells read and write under /content/gdrive.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')


In [5]:
%%bash
# Tokenize the Hi-En parallel corpus, learn a joint BPE model and clean the result.
# Data present in 'fairseq/hi-en/pruned_train.hi' and 'fairseq/hi-en/pruned_train.en'
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
#
# The `%%bash` cell magic has to be the first line of the cell, so this
# description is written as shell comments rather than as a Python string
# placed above the magic.
cd fairseq/ || exit 1

# Clone the helper repositories only when they are missing, so the cell can be
# re-run without `git clone` complaining about existing directories.
if [ ! -d mosesdecoder ]; then
    echo 'Cloning Moses github repository (for tokenization scripts)...'
    git clone https://github.com/moses-smt/mosesdecoder.git
fi

if [ ! -d subword-nmt ]; then
    echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
    git clone https://github.com/rsennrich/subword-nmt.git
fi

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=20000  # value passed to `learn_bpe.py -s`

CORPORA=('pruned_train')

if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    # Exit with a non-zero status so the notebook reports the failure; a bare
    # `exit` would return the status of the preceding `echo`, i.e. success.
    exit 1
fi

src=hi
tgt=en
lang=hi-en
prep=iitb_hi_en
tmp=$prep/tmp
orig=orig

mkdir -p "$orig" "$tmp" "$prep"

cp hi-en/* "$orig"/

echo "pre-processing train data..."
for l in $src $tgt; do
    tok_file=$tmp/train.tags.$lang.tok.$l
    # The inner loop appends to this file; remove any copy left by a previous
    # run so that re-running the cell does not duplicate the corpus.
    rm -f "$tok_file"
    for f in "${CORPORA[@]}"; do
        perl $NORM_PUNC $l < "$orig/$f.$l" | \
            perl $REM_NON_PRINT_CHAR | \
            perl $TOKENIZER -threads 8 -a -l $l >> "$tok_file"
    done
done

# Every 1333rd tokenized line goes to the validation split, the rest to train.
echo "splitting train and valid..."
for l in $src $tgt; do
    awk '{if (NR%1333 == 0)  print $0;}' "$tmp/train.tags.$lang.tok.$l" > "$tmp/valid.$l"
    awk '{if (NR%1333 != 0)  print $0;}' "$tmp/train.tags.$lang.tok.$l" > "$tmp/train.$l"
done

# Concatenate both languages so a single BPE code file is learned from them.
TRAIN=$tmp/train.$lang
BPE_CODE=$prep/code
rm -f "$TRAIN"
for l in $src $tgt; do
    cat "$tmp/train.$l" >> "$TRAIN"
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < "$TRAIN" > "$BPE_CODE"

for l in $src $tgt; do
    for f in train.$l valid.$l; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c "$BPE_CODE" < "$tmp/$f" > "$tmp/bpe.$f"
    done
done

# Run the Moses corpus cleaner on the BPE-encoded pairs with ratio 1.5 and
# length bounds 1-250; the cleaned files are written to $prep/{train,valid}.
perl $CLEAN -ratio 1.5 "$tmp/bpe.train" $src $tgt "$prep/train" 1 250
perl $CLEAN -ratio 1.5 "$tmp/bpe.valid" $src $tgt "$prep/valid" 1 250

echo 'Done'

Cloning Moses github repository (for tokenization scripts)...
Cloning Subword NMT repository (for BPE pre-processing)...
pre-processing train data...
splitting train and valid...
learn_bpe.py on iitb_hi_en/tmp/train.hi-en...
apply_bpe.py to train.hi...
apply_bpe.py to valid.hi...
apply_bpe.py to train.en...
apply_bpe.py to valid.en...
Done


Cloning into 'mosesdecoder'...
Cloning into 'subword-nmt'...
Tokenizer Version 1.1
Language: hi
Number of threads: 8
Tokenizer Version 1.1
Language: en
Number of threads: 8
clean-corpus.perl: processing iitb_hi_en/tmp/bpe.train.hi & .en to iitb_hi_en/train, cutoff 1-250, ratio 1.5
..........(100000)..........(200000)..........(300000)..........(400000)..........(500000)..........(600000)..........(700000)........
Input sentences: 787507  Output sentences:  479248
clean-corpus.perl: processing iitb_hi_en/tmp/bpe.valid.hi & .en to iitb_hi_en/valid, cutoff 1-250, ratio 1.5

Input sentences: 591  Output sentences:  360


In [6]:
%%bash
# Run fairseq-preprocess on the cleaned train/valid files and write the
# result to data-bin/iitb_hi_en.
cd fairseq/

DATA_PREFIX=iitb_hi_en
fairseq-preprocess \
    --source-lang hi \
    --target-lang en \
    --trainpref $DATA_PREFIX/train \
    --validpref $DATA_PREFIX/valid \
    --destdir data-bin/iitb_hi_en

2020-04-30 20:16:22 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, bpe=None, checkpoint_suffix='', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin/iitb_hi_en', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer='nag', padding_factor=8, quantization_config_path=None, seed=1, source_lang='hi', srcdict=None, target_lang='en', task='translation', tensorboard_logdir='', testpref=None, tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, trainpref='iitb_hi_en/train', user_dir=None, validpref='iitb_hi_en/valid', workers=1)
2020-04-30 20:17:59 | INFO | fairse

In [0]:
# Training Hi-En fairseq model
!mkdir hi-en-best/
!cd fairseq/ && python train.py data-bin/iitb_hi_en --label-smoothing 0.1 --adam-betas '(0.9,0.98)' \
    --optimizer adam -s hi -t en --criterion label_smoothed_cross_entropy --lr 0.0005 --lr-scheduler inverse_sqrt --clip-norm 0.1 --dropout 0.2 --max-tokens 2000 \
    --arch transformer_vaswani_wmt_en_fr_big --save-dir /content/gdrive/My\ Drive/hi-en-best --max-epoch 12 | tee -a ../hi-en-best/training.log

In [1]:
# Restart runtime if en-fr load error persists
pip install fairseq



In [3]:
# Load the trained Hi-En checkpoint and the pre-trained En-Fr hub model.
import os

import torch
from fairseq.models import FairseqEncoderDecoderModel
from fairseq.models.transformer import TransformerModel

# Work from /content for the rest of the notebook.
os.chdir('/content/')

# Hi-En: last checkpoint saved on Drive, with the binarized data directory
# and the BPE code file produced by the preparation cells.
hi2en = TransformerModel.from_pretrained(
  model_name_or_path='/content/gdrive/My Drive/hi-en-best/',
  checkpoint_file='checkpoint_last.pt',
  data_name_or_path='/content/fairseq/data-bin/iitb_hi_en',
  bpe='subword_nmt',
  bpe_codes='/content/fairseq/iitb_hi_en/code',
)

# En-Fr: fetched through torch.hub.
en2fr = torch.hub.load(
  'pytorch/fairseq',
  'transformer.wmt14.en-fr',
  tokenizer='moses',
  bpe='subword_nmt',
)

Downloading: "https://github.com/pytorch/fairseq/archive/master.zip" to /root/.cache/torch/hub/master.zip
100%|██████████| 2316140317/2316140317 [01:09<00:00, 33486363.57B/s]


In [0]:

from fairseq.hub_utils import GeneratorHubInterface
from fairseq.tasks.translation import TranslationTask

# Select the sub-modules by attribute name instead of by their position in
# `children()`, which silently depends on the module registration order.
hi2en_encoder = hi2en.models[0].encoder
en2fr_decoder = en2fr.models[0].decoder


class hi_fr_translator(FairseqEncoderDecoderModel):
  """Plain FairseqEncoderDecoderModel subclass.

  The constructor is inherited unchanged and takes (encoder, decoder).
  """


class hi_fr_task(TranslationTask):
  """Plain TranslationTask subclass.

  The constructor is inherited unchanged and takes (args, src_dict, tgt_dict).
  """


# Task: En-Fr arguments with the Hi-En source dictionary and the En-Fr
# target dictionary.
hi2fr_task = hi_fr_task(en2fr.args, hi2en.task.source_dictionary, en2fr.task.target_dictionary)

# Model: encoder taken from the Hi-En model, decoder from the En-Fr model.
hi2fr = hi_fr_translator(hi2en_encoder, en2fr_decoder)

# NOTE(review): the generator is configured from en2fr.args, so input
# tokenization/BPE presumably follow the En-Fr settings rather than the Hi-En
# BPE codes loaded for hi2en -- confirm this is intended.
gen_obj = GeneratorHubInterface(en2fr.args, hi2fr_task, [hi2fr])

In [0]:
# Generate translation for test set
# Both files are opened in one `with` statement so the input handle is closed
# as well, and UTF-8 is requested explicitly because the data is Hindi/French
# text and should not depend on the runtime's default encoding.
test_path = '/content/gdrive/My Drive/hi-en-best/Tatoeba.fr-hi.hi'
results_path = '/content/gdrive/My Drive/hi-en-best/test_results.txt'

with open(test_path, 'r', encoding='utf-8') as test_file, \
    open(results_path, 'w', encoding='utf-8') as out_file:
  # Iterate lazily over the file instead of materializing it with readlines().
  for test_line in test_file:
    # Drop the line terminator so only the sentence itself is translated.
    fr = gen_obj.translate(test_line.rstrip('\n'), beam=3)
    out_file.write(fr+'\n')