In [None]:
! pip install fairseq bitarray fastBPE hydra-core omegaconf regex requests sacremoses subword_nmt sacrebleu==1.5.1
! nvidia-smi

In [None]:
! pip install transformers

In [2]:
! gdown 1wO3c5suEmAyZ-U3KNn_le8Qz_NmM6ru7
! unzip preprocessed_data.zip

Downloading...
From: https://drive.google.com/uc?id=1wO3c5suEmAyZ-U3KNn_le8Qz_NmM6ru7
To: /content/preprocessed_data.zip
  0% 0.00/2.52M [00:00<?, ?B/s]100% 2.52M/2.52M [00:00<00:00, 216MB/s]
Archive:  preprocessed_data.zip
   creating: preprocessed_data/
  inflating: preprocessed_data/train.fa  
  inflating: preprocessed_data/test.en  
  inflating: preprocessed_data/valid.en  
  inflating: preprocessed_data/train.en  
  inflating: preprocessed_data/test.fa  
  inflating: preprocessed_data/valid.fa  


In [3]:
def run_bash(shell_string):
    with open('script.sh', 'w') as file:
        file.write(shell_string)
    ! chmod 755 ./script.sh
    ! ./script.sh

# Tokenization + BPE

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]"], continuing_subword_prefix="@")
for lang in ["en", "fa"]:
    tokenizer.train(files=[f"preprocessed_data/train.{lang}", 
                        f"preprocessed_data/valid.{lang}", 
                        f"preprocessed_data/test.{lang}"], 
                    trainer=trainer)
    text = "hello mohsen fayyaz" if lang == "en" else "سلام محسن فیاض"
    output = tokenizer.encode(text)
    print(tokenizer.decode(output.ids))

hello mohs @en f @ayy @az
سلام محسن فیا @ض


In [7]:
fairseq_preprocess = """
rm -r data-bin/
TEXT=/content/preprocessed_data
fairseq-preprocess --source-lang en --target-lang fa \
    --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
    --destdir data-bin/data.tokenized.en-fa \
    --workers 20 \
    --bpe bert \
    --log-format json \
"""
run_bash(fairseq_preprocess)

2022-06-07 11:49:59 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, bf16=False, bpe='bert', checkpoint_shard_count=1, checkpoint_suffix='', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin/data.tokenized.en-fa', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format='json', log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, profile=False, quantization_config_path=None, scoring='bleu', seed=1, source_lang='en', srcdict=None, target_lang='fa', task='translation', tensorboard_logdir=None, testpref='/content/preprocessed_data/test', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtg

In [26]:
! rm -r checkpoints

In [27]:
fairseq_train = """
fairseq-train \
    data-bin/data.tokenized.en-fa \
    --arch transformer --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-remove-bpe \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --max-epoch 50 \
    --patience 10 \
    --save-dir checkpoints \
    # --bpe bert \
    # --fp16 \
    # --reset-optimizer \
    --batch-size 64
"""
run_bash(fairseq_train)
# 1>training.log 2>&1

2022-06-07 13:14:26 | INFO | fairseq_cli.train | Namespace(activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, all_gather_list_size=16384, arch='transformer', attention_dropout=0.0, batch_size=None, batch_size_valid=None, best_checkpoint_metric='bleu', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.0, cpu=False, criterion='label_smoothed_cross_entropy', cross_self_attention=False, curriculum=0, data='data-bin/data.tokenized.en-fa', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoder_attention_heads=8, decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=512, device_id=0, disable_validation=F

In [None]:
from tqdm.auto import tqdm
MAX_EPOCHS = 2
for i in tqdm(range(1, MAX_EPOCHS)):
    ! fairseq-generate data-bin/data.tokenized.en-fa --path checkpoints/checkpoint{i}.pt --batch-size 128 --beam 5 --remove-bpe --log-format json --tensorboard-logdir 123

In [33]:
! fairseq-generate data-bin/data.tokenized.en-fa --path checkpoints/checkpoint_best.pt --batch-size 128 --beam 5 --remove-bpe --eval-bleu --results-path generate_results

  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


In [32]:
! fairseq-generate data-bin/data.tokenized.en-fa --path checkpoints/checkpoint_best.pt --batch-size 128 --beam 5 --remove-bpe --eval-bleu

2022-06-07 14:44:24 | INFO | fairseq_cli.generate | Namespace(all_gather_list_size=16384, batch_size=128, batch_size_valid=128, beam=5, bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', constraints=None, cpu=False, criterion='cross_entropy', curriculum=0, data='data-bin/data.tokenized.en-fa', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoding_format=None, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', diverse_beam_groups=-1, diverse_beam_strength=0.5, diversity_rate=-1.0, empty_cache_freq=0, eval_bleu=True, eval_bleu_args=None, eval_bleu_detok='space', eval_bleu_detok_args=None, eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, fix_batches_to_gpus=Fals