# HW5 - Fairseq with Fconv architecture

### Install dependencies

In [None]:
%%bash
# Helper packages used alongside fairseq for tokenization / BPE.
# The subword-nmt project is published on PyPI as `subword_nmt`; a misspelled
# name makes this whole `pip install` line fail, so none of the packages on it
# would be installed.
pip install fastBPE regex requests sacremoses subword_nmt
# Install fairseq from source, pinned to a specific commit, in editable mode.
git clone https://github.com/pytorch/fairseq.git
cd fairseq && git checkout 9a1c497
pip install --editable ./
# Remaining Python dependencies installed for this notebook.
pip install bitarray
pip install hazm
pip install configargparse
pip install torchtext==0.4

In [None]:
# Install fairseq and parsivar from PyPI.
# NOTE(review): the first install cell already installs fairseq in editable mode
# from a pinned git checkout; this line may replace it with the current PyPI
# release — confirm which version the rest of the notebook is meant to use.
!pip install fairseq
!pip install parsivar

In [34]:
# Install sacremoses by itself (it is also listed in the first install cell).
# Presumably needed for the `--tokenizer moses` option passed to the fairseq
# commands further down — TODO confirm.
!pip install sacremoses

Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53


## Mounting Drive

In [3]:
# Mount Google Drive at /content/drive; the dataset files copied later in the
# notebook are read from /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import libraries

In [None]:
from __future__ import unicode_literals
from parsivar import *
import pandas as pd
import numpy as np
import math
import torch
import re
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
# Download NLTK's "punkt" resource. Presumably required by nltk.word_tokenize,
# which the English preprocessing step calls — TODO confirm.
nltk.download("punkt")

In [None]:
# Copy the English (.en) and Farsi (.fa) train/test/valid files from the mounted
# Drive folder into the current working directory, where the preprocessing
# functions open them by bare filename.
!cp  "/content/drive/MyDrive/Data_Colab/NLP_CA5_Data/train.en" .
!cp "/content/drive/MyDrive/Data_Colab/NLP_CA5_Data/test.en" .
!cp "/content/drive/MyDrive/Data_Colab/NLP_CA5_Data/valid.en" .
!cp "/content/drive/MyDrive/Data_Colab/NLP_CA5_Data/train.fa" .
!cp "/content/drive/MyDrive/Data_Colab/NLP_CA5_Data/test.fa" .
!cp "/content/drive/MyDrive/Data_Colab/NLP_CA5_Data/valid.fa" .

## Normalization - Preprocessing

In [21]:
from parsivar import Normalizer
from parsivar import SpellCheck
# Module-level parsivar objects shared by farsi_preprocess(): one normalizer and
# one tokenizer, built once instead of per line.
# NOTE(review): `Tokenizer` is not imported in this cell; it presumably comes
# from the earlier `from parsivar import *` — confirm.
parsivar_normalizer = Normalizer(pinglish_conversion_needed=True)
parsivar_tokenizer = Tokenizer()

def no_space(char, prev_char):
    """Tell whether `char` is punctuation that is glued to the preceding character.

    Returns True only when `char` is one of `,`, `.`, `!`, `?` and `prev_char`
    is not a single space; every other combination yields False.
    """
    if char not in {',', '.', '!', '?'}:
        return False
    return prev_char != ' '

def farsi_preprocess():
  """Preprocess the Farsi train/valid/test files line by line.

  For each of "train.fa", "valid.fa" and "test.fa" in the current directory:
    1. insert a space before `,`, `.`, `!` or `?` when the character before it
       in the raw line is not a space (see `no_space`);
    2. replace parentheses, curly braces and hyphens with a space;
    3. strip trailing whitespace, normalize with `parsivar_normalizer`, split
       into words with `parsivar_tokenizer`, and re-join the words with spaces.

  The result is written to "preprocessed_<name>", one output line per input
  line, joined with newlines (no trailing newline).

  Files are opened with an explicit UTF-8 encoding so the Persian text is read
  and written the same way regardless of the platform's default locale.
  """
  farsi_files_to_be_preprocessed = ["train.fa", "valid.fa", "test.fa"]
  for filename in farsi_files_to_be_preprocessed:
    processed_lines = []
    with open(filename, encoding="utf-8") as source:
      for line in source:
        # The previous character is looked up in the *raw* line, so spaces
        # inserted earlier in this pass do not affect later decisions.
        spaced = ''.join(
            ' ' + char if i > 0 and no_space(char, line[i - 1]) else char
            for i, char in enumerate(line)
        )
        cleaned = re.sub('[)(}{-]', ' ', spaced)
        normalized = parsivar_normalizer.normalize(cleaned.rstrip())
        processed_lines.append(' '.join(parsivar_tokenizer.tokenize_words(normalized)))
    # The input handle is closed before the output file is created.
    with open(f"preprocessed_{filename}", 'w', encoding="utf-8") as target:
      target.write('\n'.join(processed_lines))

def english_preprocess():
  """Preprocess the English train/valid/test files line by line.

  For each of "train.en", "valid.en" and "test.en" in the current directory:
    1. insert a space before `,`, `.`, `!` or `?` when the character before it
       in the raw line is not a space (see `no_space`);
    2. replace parentheses, curly braces and hyphens with a space;
    3. lowercase the line, tokenize it with `nltk.word_tokenize`, and re-join
       the tokens with single spaces.

  The result is written to "preprocessed_<name>", one output line per input
  line, joined with newlines (no trailing newline).

  Files are opened with an explicit UTF-8 encoding so behavior does not depend
  on the platform's default locale.
  """
  english_files_to_be_preprocessed = ["train.en", "valid.en", "test.en"]
  for filename in english_files_to_be_preprocessed:
    processed_lines = []
    with open(filename, encoding="utf-8") as source:
      for line in source:
        # The previous character is looked up in the *raw* line, so spaces
        # inserted earlier in this pass do not affect later decisions.
        spaced = ''.join(
            ' ' + char if i > 0 and no_space(char, line[i - 1]) else char
            for i, char in enumerate(line)
        )
        cleaned = re.sub('[)(}{-]', ' ', spaced)
        # Lowercasing the whole line replaces the original
        # split(' ') / lower() / join(' ') round-trip, which yields the same text.
        processed_lines.append(' '.join(nltk.word_tokenize(cleaned.lower())))
    # The input handle is closed before the output file is created.
    with open(f"preprocessed_{filename}", 'w', encoding="utf-8") as target:
      target.write('\n'.join(processed_lines))

# Run both preprocessing passes; each writes preprocessed_{train,valid,test}.<lang>
# into the current working directory.
farsi_preprocess()
english_preprocess()

In [22]:
!fairseq-preprocess --source-lang en --target-lang fa --bpe byte_bpe --tokenizer moses --optimizer nag --trainpref /content/preprocessed_train --validpref /content/preprocessed_valid --testpref /content/preprocessed_test --destdir data-bin/custom.tokenized.en-fa

2022-06-09 10:34:07 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, bf16=False, bpe='byte_bpe', checkpoint_shard_count=1, checkpoint_suffix='', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data-bin/custom.tokenized.en-fa', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer='nag', padding_factor=8, profile=False, quantization_config_path=None, scoring='bleu', seed=1, source_lang='en', srcdict=None, target_lang='fa', task='translation', tensorboard_logdir=None, testpref='/content/preprocessed_test', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtg

In [16]:
#!rm -rf checkpoints/

In [9]:
######

In [23]:
# Create the directory that fairseq-train is given as --save-dir.
!mkdir -p checkpoints/fconv

In [24]:
# Train the fconv_iwslt_de_en architecture on the binarized en-fa data using
# only GPU 0: NAG optimizer, learning rate 0.01, gradient clip-norm 0.1,
# dropout 0.2, --max-tokens 4000, at most 30 epochs. Checkpoints are saved to
# checkpoints/fconv.
!CUDA_VISIBLE_DEVICES=0 fairseq-train /content/data-bin/custom.tokenized.en-fa \
    --optimizer nag --lr 0.01 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 --max-epoch 30 \
    --arch fconv_iwslt_de_en --save-dir checkpoints/fconv

2022-06-09 10:34:24 | INFO | fairseq_cli.train | Namespace(all_gather_list_size=16384, arch='fconv_iwslt_de_en', batch_size=None, batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=0.1, cpu=False, criterion='cross_entropy', curriculum=0, data='/content/data-bin/custom.tokenized.en-fa', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoder_attention='True', decoder_embed_dim=256, decoder_embed_path=None, decoder_layers='[(256, 3)] * 3', decoder_out_embed_dim=256, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', dropout=0.2, empty_cache_freq=0, encoder_embed_dim=256, encoder_embed_path=None, encoder_layers='[(256, 3)] * 4', eval_bleu=False, eval_bleu_args=Non

In [30]:
# Generate translations from the binarized data with the best checkpoint saved
# by training, using beam size 5 and a batch size of 512.
!fairseq-generate /content/data-bin/custom.tokenized.en-fa \
    --path checkpoints/fconv/checkpoint_best.pt \
    --batch-size 512 --beam 5

2022-06-09 10:51:03 | INFO | fairseq_cli.generate | Namespace(all_gather_list_size=16384, batch_size=512, batch_size_valid=512, beam=5, bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_shard_count=1, checkpoint_suffix='', constraints=None, cpu=False, criterion='cross_entropy', curriculum=0, data='/content/data-bin/custom.tokenized.en-fa', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoding_format=None, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', diverse_beam_groups=-1, diverse_beam_strength=0.5, diversity_rate=-1.0, empty_cache_freq=0, eval_bleu=False, eval_bleu_args=None, eval_bleu_detok='space', eval_bleu_detok_args=None, eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, fix_batches_

## Interactive translator

In [None]:
# Download the WMT14 en-fr fconv model archive from dl.fbaipublicfiles.com and
# unpack it into the current directory.
# NOTE(review): no later cell visible here references the extracted files (the
# interactive cell loads checkpoints/fconv/checkpoint_best.pt) — confirm this
# download is still needed.
!curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf -

In [36]:
!fairseq-interactive \
    --path /content/checkpoints/fconv/checkpoint_best.pt /content/data-bin/custom.tokenized.en-fa \
    --beam 5 --source-lang en --target-lang fa \
    --tokenizer moses \

2022-06-09 10:56:53 | INFO | fairseq_cli.interactive | Namespace(all_gather_list_size=16384, batch_size=1, batch_size_valid=None, beam=5, bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, buffer_size=1, checkpoint_shard_count=1, checkpoint_suffix='', constraints=None, cpu=False, criterion='cross_entropy', curriculum=0, data='/content/data-bin/custom.tokenized.en-fa', data_buffer_size=10, dataset_impl=None, ddp_backend='c10d', decoding_format=None, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_num_procs=1, distributed_port=-1, distributed_rank=0, distributed_world_size=1, distributed_wrapper='DDP', diverse_beam_groups=-1, diverse_beam_strength=0.5, diversity_rate=-1.0, empty_cache_freq=0, eval_bleu=False, eval_bleu_args=None, eval_bleu_detok='space', eval_bleu_detok_args=None, eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, 