In [None]:
# Alternative (disabled): install the released joeynmt package instead of the
# GitHub source.
# !pip install joeynmt
# !pip show joeynmt

# Install Joey NMT from its GitHub repository (-q: quiet output), then print
# the metadata of the installed package.
# NOTE(review): this install is not pinned to a version, while the config cell
# declares joeynmt_version "2.2.1" and build_vocab.py is fetched from the v2.2
# ref -- confirm that the installed version matches.
!pip install -q git+https://github.com/joeynmt/joeynmt.git
!pip show joeynmt

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

import torch
torch.__version__

In [None]:
import os
from pathlib import Path

source_language = "en"
target_language = "fon"
data_path = "data"
BPE_SIZE = 4000
joeynmt_data_dir = "data_joeynmt"
joeynmt_model_dir = "model_joeynmt"

# suppression des fichiers existants dans le dossier joeynmt_data_dir et model_joeynmt
!rm -rf $joeynmt_data_dir
!rm -rf $joeynmt_model_dir

os.environ["src"] = source_language
os.environ["tgt"] = target_language
os.environ["data_path"] = data_path
os.environ["bpe_size"] = str(BPE_SIZE)
os.environ["joeynmt_data_dir"] = joeynmt_data_dir
os.environ["joeynmt_model_dir"] = joeynmt_model_dir



In [None]:
# Apprenez les BPE sur les données d’entraînement
!subword-nmt learn-joint-bpe-and-vocab --input $data_path/train_$src.txt $data_path/train_$tgt.txt -s $bpe_size -o bpe.codes.$bpe_size --write-vocabulary vocab.$src vocab.$tgt

# Appliquez les BPE sur les données d’entraînement, développement et test
!subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.$src  < $data_path/train_$src.txt > train.bpe.$src
!subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.$tgt  < $data_path/train_$tgt.txt > train.bpe.$tgt
!subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.$src  < $data_path/dev_$src.txt > dev.bpe.$src
!subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.$tgt  < $data_path/dev_$tgt.txt > dev.bpe.$tgt
!subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.$src  < $data_path/test_$src.txt > test.bpe.$src
!subword-nmt apply-bpe -c bpe.codes.$bpe_size --vocabulary vocab.$tgt  < $data_path/test_$tgt.txt > test.bpe.$tgt

! mkdir -p data_joeynmt/train
! mv train.bpe.$src data_joeynmt/train/train.$src
! mv train.bpe.$tgt data_joeynmt/train/train.$tgt
! mkdir -p data_joeynmt/dev
! mv dev.bpe.$src data_joeynmt/dev/dev.$src
! mv dev.bpe.$tgt data_joeynmt/dev/dev.$tgt
! mkdir -p data_joeynmt/test
! mv test.bpe.$src data_joeynmt/test/test.$src
! mv test.bpe.$tgt data_joeynmt/test/test.$tgt

# !mkdir -p data_joeynmt/train
# !cp $data_path/train_$src.txt data_joeynmt/train/train.$src
# !cp $data_path/train_$tgt.txt data_joeynmt/train/train.$tgt
# !mkdir -p data_joeynmt/dev
# !cp $data_path/dev_$src.txt data_joeynmt/dev/dev.$src
# !cp $data_path/dev_$tgt.txt data_joeynmt/dev/dev.$tgt
# !mkdir -p data_joeynmt/test
# !cp $data_path/test_$src.txt data_joeynmt/test/test.$src
# !cp $data_path/test_$tgt.txt data_joeynmt/test/test.$tgt

!mkdir -p data_joeynmt/voc
!mv vocab.$src data_joeynmt/voc/vocab.$src
!mv vocab.$tgt data_joeynmt/voc/vocab.$tgt
!mv bpe.codes.$bpe_size data_joeynmt/bpe.codes.$bpe_size

# # réduire les tailles des fichiers vocab à 2500
# !head -2460 data_joeynmt/voc/vocab_1.$src > data_joeynmt/voc/vocab.$src
# !head -2460 data_joeynmt/voc/vocab_1.$tgt > data_joeynmt/voc/vocab.$tgt

# # compter le nombre de lignes dans les fichiers de vocabulaire
# !wc -l data_joeynmt/voc/vocab.$src
# !wc -l data_joeynmt/voc/vocab.$tgt

# crééer un fichier vocab.txt contenant les mots de vocab.$src et vocab.$tgt
!cat data_joeynmt/voc/vocab.$src data_joeynmt/voc/vocab.$tgt > data_joeynmt/voc/vocab.txt

# créer un dossier pour le modèle
!mkdir -p $joeynmt_model_dir



In [None]:
from datasets import Dataset, Features, Translation

# Build the training, validation and test files in the structure expected by Hugging Face datasets
def read_files(file_path_en, file_path_fon):
    """Read two line-aligned text files and pair their lines.

    Args:
        file_path_en: path of the English text file (one sentence per line).
        file_path_fon: path of the Fon text file (one sentence per line).

    Returns:
        A list of ``(id, en, fon)`` tuples, where ``id`` is the 1-based line
        number and both sentences are stripped of surrounding whitespace.
        If the files differ in length, only the first ``min(len)`` lines are
        paired and a ``UserWarning`` is emitted.

    Raises:
        FileNotFoundError: if either file does not exist.
    """
    # Local import: keeps the function self-contained within its cell.
    import warnings

    if not os.path.exists(file_path_en) or not os.path.exists(file_path_fon):
        raise FileNotFoundError("One or both of the files do not exist.")

    with open(file_path_en, 'r', encoding='utf-8') as file_en, \
        open(file_path_fon, 'r', encoding='utf-8') as file_fon:
        en_lines = file_en.readlines()
        fon_lines = file_fon.readlines()

    # zip() stops at the shorter input, which would silently hide a
    # misaligned parallel corpus; surface the problem instead.
    if len(en_lines) != len(fon_lines):
        warnings.warn(
            f"Line count mismatch: {file_path_en} has {len(en_lines)} lines, "
            f"{file_path_fon} has {len(fon_lines)}; extra lines are ignored.",
            stacklevel=2,
        )

    return [
        (line_no, en.strip(), fon.strip())
        for line_no, (en, fon) in enumerate(zip(en_lines, fon_lines), start=1)
    ]


def create_dataset(data):
    """Turn ``(id, en, fon)`` rows into a Hugging Face ``Dataset``.

    Args:
        data: sequence of rows where index 0 is the id, index 1 the English
            sentence and index 2 the Fon sentence.

    Returns:
        A ``Dataset`` with an ``id`` column and a ``translation`` column cast
        to ``Translation(languages=['en', 'fon'])``.
    """
    ids = [row[0] for row in data]
    pairs = [{'en': row[1], 'fon': row[2]} for row in data]
    raw = Dataset.from_dict({'id': ids, 'translation': pairs})

    # Keep the inferred type of the id column; declare the translation column
    # explicitly as an en/fon Translation feature.
    schema = Features({
        'id': raw.features['id'],
        'translation': Translation(languages=['en', 'fon']),
    })

    return raw.cast(schema)

# Create the datasets from the raw parallel files in `data_path` (configured
# in the settings cell) rather than from a hard-coded "data/" prefix.
train_dataset = create_dataset(read_files(
    file_path_en=f'{data_path}/train_en.txt',
    file_path_fon=f'{data_path}/train_fon.txt',
))
dev_dataset = create_dataset(read_files(
    file_path_en=f'{data_path}/dev_en.txt',
    file_path_fon=f'{data_path}/dev_fon.txt',
))
test_dataset = create_dataset(read_files(
    file_path_en=f'{data_path}/test_en.txt',
    file_path_fon=f'{data_path}/test_fon.txt',
))

# Save the datasets under `joeynmt_data_dir`; the Joey NMT configuration's
# data.train / data.dev / data.test entries point at these directories.
train_dataset.save_to_disk(f'{joeynmt_data_dir}/train')
dev_dataset.save_to_disk(f'{joeynmt_data_dir}/dev')
test_dataset.save_to_disk(f'{joeynmt_data_dir}/test')

In [None]:
# Joey NMT configuration file.
# The {joeynmt_data_dir} and {joeynmt_model_dir} placeholders are filled from
# the notebook settings by the .format() call at the end of the string, so the
# YAML always agrees with the directories used by the other cells (the test
# cell loads {joeynmt_model_dir}/best.ckpt).
config = """
name: "en_fon_bpe4000_transformer"
joeynmt_version: "2.2.1"

data:
    train: "{joeynmt_data_dir}/train/"
    dev: "{joeynmt_data_dir}/dev/"
    test: "{joeynmt_data_dir}/test/"
    dataset_type: "huggingface"               # dataset type: one of plain tsv huggingface
    dataset_cfg:
        name: "en-fon"
    src:
        lang: "en"
        max_length: 100
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 2460
        voc_min_freq: 1
        voc_file: "{joeynmt_data_dir}/voc/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
        #     num_merges: 200
        #     codes: "{joeynmt_data_dir}/bpe.codes.4000" # BPE codes file (for subword-nmt)
              model_file: "{joeynmt_data_dir}/sp.model"
              model_type: "bpe"

    trg:
        lang: "fon"
        max_length: 100
        lowercase: False
        normalize: False
        level: "bpe"
        voc_limit: 2460
        voc_min_freq: 1
        voc_file: "{joeynmt_data_dir}/voc/vocab.txt"
        tokenizer_type: "sentencepiece"
        tokenizer_cfg:
            # num_merges: 200
            # codes: "{joeynmt_data_dir}/bpe.codes.4000" # BPE codes file (for subword-nmt)
            model_file: "{joeynmt_data_dir}/sp.model"
            model_type: "bpe"


testing:
    n_best: 1
    beam_size: 5
    beam_alpha: 1.0
    batch_size: 512
    batch_type: "token"
    max_output_length: 100
    eval_metrics: ["bleu"]
    sacrebleu_cfg:
        tokenize: "13a"

training:
    #load_model: "{joeynmt_data_dir}/latest.ckpt"
    random_seed: 4
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999]
    scheduling: "plateau"           # learning rate scheduling, optional, if not specified stays constant, options: "plateau", "exponential", "decaying", "noam" (for Transformer), "warmupexponentialdecay", "warmupinversesquareroot"
    patience: 5
    learning_rate_factor: 0.5
    decrease_factor: 0.7
    learning_rate_warmup: 1000
    learning_rate: 0.0003
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    loss: "crossentropy"
    batch_size: 2048
    batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "bleu"
    # eval_batch_size: 300
    eval_batch_type: "token"
    epochs: 20
    updates: 20000
    validation_freq: 1000
    logging_freq: 100
    model_dir: "{joeynmt_model_dir}"
    overwrite: True
    shuffle: True
    use_cuda: True
    use_autocast: False
    fp16: False
    print_valid_sents: [0, 1, 2, 3]
    keep_best_ckpts: 3
    # keep_last_ckpts: 3

model:
    initializer: "xavier_uniform"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier_uniform"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 128
            scale: True
            # dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 128
        ff_size: 1024
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4
        embeddings:
            embedding_dim: 128
            scale: True
            # dropout: 0.2
        # typically ff_size = 4 x hidden_size
        hidden_size: 128
        ff_size: 1024
        dropout: 0.3
""".format(joeynmt_data_dir=joeynmt_data_dir, joeynmt_model_dir=joeynmt_model_dir)

# Write the configuration file directly into <joeynmt_data_dir>/config/, the
# location from which the build_vocab, train and test commands read it
# (instead of writing it to the working directory and moving it into a
# hard-coded folder with shell commands).
config_dir = Path(joeynmt_data_dir) / "config"
config_dir.mkdir(parents=True, exist_ok=True)

with open(config_dir / "en_fon_bpe4000_transformer.yaml", 'w', encoding='utf-8') as f:
    f.write(config)


In [None]:
# Download the vocabulary-building script from the Joey NMT repository (v2.2 ref)
# NOTE(review): the package itself is installed from the repository head in the
# first cell -- confirm that this script version is compatible with it.
!wget https://raw.githubusercontent.com/joeynmt/joeynmt/v2.2/scripts/build_vocab.py

# Build the vocabulary from the configuration file written earlier
# NOTE(review): --joint presumably builds one vocabulary shared by source and
# target (both sides use the same voc_file in the config) -- confirm against
# the script's help text.
!python build_vocab.py {joeynmt_data_dir}/config/en_fon_bpe4000_transformer.yaml --joint

In [None]:
# (disabled) Load the TensorBoard notebook extension and point it at
# {joeynmt_data_dir}/tensorboard.
# %load_ext tensorboard
# %tensorboard --logdir {joeynmt_data_dir}/tensorboard

# Run Joey NMT training with the configuration file stored in
# {joeynmt_data_dir}/config/.
!python -m joeynmt train {joeynmt_data_dir}/config/en_fon_bpe4000_transformer.yaml

# Evaluation

In [None]:
# Run Joey NMT in test mode with the same configuration file, loading the
# checkpoint at {joeynmt_model_dir}/best.ckpt.
!python -m joeynmt test {joeynmt_data_dir}/config/en_fon_bpe4000_transformer.yaml --ckpt {joeynmt_model_dir}/best.ckpt