In [1]:
%load_ext autoreload
%autoreload 2

from torchtext import data
from torchtext.data import Field, BucketIterator, TabularDataset
import torch 

import spacy
#en = spacy.load('en')
#fr = spacy.load('fr')

# Load the small English pipeline. The result is not assigned to a name, so
# the loaded object is only echoed as this cell's output.
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x7fb46cf1a910>

In [2]:
#!python -m spacy download en
#spacy.load('/home/reka/miniconda3/envs/translate/lib/python3.7/site-packages/spacy/data/en')

In [3]:
# Central configuration dict handed to the lib.* helpers used in this notebook.
# Entries set to None are placeholders that later cells fill in (for example
# 'src_data'/'trg_data' after read_data, 'optimizer' in the training setup).
params = {
    # Corpus locations and languages.
    'src_path': './data/english.txt',
    'trg_path': './data/french.txt',
    'src_data': None,
    'trg_data': None,
    'src_lang': 'en',
    'trg_lang': 'fr',
    # Use the GPU when one is present, otherwise fall back to the CPU so the
    # notebook can still be executed on a machine without CUDA.
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'load_weights': None,
    'max_length': 100,
    'src_pad': None,
    'trg_pad': None,
    'optimizer': None,
    # NOTE(review): 'no_cuda' is True while 'device' asks for CUDA -- confirm
    # which of the two the lib.* code actually honours.
    'no_cuda': True,
    'SGDR': None,
    # Training hyperparameters.
    'epoch': 10,
    'dropout': 0.1,
    'batchsize': 1500,
    'printevery': 100,
    'lr': 0.0001,
    # NOTE(review): 'store_true' looks like a left-over argparse action name;
    # as a non-empty string it is truthy -- confirm this is intended.
    'create_valset': 'store_true',
    # Model dimensions.
    'd_model': 512,
    'heads': 8,
    'n_layers': 6,
}


# Get data


In [4]:
from lib.data_processing import read_data
# read_data fills the params dict in place: the English and French text end up
# under 'src_data' and 'trg_data', each as a list of strings. Nothing is
# assigned from the call here.
read_data(params)

In [5]:
# Show how many entries were read for the source and the target side
# (presumably these must match for a parallel corpus -- verify).
len(params["src_data"]), len(params["trg_data"])

(154883, 154883)

In [6]:
from lib.data_processing import create_fields
import dill as pickle 

# create_fields builds the two field objects (SRC and TRG) that hold the
# source and target text. The call is left commented out here; the next cell
# restores previously pickled SRC/TRG objects from weights/ instead.
# SRC, TRG = create_fields(params)

loading spacy tokenizers...
creating SRC and TRG...


In [7]:
# Restore the pickled source and target fields from the weights/ directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('weights/SRC.pkl', 'rb') as src_file:
    SRC = pickle.load(src_file)

with open('weights/TRG.pkl', 'rb') as trg_file:
    TRG = pickle.load(trg_file)

In [56]:
# Vocabulary sizes, target first and source second.
(len(TRG.vocab), len(SRC.vocab))

(26116, 14020)

In [57]:
from lib.data_processing import create_dataset
import re

# Build the training data from params and the two fields, and keep the result
# in the params dict under 'train' so later cells can reach it.
params["train"] = create_dataset(params, SRC, TRG)

creating dataset and iterator... 


In [40]:
# Display the object stored under 'train' to see what create_dataset returned.
params["train"]

<lib.batch.MyIterator at 0x7efb9da72e90>

In [59]:
# Look up one token in both directions: stoi is indexed by the token string,
# itos by the integer index.
SRC.vocab.stoi['house'], SRC.vocab.itos[154]

(154, 'house')

# Model

In [60]:
from lib.models import get_model

# Build the model from the settings in params and the two vocabulary sizes
# (source vocabulary first, target vocabulary second).
model = get_model(params, len(SRC.vocab), len(TRG.vocab))

In [61]:
# Display the module tree of the model that was just built.
model

Transformer(
  (encoder): Encoder(
    (embed): Embedder(
      (embed): Embedding(14020, 512)
    )
    (pe): PositionalEncoder(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (norm_1): Norm()
        (norm_2): Norm()
        (attn): MultiHeadAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (ff): FeedForward(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout_1): Dropout(p=0.1, inplace=False)
        (dropout_2): Dropout(p=0.

# Train the model

In [63]:
# Extra entries the training code needs, added on top of the base params.
from lib.data_processing import get_len
from lib.optim import CosineWithRestarts

# Plain constants first: checkpoint setting, model dimensions, epoch count.
params.update(checkpoint=0, d_model=512, heads=8, n_layers=6, epoch=10)

# Length of the training iterator; reused below as the scheduler's T_max.
params["train_len"] = get_len(params["train"])

# Adam over all model parameters, using the learning rate stored in params.
params["optimizer"] = torch.optim.Adam(
    model.parameters(),
    lr=params["lr"],
    betas=(0.9, 0.98),
    eps=1e-9,
)

# Enable SGDR and attach the cosine-with-restarts schedule to the optimizer.
params["SGDR"] = True
params["sched"] = CosineWithRestarts(
    params["optimizer"],
    T_max=params["train_len"],
)

#model = get_model(params, len(SRC.vocab), len(TRG.vocab))

In [64]:
# Load the saved weights into the existing model. The checkpoint is first
# deserialized onto the CPU (map_location='cpu') so that a file written on a
# GPU machine can also be read where no CUDA device is available;
# load_state_dict then copies the values into the model's own parameters.
model.load_state_dict(torch.load('model/model_weights', map_location='cpu'))

<All keys matched successfully>

In [68]:
from lib.train import train_model

# The training call is left commented out; the cells around it work with
# weights that are already saved on disk.
#train_model(model, params, "model_20_epochs")

In [65]:
# Save the whole model object (not only its state_dict) to
# 'model/model_10_epochs'.
torch.save(model, 'model/model_10_epochs')

In [66]:
# Read the saved model object back from disk. Note that a later cell rebinds
# model_10_epochs to a model built with get_model.
model_10_epochs = torch.load('model/model_10_epochs')

# Translate

In [72]:
# Extra setting added before translating.
# NOTE(review): presumably the beam width used by translate_text -- confirm.
params["k"] = 3

In [69]:
# Build a fresh model with the same params and vocabulary sizes. This rebinds
# model_10_epochs, replacing the object loaded from 'model/model_10_epochs'.
model_10_epochs = get_model(params, len(SRC.vocab), len(TRG.vocab))
# Display the module tree of the rebuilt model.
model_10_epochs

Transformer(
  (encoder): Encoder(
    (embed): Embedder(
      (embed): Embedding(14020, 512)
    )
    (pe): PositionalEncoder(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (norm_1): Norm()
        (norm_2): Norm()
        (attn): MultiHeadAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (ff): FeedForward(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout_1): Dropout(p=0.1, inplace=False)
        (dropout_2): Dropout(p=0.

In [70]:
# Load the weights stored in 'model/model_weights' into the rebuilt model.
# The checkpoint is deserialized onto the CPU first (map_location='cpu') so a
# file written on a GPU machine can also be read without CUDA;
# load_state_dict then copies the values into the model's own parameters.
model_10_epochs.load_state_dict(torch.load('model/model_weights', map_location='cpu'))

<All keys matched successfully>

In [73]:
from lib.translate import translate_text

# Translate a sample English sentence with the loaded model; the returned
# string is shown as the cell output.
translate_text("Hello, how are you? Do you want to drink a beer tonight? ", params, model_10_epochs, SRC, TRG)

'hé , comment êtes vous en train de faire une bière ?'

In [75]:
# A second, shorter sample sentence through the same model and fields.
translate_text('Hey girl, how are you? ', params, model_10_epochs, SRC, TRG)

'hé fille , comment êtes vous ?'