In [1]:
import pandas as pd
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split

In [2]:
text_file = "../input/ukr-eng/ukr.txt"
df = pd.read_csv(text_file, sep='\t', usecols=[0, 1])
df.columns = ['eng', 'ukr']

print("Df size: ", df.shape)
df.head(3), df.tail(3)

Df size:  (154460, 2)


(   eng      ukr
 0  Hi.   Вітаю!
 1  Hi.  Привіт.
 2  Hi.  Привіт!,
                                                       eng  \
 154457  If forests cover 9.4% of the earth's surface, ...   
 154458  The Tatoeba Project, which can be found online...   
 154459  I've heard that you should never date anyone w...   
 
                                                       ukr  
 154457  Якщо ліси складають 9,4% поверхні Землі, і якщ...  
 154458  Проект "Татоеба", що знаходиться онлайн за адр...  
 154459  Я чула, що не слід зустрічатися з кимось, кому...  )

In [3]:
contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", 
                "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def clean_text(text, language):
  if language == 'eng':
    text = ' '.join([contractions[word] if word in contractions else word for word in text.split()])
  return text

for language in df:
  df[language] = df[language].apply(lambda text : clean_text(text, language))

df.tail(3)

Unnamed: 0,eng,ukr
154457,"If forests cover 9.4% of the earth's surface, ...","Якщо ліси складають 9,4% поверхні Землі, і якщ..."
154458,"The Tatoeba Project, which can be found online...","Проект ""Татоеба"", що знаходиться онлайн за адр..."
154459,I have heard that you should never date anyone...,"Я чула, що не слід зустрічатися з кимось, кому..."


In [4]:
df['eng'] = df['eng'].apply(lambda i : "[start] " + i + "[end]")
df.head(3)

Unnamed: 0,eng,ukr
0,[start] Hi.[end],Вітаю!
1,[start] Hi.[end],Привіт.
2,[start] Hi.[end],Привіт!


In [5]:
u_e_list = list(df.to_records(index=False)) # преобразование df в список кортежей(массив записей NumPy)
u_e_list[:3]

[('[start] Hi.[end]', 'Вітаю!'),
 ('[start] Hi.[end]', 'Привіт.'),
 ('[start] Hi.[end]', 'Привіт!')]

In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, shuffle=True, random_state=42)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)


print("training pairs:", train_df.shape)
print("test pairs:", test_df.shape)
print("validation pairs:", val_df.shape)
train_df.head(2)

training pairs: (123568, 2)
test pairs: (15446, 2)
validation pairs: (15446, 2)


Unnamed: 0,eng,ukr
0,[start] Why am I still here?[end],Чому я й досі тут?
1,[start] I am stunned.[end],Я приголомшений.


In [7]:
train_list = list(train_df.to_records(index=False)) 
test_list = list(test_df.to_records(index=False)) 
val_list = list(val_df.to_records(index=False))
train_list[:2]

[('[start] Why am I still here?[end]', 'Чому я й досі тут?'),
 ('[start] I am stunned.[end]', 'Я приголомшений.')]

In [8]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 17000
sequence_length = 25
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

ukr_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,)
eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int",output_sequence_length=sequence_length + 1, standardize=custom_standardization,)
train_eng_texts = [pair[0] for pair in u_e_list]
train_ukr_texts = [pair[1] for pair in u_e_list]
eng_vectorization.adapt(train_eng_texts)
ukr_vectorization.adapt(train_ukr_texts)


User settings:

   KMP_AFFINITY=granularity=fine,noverbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hy

In [9]:
eng_vocabulary = eng_vectorization.get_vocabulary()
ukr_vocabulary = ukr_vectorization.get_vocabulary()

In [10]:
def format_dataset(eng, ukr):
    eng = eng_vectorization(eng)
    ukr = ukr_vectorization(ukr)
    return ({"encoder_inputs": ukr, "decoder_inputs": eng[:, :-1],}, eng[:, 1:])


def make_dataset(pairs):
    eng_texts, ukr_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    ukr_texts = list(ukr_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, ukr_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_list)
val_ds = make_dataset(val_list)

In [11]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim 
        self.dense_dim = dense_dim 
        self.num_heads = num_heads 
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.layernorm_1 = layers.LayerNormalization()
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.layernorm_1 = layers.LayerNormalization()
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_proj = keras.Sequential([layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        out_1 = self.layernorm_1(inputs + attention_output_1)
        
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,)
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0,)
        return tf.tile(mask, mult)

In [12]:
embed_dim = 128
latent_dim = 2048 # количество нейронов для Dense слоев Encoder-а и Decoder-а
num_heads = 4

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x) #Encoder-1
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs) #Decoder-1
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")

In [13]:
my_callbacks = [tf.keras.callbacks.EarlyStopping(patience=2)]
transformer.summary()
transformer.compile("adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
transformer.fit(train_ds,
                epochs=6, 
                validation_data=val_ds, 
                shuffle = True,
                callbacks = my_callbacks)

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding (Positiona (None, None, 128)    2179200     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
transformer_encoder (Transforme (None, None, 128)    790784      positional_embedding[0][0]       
________________________________________________________________________________________

<keras.callbacks.History at 0x7ff5390b6f90>

In [14]:
eng_index_lookup = dict(zip(range(len(eng_vocabulary)), eng_vocabulary))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = ukr_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = eng_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = eng_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence

results_on_test = []
for _ in range(10):
    input_sentence = random.choice(test_list)
    test_ukr_texts = input_sentence[1]
    test_eng_texts = input_sentence[0]
    translated = decode_sequence(test_ukr_texts)
    results_on_test.append({'ukr': test_ukr_texts, 'eng': translated, 'eng_target': test_eng_texts})

results_on_test_df = pd.DataFrame(results_on_test)
results_on_test_df

Unnamed: 0,ukr,eng,eng_target
0,Том кращий від мене.,[start] tom is better than me[end],[start] Tom is better than me.[end]
1,Випий ще бокал.,[start] have a glass of beer[end],[start] Have another drink.[end]
2,"Він розмовляє так, ніби він багатій.",[start] he talks as if he were rich[end] ...,[start] He speaks as if he were rich.[end]
3,Операція - це найкращий вихід.,[start] this is the best way out[end] ...,[start] Surgery is the best solution.[end]
4,Я не поет.,[start] i am not the poet[end],[start] I am not a poet.[end]
5,Хіба ти не отримуєш задоволення від вихідних?,[start] arent you enjoying this job on weekend...,[start] Aren't you enjoying your weekend?[end]
6,Мені потрібні яйця.,[start] i need eggs up[end],[start] I want eggs.[end]
7,"Бог знає, що ми потребуємо.",[start] god knows what we need[end] ...,[start] God knows what we need.[end]
8,"Їй цікаво дізнатися, хто прислав квіти.",[start] she wonder where she sent the flowers[...,[start] She is curious to find who sent the fl...
9,"Ти не проти, якщо я скористаюся твоєю машиною?",[start] do you mind if i wear your car[end] ...,[start] Would you mind if I used your car?[end]
