In [1]:
import tensorflow as tf
import numpy as np
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the libraries
# imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding, CuDNNGRU,LSTM
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
# Marker words placed before and after every destination sentence. The trailing /
# leading space keeps each marker a separate word once it is concatenated with the text.
mark_start = 'ssss '
mark_end = ' eeee'

In [5]:
def data(filename: str) -> tuple:
    """Read a tab-separated parallel corpus into source and destination lists.

    Each non-blank line must hold the source sentence in its first column and
    the destination sentence in its second column. Any further columns (for
    example attribution metadata) are ignored. Every destination sentence is
    wrapped in the module-level ``mark_start`` / ``mark_end`` markers.

    Args:
        filename: Path of the UTF-8 encoded corpus file.

    Returns:
        A ``(data_src, data_dest)`` tuple of two equally long lists of strings.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated columns.
    """
    data_src = []
    data_dest = []

    # The context manager closes the handle even if parsing fails half-way.
    with open(filename, encoding='UTF-8') as corpus:
        for line_number, line in enumerate(corpus, start=1):
            fields = line.rstrip().split('\t')

            # A blank (or whitespace-only) line carries no sentence pair.
            if fields == ['']:
                continue

            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected at least two tab-separated columns'.format(
                        filename, line_number))

            en_text, tr_text = fields[0], fields[1]

            data_src.append(en_text)
            data_dest.append(mark_start + tr_text + mark_end)

    return data_src, data_dest

# Load the parallel corpus: source sentences and marker-wrapped destination sentences.
data_src, data_dest = data('tur.txt')

In [6]:
class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that also tokenizes, pads and decodes a fixed corpus.

    Constructing an instance fits the vocabulary on ``texts`` and stores the
    resulting sequences (``tokens``), their lengths (``num_tokens``), the
    chosen sequence length (``max_tokens``) and the padded matrix
    (``tokens_padded``).
    """

    def __init__(self, texts, padding, reverse=False, num_words=None):
        """Fit on ``texts`` and pre-compute the padded token matrix.

        Args:
            texts: Corpus of strings to fit on and convert.
            padding: Padding side forwarded to ``pad_sequences``.
            reverse: When true, every token sequence is reversed and
                truncation happens at the front ('pre') instead of the
                back ('post').
            num_words: Forwarded to the ``Tokenizer`` constructor.
        """
        super().__init__(num_words=num_words)

        self.fit_on_texts(texts)

        # Inverse of word_index: integer id -> word.
        self.index_to_word = {index: word for word, index in self.word_index.items()}

        sequences = self.texts_to_sequences(texts)
        if reverse:
            sequences = [list(reversed(sequence)) for sequence in sequences]
        self.tokens = sequences

        truncating = 'pre' if reverse else 'post'

        self.num_tokens = [len(sequence) for sequence in self.tokens]

        # Sequence length cut-off: mean length plus two standard deviations.
        cutoff = np.mean(self.num_tokens) + 2 * np.std(self.num_tokens)
        self.max_tokens = int(cutoff)

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """Return the word for ``token``; id 0 maps to a single space."""
        if token == 0:
            return ' '
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of all non-zero ``tokens`` with single spaces."""
        return ' '.join(self.index_to_word[token] for token in tokens if token != 0)

    def text_to_tokens(self, text, padding, reverse=False):
        """Convert one string into a padded token array of length ``max_tokens``.

        Args:
            text: The string to convert.
            padding: Padding side forwarded to ``pad_sequences``.
            reverse: When true, the token order is flipped and truncation
                happens at the front ('pre') instead of the back ('post').

        Returns:
            The array produced by ``pad_sequences`` for the single input text.
        """
        tokens = np.array(self.texts_to_sequences([text]))

        if reverse:
            tokens = np.flip(tokens, axis=1)

        truncating = 'pre' if reverse else 'post'

        return pad_sequences(tokens,
                             maxlen=self.max_tokens,
                             padding=padding,
                             truncating=truncating)

In [7]:
# Source side: sequences are reversed and padded at the front.
tokenizer_src = TokenizerWrap(
    texts=data_src, padding='pre', reverse=True, num_words=None)

# Destination side: sequences keep their order and are padded at the back.
tokenizer_dest = TokenizerWrap(
    texts=data_dest, padding='post', reverse=False, num_words=None)

In [8]:
# Padded integer token matrices pre-computed by the two tokenizers.
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded
# Show the shape of each matrix as a sanity check.
print(tokens_src.shape)
print(tokens_dest.shape)

(473035, 11)
(473035, 10)


In [9]:
# Integer ids of the start / end markers in the destination vocabulary.
# strip() drops the padding space that the marker constants carry.
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_end = tokenizer_dest.word_index[mark_end.strip()]

In [10]:
encoder_input_data = tokens_src

# Decoder input and target are the same sequences shifted by one position:
# the input drops the last token, the target drops the first one.
decoder_input_data = tokens_dest[:, :-1]
decoder_output_data = tokens_dest[:, 1:]

In [11]:
# Vocabulary sizes; the +1 leaves room for id 0, which the tokenizer wrapper treats as padding.
num_encoder_words = len(tokenizer_src.word_index)+1
num_decoder_words = len(tokenizer_dest.word_index)+1

In [12]:
# Display the destination vocabulary size.
num_decoder_words

94059

In [13]:
# Width of the source-side embedding vectors (the file name below indicates 100-d GloVe).
embedding_size = 100

# Parse the vector file: the first field of each line is the word, the rest its vector.
word2vec = {}
with open('glove.6B.100d.txt', encoding='UTF-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        
        
# Start from uniform random rows so words without a pre-trained vector still get an
# embedding, then overwrite the rows of the words found in word2vec.
# NOTE(review): no random seed is set, so the random rows differ between runs.
embedding_matrix = np.random.uniform(-1, 1, (num_encoder_words, embedding_size))
for word, i in tokenizer_src.word_index.items():
    # Presumably always true, since num_encoder_words is len(word_index) + 1 — TODO confirm.
    if i < num_encoder_words:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
# FastText is an open-source word-vectorisation tool developed by Facebook.
# In the author's own tests the model produced better output with these vectors;
# anyone interested can look at the author's GitHub account: https://github.com/fawern/

# Width of the pre-trained destination-side vectors (cc.tr.300).
fasttext_embedding_size = 300

fasttext_tr_vec = {}

# Stream the file inside a context manager: readlines() materialised the whole file
# in memory and the handle was never closed. The builtin open() replaces io.open(),
# which needed an `io` import this notebook does not have.
with open('./cc.tr.300.vec/cc.tr.300.vec', 'r', encoding='utf-8') as fin:
    for line in fin:
        values = line.rstrip().split(' ')

        # Skip the "<vocabulary size> <dimension>" header line (and any malformed
        # line). A too-short vector would otherwise be silently broadcast across a
        # whole row of the embedding matrix below.
        if len(values) != fasttext_embedding_size + 1:
            continue

        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        fasttext_tr_vec[word] = vec

# Start from uniform random rows so words without a pre-trained vector still get an
# embedding, then overwrite the rows of the words found in fasttext_tr_vec.
fasttext_embedding_matrix = np.random.uniform(
    -1, 1, (num_decoder_words, fasttext_embedding_size))

# The destination vocabulary lives in tokenizer_dest; the previous name
# `tokenizer_tr` is not defined anywhere in this notebook.
for word, index in tokenizer_dest.word_index.items():
    if index < num_decoder_words:
        ft_embedding_vector = fasttext_tr_vec.get(word)
        if ft_embedding_vector is not None:
            fasttext_embedding_matrix[index] = ft_embedding_vector

In [44]:
class Encoder:
    """Embedding followed by three stacked LSTM layers.

    The first two LSTM layers return full sequences; the third returns only
    its final output, which ``connect_encoder`` exposes as ``encoder_output``.
    """

    def __init__(self, **kwargs):
        """Create the input placeholder and all encoder layers.

        Keyword Args:
            num_encoder_words: Input dimension of the embedding layer.
            embedding_size: Output dimension of the embedding layer.
            embedding_matrix: Initial weights of the embedding layer.
            state_size: Number of units in every LSTM layer.

        Every argument is optional. An omitted argument falls back to the
        module-level variable of the same name, which is what this
        constructor previously read unconditionally while ignoring the
        values it was given.
        """
        vocab_size = (kwargs['num_encoder_words']
                      if 'num_encoder_words' in kwargs else num_encoder_words)
        embedding_dim = (kwargs['embedding_size']
                         if 'embedding_size' in kwargs else embedding_size)
        pretrained = (kwargs['embedding_matrix']
                      if 'embedding_matrix' in kwargs else embedding_matrix)
        units = kwargs['state_size'] if 'state_size' in kwargs else state_size

        self.encoder_input = Input(shape=(None,), name='encoder_input')

        self.encoder_embedding = Embedding(input_dim=vocab_size,
                                           output_dim=embedding_dim,
                                           weights=[pretrained],
                                           trainable=True,
                                           name='encoder_embedding')

        self.encoder_lstm1 = LSTM(units, name='encoder_lstm1', return_sequences=True)
        self.encoder_lstm2 = LSTM(units, name='encoder_lstm2', return_sequences=True)
        # The last layer returns only its final output, not the whole sequence.
        self.encoder_lstm3 = LSTM(units, name='encoder_lstm3', return_sequences=False)

    def connect_encoder(self):
        """Chain input -> embedding -> three LSTMs and return the output tensor.

        The result is also stored in ``self.encoder_output``.
        """
        net = self.encoder_input

        net = self.encoder_embedding(net)

        net = self.encoder_lstm1(net)
        net = self.encoder_lstm2(net)
        net = self.encoder_lstm3(net)

        self.encoder_output = net

        return self.encoder_output


In [45]:
# Number of units used by the LSTM layers.
state_size = 256

# Create the encoder layers.
encoder = Encoder(num_encoder_words = num_encoder_words, embedding_size = embedding_size, 
                  embedding_matrix = embedding_matrix, state_size = state_size)


# Wire the layers together; encoder_output is the tensor returned by the last LSTM.
encoder_output = encoder.connect_encoder()

In [49]:
class Decoder:
    """Embedding, three stacked LSTM layers and a linear output layer.

    All three LSTM layers return full sequences, and each is started from the
    same initial-state tensor passed to ``connect_decoder``.
    """

    def __init__(self, **kwargs):
        """Create the input placeholders and all decoder layers.

        Keyword Args:
            state_size: Number of units in every LSTM layer and width of the
                ``decoder_initial_state`` input.
            num_decoder_words: Input dimension of the embedding layer and
                width of the output layer.
            embedding_matrix: Initial weights of the embedding layer.

        Every argument is optional. ``state_size`` and ``num_decoder_words``
        fall back to the module-level variables of the same name, and
        ``embedding_matrix`` falls back to ``fasttext_embedding_matrix``;
        these globals are what this constructor previously read
        unconditionally while ignoring the values it was given.

        An ``embedding_size`` keyword is accepted but deliberately unused:
        the embedding width is taken from the weight matrix itself so the
        two can never disagree.
        """
        units = kwargs['state_size'] if 'state_size' in kwargs else state_size
        vocab_size = (kwargs['num_decoder_words']
                      if 'num_decoder_words' in kwargs else num_decoder_words)
        pretrained = (kwargs['embedding_matrix']
                      if 'embedding_matrix' in kwargs else fasttext_embedding_matrix)

        self.decoder_initial_state = Input(shape=(units,), name='decoder_initial_state')

        self.decoder_input = Input(shape=(None,), name='decoder_input')

        self.decoder_embedding = Embedding(input_dim=vocab_size,
                                           output_dim=pretrained.shape[1],
                                           weights=[pretrained],
                                           trainable=True,
                                           name='decoder_embedding')

        self.decoder_lstm1 = LSTM(units, name='decoder_lstm1', return_sequences=True)
        self.decoder_lstm2 = LSTM(units, name='decoder_lstm2', return_sequences=True)
        self.decoder_lstm3 = LSTM(units, name='decoder_lstm3', return_sequences=True)

        # Linear activation: the layer emits raw scores, one per vocabulary entry.
        self.decoder_dense = Dense(vocab_size,
                                   activation='linear',
                                   name='decoder_output')

    def connect_decoder(self, initial_state):
        """Chain the decoder layers starting from ``initial_state``.

        Args:
            initial_state: Tensor used as both elements of the initial-state
                list of every LSTM layer.

        Returns:
            The output tensor of the dense layer, also stored in
            ``self.decoder_output``. Calling this method again reuses the
            same layer objects and overwrites ``self.decoder_output``.
        """
        net = self.decoder_input

        net = self.decoder_embedding(net)

        net = self.decoder_lstm1(net, initial_state=[initial_state, initial_state])
        net = self.decoder_lstm2(net, initial_state=[initial_state, initial_state])
        net = self.decoder_lstm3(net, initial_state=[initial_state, initial_state])

        self.decoder_output = self.decoder_dense(net)

        return self.decoder_output

In [55]:
# Create the decoder layers.
decoder = Decoder(state_size=state_size, num_decoder_words=num_decoder_words, embedding_size=embedding_size)

# For training, the decoder is started from the encoder's output tensor.
decoder_output = decoder.connect_decoder(initial_state = encoder_output)

In [56]:
# End-to-end training model: (encoder input, decoder input) -> decoder output.
model_train = Model(inputs=[encoder.encoder_input, decoder.decoder_input], outputs=[decoder_output])

In [61]:
def sparse_cross_entropy(y_true, y_pred):
    """Return the mean sparse softmax cross-entropy over all elements.

    Args:
        y_true: Labels, passed as ``labels`` to the TensorFlow op.
        y_pred: Unnormalised scores, passed as ``logits`` to the TensorFlow op.

    Returns:
        A scalar tensor: the mean of the element-wise losses.
    """
    elementwise_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred,
        labels=y_true,
    )
    return tf.reduce_mean(elementwise_loss)

In [62]:
# RMSprop optimizer with a step size of 1e-3.
# NOTE(review): `lr` is the legacy argument name; confirm the installed Keras version
# still accepts it rather than requiring `learning_rate`.
optimizer = RMSprop(lr=1e-3)

In [63]:
'''
ValueError: `target_tensors` argument is not supported when executing eagerly. Received: 
[<KerasTensor: shape=(None, None) dtype=int32 (created by layer 'input_1')>].
'''
'''
Elde ettiğiniz hata, TensorFlow'da Eager Execution (Eager İcra) modunda çalışırken, target_tensors argümanının bu 
modda desteklenmediğini gösteriyor. Eager Execution, TensorFlow 2.x'de varsayılan olarak etkin olduğunda, eğitim sırasında 
target_tensors'ı açıkça belirtmeniz gerekmez. decoder_targeti çıkartığımız zaman proje aynı şekilde çalışıyor.
'''
# decoder_target = tf.placeholder(dtype='int32', shape=(None,None))

In [64]:
# Compile with the custom loss. `target_tensors` stays disabled because it is not
# supported when executing eagerly (see the ValueError quoted in the previous cell).
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    # target_tensors=[decoder_target]
                )

In [65]:
# Model inputs, keyed by the names given to the two Input layers.
x_data = {'encoder_input': encoder_input_data, 'decoder_input': decoder_input_data}

In [66]:
# Training targets, keyed by the name given to the decoder's dense output layer.
y_data = {'decoder_output': decoder_output_data}

In [67]:
# Checkpoint callback that writes only the model weights to path_checkpoint.
path_checkpoint = 'checkpoint.lstm'
checkpoint = ModelCheckpoint(filepath=path_checkpoint, save_weights_only=True)

In [None]:
# Train for 5 epochs in batches of 512, running the checkpoint callback during training.
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=5,
                callbacks=[checkpoint])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/5

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 2/5

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 3/5

## Çeviri için Encoder ve Decoder modellerinin oluşturulması

In [57]:
# Stand-alone encoder for inference: encoder input -> output of the last encoder LSTM.
model_encoder = Model(inputs=[encoder.encoder_input], outputs=[encoder_output])

In [60]:
# Re-connect the same decoder layer objects, this time starting from an explicit
# initial-state input instead of the encoder tensor. This rebinds `decoder_output`.
decoder_output = decoder.connect_decoder(initial_state = decoder.decoder_initial_state)

# Stand-alone decoder for inference: (decoder input, initial state) -> decoder output.
model_decoder = Model(inputs=[decoder.decoder_input, decoder.decoder_initial_state], outputs=[decoder_output])

In [None]:
def translate(input_text, true_output_text=None):
    """Translate ``input_text`` token by token and print the result.

    The source text is tokenized (reversed, padded at the front) and passed
    through ``model_encoder`` to obtain the decoder's initial state. Starting
    from ``token_start``, the decoder is then run repeatedly; at each step the
    highest-scoring token is chosen and appended to the decoder input. The
    loop stops once ``token_end`` is chosen or ``tokenizer_dest.max_tokens``
    tokens have been produced. The word for the end marker, when produced,
    is part of the printed text.

    Args:
        input_text: Source sentence to translate.
        true_output_text: Optional reference translation, printed after the
            model's output when given.

    Returns:
        None. The input, the translation and the optional reference are
        written to standard output.
    """
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding='pre')

    initial_state = model_encoder.predict(input_tokens)

    max_tokens = tokenizer_dest.max_tokens

    # `np.int` was only an alias of the builtin `int` and no longer exists in
    # NumPy 1.24+, so the builtin is used directly.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    token_int = token_start
    output_text = ''
    count_tokens = 0

    while token_int != token_end and count_tokens < max_tokens:
        # Feed back the most recently chosen token at the current position.
        decoder_input_data[0, count_tokens] = token_int
        x_data = {'decoder_initial_state': initial_state, 'decoder_input': decoder_input_data}

        decoder_output = model_decoder.predict(x_data)

        # Scores over the vocabulary for the current position; pick the best one.
        token_onehot = decoder_output[0, count_tokens, :]
        token_int = np.argmax(token_onehot)

        sampled_word = tokenizer_dest.token_to_word(token_int)
        output_text += ' ' + sampled_word
        count_tokens += 1

    print('Input text:')
    print(input_text)
    print()

    print('Translated text:')
    print(output_text)
    print()

    if true_output_text is not None:
        print('True output text:')
        print(true_output_text)
        print()

In [None]:
# Translate one corpus sentence and print it next to its reference translation.
translate(input_text=data_src[99999], true_output_text=data_dest[99999])