In [1]:
import os
import sys
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
current_path = os.path.abspath(os.path.join('.'))
module_path = os.path.abspath(os.path.join('..'))

In [3]:
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
import data_prep;
import utils;
import encoder;
import attention;
import decoder;

In [5]:
dataset_save_location = "out/parallel.p"

In [6]:
X_all, Y_all, en_word2idx_all, en_idx2word_all, en_vocab_all, hi_word2idx_all, hi_idx2word_all, hi_vocab_all = utils.load_pickle_dataset(dataset_save_location)
len(X_all), len(Y_all), len(en_vocab_all), len(hi_vocab_all)

(696695, 696695, 50004, 50004)

In [7]:
emb_dim = 100

In [8]:
(os.path.join("../../data/glove.6B/", 'glove.6B.50d.txt'))

'../../data/glove.6B/glove.6B.50d.txt'

In [9]:
embedding_weights = utils.load_glove_embeddings("../../data/glove.6B/", len(en_vocab_all), en_word2idx_all, emb_dim)


In [10]:
embedding_weights[en_word2idx_all['the']]

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [11]:
train_size = 40000
X = X_all[:train_size]
Y = Y_all[:train_size]

In [12]:
for n in range(len(X)):
  X[n] = [en_word2idx_all['<start>']] + X[n] + [en_word2idx_all['<end>']]
for n in range(len(X)):
  Y[n] = [hi_word2idx_all['<start>']] + Y[n] + [hi_word2idx_all['<end>']]

In [13]:
max_length_inp = utils.max_length(X)
max_length_tar = utils.max_length(Y)
(max_length_inp, max_length_tar)

(17, 17)

In [14]:
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(X,
                                                                maxlen=max_length_inp,
                                                                padding='post')
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(Y,
                                                                 maxlen=max_length_tar,
                                                                 padding='post')

In [15]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, shuffle=False, test_size=0.2)

In [16]:
input_tensor_train = [list(reversed(x)) for x in input_tensor_train]

In [17]:
for n in range(2):
    data_prep.printSentence(input_tensor_train[n], en_idx2word_all)
    print('\n')
    data_prep.printSentence(target_tensor_train[n], hi_idx2word_all)
    print('\n')


<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <end> bad all not is desire this but <start> 

<start> किन्तु यह अभिलाषा पूर्णतः बुरी नहीं है <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <end> also this see <start> 

<start> यह भी देखें <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 



In [18]:
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(32000, 32000, 8000, 8000)

In [19]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 100
units = 1024
vocab_inp_size = len(en_vocab_all)
vocab_tar_size = len(hi_vocab_all)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [20]:
from encoder import Encoder
from decoder import Decoder
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, weights_ = embedding_weights,mask=1)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [21]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_mean(loss_)

Instructions for updating:
Colocations handled automatically by placer.


AttributeError: module 'tensorflow._api.v1.keras.losses' has no attribute 'SparseCategoricalCrossentropy'