## Download the dataset

In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
!wget https://github.com/Bhandari007/DCGAN/raw/main/translated_descriptions.pickle

--2023-03-03 03:11:43--  https://github.com/Bhandari007/DCGAN/raw/main/translated_descriptions.pickle
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Bhandari007/DCGAN/main/translated_descriptions.pickle [following]
--2023-03-03 03:11:44--  https://raw.githubusercontent.com/Bhandari007/DCGAN/main/translated_descriptions.pickle
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1819115 (1.7M) [application/octet-stream]
Saving to: ‘translated_descriptions.pickle’


2023-03-03 03:11:44 (29.2 MB/s) - ‘translated_descriptions.pickle’ saved [1819115/1819115]



## Read dataset

In [None]:
# NOTE(review): this unpickles a file downloaded from GitHub; unpickling can execute
# arbitrary code, so only run it against a source you trust.
text_description = pd.read_pickle("translated_descriptions.pickle")

In [None]:
len(text_description)

1056

## Descriptions of a single image

In [None]:
text_description[1]

['चराको टाउको यसको शरीरतर्फ लाग्छ र चरा खैरो रंगको छ।',
 'यस चराको खैरो घाँटी, टाउको, पखेटा र पछाडि, यसको बिलको वरिपरि सेतो छ, र लामो अग्लो बिल जुन यसको टिपमा छ।',
 'यो चरिंग लामो घुमाइएको चुच्चो र गाढा आँखा घण्टीहरूको साथ रंगमा खैरो छ।',
 'यो चरा केही सेतो संग खैरो छ र लामो, पुरानो चुच्चो छ।',
 'यो वेबरेड खुट्टा र लामो अलि हुकिएको बिल संग ठोस खैरो चरा हो।',
 'यो विशेष चराको खैरो शरीर र खैरो बिल छ',
 'खैरो रंगीन अल्बर्सरको आधारमा सेतो औंठीको, सेतो बकवास र सेतो आलु।',
 'यो चराको पखेटा छ जुन खैरो छ र ठूलो बिल छ',
 'यो चरा खैरो रंगमा छ, ठूलो घुमाइएको चुच्चोको साथ।',
 'एक खैरो रंग र लामो चुच्चो संग एक ठूलो चरा।']

### Make a text corpus

In [None]:
import os

# Flatten the nested descriptions (a list of sentences per image) into one training
# string. str.join builds the corpus in a single pass instead of repeated `+=`
# concatenation; as before, sentences are joined with no separator between them.
text_corpus = "".join(
    sentence
    for image_sentences in text_description
    for sentence in image_sentences
)

# Newlines inside the sentences are replaced by plain spaces.
text_corpus = text_corpus.replace("\n", " ")

In [None]:
text_corpus[:100]

'मध्यम आकारको बर्डको गाढा खैरो रंग छ, कालो तलतिर घुमाइएको चुच्चो, र लामो पखेटा।चरा गाढा खैरो खैरो छ र'

In [None]:
import io
import os
import time
import tensorflow as tf

In [None]:
print(f"Length of text: {len(text_corpus)} characters")

Length of text: 690249 characters


In [None]:
# The vocabulary is every distinct character that occurs in the corpus, in sorted order.
vocab = sorted(set(text_corpus))
print(f'{len(vocab)} unique characters')

131 unique characters


In [None]:
print(vocab[90:200])

['छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्', '।']


# Process the text

## Vectorize the text

In [None]:
example_texts = ["चराको", "मुकुट"]
chars = tf.strings.unicode_split(example_texts, input_encoding = 'UTF-8')
chars

<tf.RaggedTensor [[b'\xe0\xa4\x9a', b'\xe0\xa4\xb0', b'\xe0\xa4\xbe', b'\xe0\xa4\x95',
  b'\xe0\xa5\x8b'],
 [b'\xe0\xa4\xae', b'\xe0\xa5\x81', b'\xe0\xa4\x95', b'\xe0\xa5\x81',
  b'\xe0\xa4\x9f']]>

In [None]:
# Lookup layer that maps each vocabulary character to an integer id.
# mask_token=None: no mask token is added to the vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

In [None]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[90, 111, 119, 85, 128],
 [109, 122, 85, 122, 95]]>

In [None]:
# Inverse lookup (ids -> characters). It reuses the vocabulary of ids_from_chars so the
# two layers agree on which id belongs to which character.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [None]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'\xe0\xa4\x9a', b'\xe0\xa4\xb0', b'\xe0\xa4\xbe', b'\xe0\xa4\x95',
  b'\xe0\xa5\x8b'],
 [b'\xe0\xa4\xae', b'\xe0\xa5\x81', b'\xe0\xa4\x95', b'\xe0\xa5\x81',
  b'\xe0\xa4\x9f']]>

In [None]:
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'\xe0\xa4\x9a\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x95\xe0\xa5\x8b',
       b'\xe0\xa4\xae\xe0\xa5\x81\xe0\xa4\x95\xe0\xa5\x81\xe0\xa4\x9f'],
      dtype=object)

In [None]:
def text_from_ids(ids):
  """Convert ids back to characters and join them along the last axis into strings."""
  looked_up_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(looked_up_chars, axis=-1)

### The prediction task

### Create training examples and targets

In [None]:
# Split the whole corpus into characters and convert them to integer ids in one call.
all_ids = ids_from_chars(tf.strings.unicode_split(text_corpus, 'UTF-8'))
all_ids

<tf.Tensor: shape=(690249,), dtype=int64, numpy=array([109, 103, 130, ..., 111, 119, 131])>

In [None]:
# Dataset whose elements are the individual character ids, in corpus order.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

म
ध
्
य
म
 
आ
क
ा
र


In [None]:
seq_length = 100

In [None]:
# Group the ids into chunks of seq_length+1: each chunk later yields a seq_length-long
# input and a target shifted by one character. drop_remainder discards a final short chunk.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Peek at the first chunk as characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'\xe0\xa4\xae' b'\xe0\xa4\xa7' b'\xe0\xa5\x8d' b'\xe0\xa4\xaf'
 b'\xe0\xa4\xae' b' ' b'\xe0\xa4\x86' b'\xe0\xa4\x95' b'\xe0\xa4\xbe'
 b'\xe0\xa4\xb0' b'\xe0\xa4\x95' b'\xe0\xa5\x8b' b' ' b'\xe0\xa4\xac'
 b'\xe0\xa4\xb0' b'\xe0\xa5\x8d' b'\xe0\xa4\xa1' b'\xe0\xa4\x95'
 b'\xe0\xa5\x8b' b' ' b'\xe0\xa4\x97' b'\xe0\xa4\xbe' b'\xe0\xa4\xa2'
 b'\xe0\xa4\xbe' b' ' b'\xe0\xa4\x96' b'\xe0\xa5\x88' b'\xe0\xa4\xb0'
 b'\xe0\xa5\x8b' b' ' b'\xe0\xa4\xb0' b'\xe0\xa4\x82' b'\xe0\xa4\x97' b' '
 b'\xe0\xa4\x9b' b',' b' ' b'\xe0\xa4\x95' b'\xe0\xa4\xbe' b'\xe0\xa4\xb2'
 b'\xe0\xa5\x8b' b' ' b'\xe0\xa4\xa4' b'\xe0\xa4\xb2' b'\xe0\xa4\xa4'
 b'\xe0\xa4\xbf' b'\xe0\xa4\xb0' b' ' b'\xe0\xa4\x98' b'\xe0\xa5\x81'
 b'\xe0\xa4\xae' b'\xe0\xa4\xbe' b'\xe0\xa4\x87' b'\xe0\xa4\x8f'
 b'\xe0\xa4\x95' b'\xe0\xa5\x8b' b' ' b'\xe0\xa4\x9a' b'\xe0\xa5\x81'
 b'\xe0\xa4\x9a' b'\xe0\xa5\x8d' b'\xe0\xa4\x9a' b'\xe0\xa5\x8b' b',' b' '
 b'\xe0\xa4\xb0' b' ' b'\xe0\xa4\xb2' b'\xe0\xa4\xbe' b'\xe0\xa4\xae'
 b'\xe0\x

In [None]:
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'\xe0\xa4\xae\xe0\xa4\xa7\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa4\xae \xe0\xa4\x86\xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb0\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xac\xe0\xa4\xb0\xe0\xa5\x8d\xe0\xa4\xa1\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\x97\xe0\xa4\xbe\xe0\xa4\xa2\xe0\xa4\xbe \xe0\xa4\x96\xe0\xa5\x88\xe0\xa4\xb0\xe0\xa5\x8b \xe0\xa4\xb0\xe0\xa4\x82\xe0\xa4\x97 \xe0\xa4\x9b, \xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb2\xe0\xa5\x8b \xe0\xa4\xa4\xe0\xa4\xb2\xe0\xa4\xa4\xe0\xa4\xbf\xe0\xa4\xb0 \xe0\xa4\x98\xe0\xa5\x81\xe0\xa4\xae\xe0\xa4\xbe\xe0\xa4\x87\xe0\xa4\x8f\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\x9a\xe0\xa5\x81\xe0\xa4\x9a\xe0\xa5\x8d\xe0\xa4\x9a\xe0\xa5\x8b, \xe0\xa4\xb0 \xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xae\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa4\x96\xe0\xa5\x87\xe0\xa4\x9f\xe0\xa4\xbe\xe0\xa5\xa4\xe0\xa4\x9a\xe0\xa4\xb0\xe0\xa4\xbe \xe0\xa4\x97\xe0\xa4\xbe\xe0\xa4\xa2\xe0\xa4\xbe \xe0\xa4\x96\xe0\xa5\x88\xe0\xa4\xb0\xe0\xa5\x8b \xe0\xa4\x96\xe0\xa5\x88\xe0\xa4\xb0\xe0\xa5\x8b \xe0\xa4\x9b \xe0\xa4\xb0 '
b'\xe0\xa4\x8f\

In [None]:
def split_input_target(sequence):
    """Return (input, target): the sequence without its last element and without its first.

    The target is therefore the input shifted one position to the left.
    """
    return sequence[:-1], sequence[1:]

In [None]:
split_input_target(list("तुलनामा"))

(['त', 'ु', 'ल', 'न', 'ा', 'म'], ['ु', 'ल', 'न', 'ा', 'म', 'ा'])

In [None]:
dataset = sequences.map(split_input_target)

In [None]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'\xe0\xa4\xae\xe0\xa4\xa7\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa4\xae \xe0\xa4\x86\xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb0\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xac\xe0\xa4\xb0\xe0\xa5\x8d\xe0\xa4\xa1\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\x97\xe0\xa4\xbe\xe0\xa4\xa2\xe0\xa4\xbe \xe0\xa4\x96\xe0\xa5\x88\xe0\xa4\xb0\xe0\xa5\x8b \xe0\xa4\xb0\xe0\xa4\x82\xe0\xa4\x97 \xe0\xa4\x9b, \xe0\xa4\x95\xe0\xa4\xbe\xe0\xa4\xb2\xe0\xa5\x8b \xe0\xa4\xa4\xe0\xa4\xb2\xe0\xa4\xa4\xe0\xa4\xbf\xe0\xa4\xb0 \xe0\xa4\x98\xe0\xa5\x81\xe0\xa4\xae\xe0\xa4\xbe\xe0\xa4\x87\xe0\xa4\x8f\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\x9a\xe0\xa5\x81\xe0\xa4\x9a\xe0\xa5\x8d\xe0\xa4\x9a\xe0\xa5\x8b, \xe0\xa4\xb0 \xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xae\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa4\x96\xe0\xa5\x87\xe0\xa4\x9f\xe0\xa4\xbe\xe0\xa5\xa4\xe0\xa4\x9a\xe0\xa4\xb0\xe0\xa4\xbe \xe0\xa4\x97\xe0\xa4\xbe\xe0\xa4\xa2\xe0\xa4\xbe \xe0\xa4\x96\xe0\xa5\x88\xe0\xa4\xb0\xe0\xa5\x8b \xe0\xa4\x96\xe0\xa5\x88\xe0\xa4\xb0\xe0\xa5\x8b \xe0\xa4\x9b \xe0\xa4\xb0'
Target: 

In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-wrapping `dataset` in
# place, so running this cell more than once does not batch already-batched data.
# tf.data.AUTOTUNE is the non-deprecated name of tf.data.experimental.AUTOTUNE.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

### Build the Model

In [None]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 768

# Number of RNN units
rnn_units = 10

In [None]:
class MyModel(tf.keras.Model):
  """Character-level next-character model: Embedding -> GRU -> Dense.

  Arguments:
    vocab_size -- number of distinct ids; sizes both the embedding table and the output layer
    embedding_dim -- length of each id's embedding vector
    rnn_units -- number of units in the GRU layer
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # Model.__init__ must not receive the instance as an extra positional argument
    # (the previous call was super().__init__(self)).
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output for every timestep; return_state also returns
    # the final GRU state so a caller can feed it into the next call.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer emits one raw score (logit) per vocabulary id.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Arguments:
      inputs -- integer ids to embed
      states -- optional initial GRU state; the layer's own initial state is used when None
      return_state -- when True, also return the final GRU state
      training -- forwarded to every layer

    Returns:
      the dense-layer outputs, or (outputs, states) when return_state is True
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [None]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 132) # (batch_size, sequence_length, vocab_size)


In [None]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  135168    
                                                                 
 gru (GRU)                   multiple                  31080     
                                                                 
 dense (Dense)               multiple                  1452      
                                                                 
Total params: 167,700
Trainable params: 167,700
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Draw one id per timestep from the first example's predicted distribution
# (sampling, not argmax), then drop the trailing num_samples axis.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [None]:
sampled_indices

array([ 89,  79, 123,  71, 124,  50,  63,  49,  55,  69,  74, 124,  61,
        61, 107, 108,   7,  45,  23,  74,  49,  29,  30,  76, 115,  56,
        99,  78,  59, 107,  89,  51, 107,  93,  63,  70,  83,   8,  63,
       101, 119,  50, 114, 128,  40, 129,  94,  84,  51,  77,  58, 131,
        82,  80,   0,  13,  36,  56,  84, 123,  99,  90, 128,  22,  66,
       110,  95,  47,  72, 110,  16, 115,  48,  69, 119,  25,  48,  13,
         6,  97,  93,  41,  53, 112, 123,  42,  71, 120, 112, 126,   0,
       116,   7,  92,  64, 126,  87,  48,  98,  97])

In [None]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'\xe0\xa5\x87\xe0\xa4\xb0\xe0\xa5\x88 \xe0\xa4\x9b\xe0\xa5\x8b\xe0\xa4\x9f\xe0\xa5\x8b \xe0\xa4\x9a\xe0\xa5\x81\xe0\xa4\x9a\xe0\xa5\x8d\xe0\xa4\x9a\xe0\xa5\x8b \xe0\xa4\x9b\xe0\xa5\xa4\xe0\xa4\xaf\xe0\xa5\x8b \xe0\xa4\x9a\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\x89\xe0\xa4\x9c\xe0\xa5\x8d\xe0\xa4\x9c\xe0\xa5\x8d\xe0\xa4\xb5\xe0\xa4\xb2 \xe0\xa4\xb8\xe0\xa5\x81\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa4\xb2\xe0\xa4\xbe \xe0\xa4\xac\xe0\xa4\xbf\xe0\xa4\xb2 \xe0\xa4\xb0 \xe0\xa4\x8f\xe0\xa4\x95 \xe0\xa4\x9a\xe0\xa4\xae\xe0\xa5\x8d\xe0\xa4\x95\xe0\xa4\xbf\xe0\xa4\xb2\xe0\xa5\x8b \xe0\xa4\xb8\xe0\xa5\x87\xe0\xa4\xa4\xe0\xa5\x8b \xe0\xa4\x85\xe0\xa4\xaf\xe0\xa4\xb0\xe0\xa5\x8b\xe0\xa4\xb2\xe0\xa4\xbf\xe0\xa4\x82\xe0\xa4\x97 \xe0\xa4\x9b\xe0\xa4\xaf\xe0\xa5\x8b \xe0\xa4\xb5\xe0\xa4\xbf\xe0\xa4\xb6\xe0\xa5\x87\xe0\xa4\xb7 \xe0\xa4\x9a\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x95\xe0\xa5\x8b \xe0\xa4\xaa\xe0\xa5\x87\xe0\xa4\x9f \xe0\xa4\x9b \xe0\xa4\x9c\xe0\xa5\x81\xe0\xa4\xa8

Input: "प्वाँखभएको।योएउटासेतोरकालोचराहोजसकोसानोचुचुरोरसेतोआँखाहुन्छ।एउटासानोआकारकोचराजस" 
<br>
Next Char Predictions: "ृखऊभमभउएुएथधइ्यरडनव्।टकःबऊडेअतोणऊिुनूएचोँौीङराङीैयघकघअऊेहअनेमग।तोँागझढइहवँडऔीएजगदेणनचसप"

### Train the model

In [None]:
# from_logits=True because the model's final Dense layer has no activation and
# therefore outputs raw scores rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 132)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.883576, shape=(), dtype=float32)


In [None]:
tf.exp(example_batch_mean_loss).numpy()

132.1022

In [None]:
model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# ("{epoch}" is a placeholder that ModelCheckpoint fills in with the epoch number)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) under the prefix above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Execute the training

In [None]:
EPOCHS = 50

In [None]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
print(model.layers[0].get_weights()[0].shape)

(132, 1024)


In [None]:
# The trained embedding matrix: the first (and only) weight array of the 'embedding' layer.
embeddings = model.get_layer('embedding').get_weights()[0]

In [None]:
char_embeddings = embeddings
embeddings.shape

(132, 1024)

In [None]:
# Persist the character embedding matrix to disk.
with open("embeddings.pickle", "wb") as embeddings_file:
  pickle.dump(embeddings, embeddings_file)

### Generate Embeddings for Train Images

In [None]:
tokens = ids_from_chars.get_vocabulary()
len(tokens)

132

### Combining tokens and embeddings

In [None]:
# Pair every vocabulary token with the embedding row at the same position.
token_embeddings = {
    token: embeddings[position] for position, token in enumerate(tokens)
}

#### reverse dictionary lookup

In [None]:
# Reverse lookup: token -> its position in `tokens` (and so its embedding row).
char_to_index = {token: position for position, token in enumerate(tokens)}

In [None]:
def average_word_embeddings(word, char_embeddings, char_to_index):
  """
  Average the character embeddings for a word

  Arguments:
    word -- the word for which we want to generate the embedding
    char_embeddings -- a numpy array of shape (num_characters, embedding_size) containing the character embeddings
    char_to_index -- a dictionary that maps each character to its index in char_embeddings;
                     characters missing from it are looked up under the "[UNK]" key

  Returns:
    word_embedding -- the averaged character embeddings for the word
                      (a zero vector of length embedding_size for an empty word)
  """
  # An empty word has no characters to average; np.mean over zero rows would give NaN.
  if not word:
    return np.zeros(char_embeddings.shape[1], dtype=char_embeddings.dtype)
  # Membership is tested against the char_to_index argument rather than the
  # notebook-level `tokens` list, so the function depends only on its parameters.
  char_indices = [
      char_to_index[char] if char in char_to_index else char_to_index["[UNK]"]
      for char in word
  ]
  word_embedding = np.mean(char_embeddings[char_indices], axis=0)
  return word_embedding

### TESTING:

In [None]:
average_word_embeddings("सुन्तला1", char_embeddings, char_to_index)

array([-0.04833474,  0.02841681, -0.02949674, ...,  0.10344802,
        0.06044192, -0.03551155], dtype=float32)

In [None]:
def average_sentence_embeddings(sentence):
  """
  Average the word embeddings for a sentence

  Each word's embedding is the mean of its character embeddings (computed by
  average_word_embeddings); the sentence embedding is the mean over its words.
  Uses the notebook-level `char_embeddings` and `char_to_index`.

  Arguments:
    sentence -- the sentence for which we want to generate the embedding

  Returns:
    text_embeddings -- a one-element list holding the sentence embedding
                       (callers index it with [0]); a zero vector when the
                       sentence contains no words
  """
  # split() with no argument splits on any run of whitespace, so doubled spaces and a
  # trailing newline do not produce empty "words".
  words = sentence.split()
  # Nothing to average: return zeros instead of the NaN np.mean would produce.
  if not words:
    return [np.zeros(char_embeddings.shape[1], dtype=char_embeddings.dtype)]
  # Words are passed through unchanged. The vocabulary holds single characters, so the
  # former word-level `word not in tokens` check replaced almost every word with the
  # literal string "[UNK]"; unknown characters are already handled per character.
  word_vectors = [
      average_word_embeddings(word, char_embeddings, char_to_index)
      for word in words
  ]
  return [np.mean(word_vectors, axis=0)]


In [None]:
average_sentence_embeddings("सुन्तला र पहेँलो दाग भएको सानो कालो चरा, छोटो  टार्सस र मध्यम चुचुरोमा")[0]

array([-0.06284051, -0.0250615 , -0.04158952, ..., -0.02211666,
       -0.02491022,  0.01588372], dtype=float32)

In [None]:
average_sentence_embeddings("यो प्रायः कालो चराको पूरै शरीरमा कालो हुन्छ बाहेक कभरट्स ट्यानको रेखाको साथ चम्किलो रातो हुन्छ।\n")

[array([-0.05196976, -0.03763029, -0.04089294, ..., -0.01561808,
        -0.03914392,  0.02317065], dtype=float32)]

## Generate Embeddings for Text Descriptions

In [None]:
# Build one 2-D array per entry of text_description, with one row per sentence.
text_embeddings = []
for image_sentences in text_description:
  per_sentence = [average_sentence_embeddings(line)[0] for line in image_sentences]
  text_embeddings.append(np.array(per_sentence))

In [None]:
len(text_embeddings)

1056

In [None]:
text_embeddings[1].shape

(10, 1024)

In [None]:
779 - 543

236

## Save the first 779 as train_embeddings and the rest as test_embeddings

In [None]:
# Training split: the first 779 entries of text_embeddings.
with open("train_embeddings.pickle", "wb") as train_file:
  pickle.dump(text_embeddings[:779], train_file)

In [None]:
# Test split: everything from index 779 onward.
with open("test_embeddings.pickle", "wb") as test_file:
  pickle.dump(text_embeddings[779:], test_file)