<a href="https://colab.research.google.com/github/saihaneesh26/4thsem-daa-lab/blob/main/NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import os

In [None]:
# Fetch the Shakespeare corpus and keep the path that get_file returns.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)


In [None]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read is done.
with open(path_to_file, 'rb') as corpus_file:
  train_text = corpus_file.read().decode(encoding="utf-8")

In [None]:
# Total number of characters in the decoded corpus.
len(train_text)

1115394

In [None]:
# Build the vocabulary: every distinct character of the corpus, sorted.
# (The printed label says "words", but each entry is a single character.)
vocab = list(set(train_text))
vocab.sort()
print("Unique words :", len(vocab))

Unique words : 65


In [None]:
# Lookup tables between characters and integer ids.
char2idx = dict(zip(vocab, range(len(vocab))))  # character -> id
idx2char = np.array(vocab)                      # id -> character

# Encode the whole corpus as an array of ids and peek at the first two.
encoded_chars = [char2idx[ch] for ch in train_text]
text_as_input = np.array(encoded_chars)
print(text_as_input[:2])

[18 47]


In [None]:
# Create the dataset: stream the encoded corpus one id at a time, then cut the
# stream into fixed-length chunks.
seq_len = 100  # input length; all training inputs must have the same length
char_data = tf.data.Dataset.from_tensor_slices(text_as_input)
# Each chunk holds seq_len + 1 ids; a shorter trailing chunk is dropped.
seq = char_data.batch(seq_len + 1, drop_remainder=True)

In [None]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair shifted by one position.

  The input is the chunk without its last element and the target is the chunk
  without its first element, so target[i] is the element following input[i].
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = seq.map(split_input_target)


In [None]:
Buffersize = 10000  # size of the shuffle buffer
BATCHSIZE = 64      # examples per training batch
# Rebuild the pipeline from `seq` instead of from the previous value of
# `dataset`, so running this cell a second time does not batch data that has
# already been batched.
dataset = (
    seq.map(split_input_target)
    .shuffle(Buffersize)
    .batch(BATCHSIZE, drop_remainder=True)
)
print(dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


In [None]:
#model hyperparameters
vocab_len = len(vocab)  # number of distinct characters in the corpus
embedding_dims = 256    # length of each embedding vector
RNN_units = 1024        # number of units in the recurrent (GRU) layer

In [None]:
def build_model(vocab_len,embedding_dims,RNN_units,batch_size):
  """Build the character-level model: Embedding -> GRU -> Dense.

  Args:
    vocab_len: Number of distinct tokens; sizes both the embedding table and
      the output layer.
    embedding_dims: Length of each embedding vector.
    RNN_units: Number of units in the GRU (Gated Recurrent Unit) layer.
    batch_size: Batch size fixed into the input shape (the GRU is created
      with stateful=True).

  Returns:
    An uncompiled tf.keras.Sequential model. The final Dense layer has no
    activation, so it emits vocab_len raw scores per time step.
  """
  # Input layer: maps integer ids to vectors. The sequence length is left as
  # None in the input shape.
  embedding = tf.keras.layers.Embedding(
      vocab_len, embedding_dims, batch_input_shape=[batch_size, None])
  # Recurrent layer: returns an output for every time step.
  recurrent = tf.keras.layers.GRU(
      RNN_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  # Output layer: one score per vocabulary entry.
  scores = tf.keras.layers.Dense(vocab_len)
  return tf.keras.Sequential([embedding, recurrent, scores])

In [None]:

# Training model: the batch dimension is fixed to BATCHSIZE.
model = build_model(
    vocab_len=vocab_len,
    embedding_dims=embedding_dims,
    RNN_units=RNN_units,
    batch_size=BATCHSIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [None]:
#compile
def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw logits.

  Args:
    labels: Integer class ids (the target characters).
    logits: Unnormalized model outputs. from_logits=True is passed because
      the model's final Dense layer is built without an activation.

  Returns:
    The result of tf.keras.losses.sparse_categorical_crossentropy.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# The model has a single output, so metrics is a flat list rather than a
# nested list.
model.compile(optimizer='adam',loss=loss,metrics=['accuracy'])


In [None]:
# Checkpoint callback that writes weights only. "{epoch}" in the file name is a
# placeholder that ModelCheckpoint fills in.
checkpoint_prefix = os.path.join('./training_checkpoints', "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [None]:
NUM_EPOCHS = 30 # 20 minimum
# Train on the shuffled, batched dataset with the checkpoint callback attached;
# the value returned by fit is kept in `history`.
history = model.fit(dataset,epochs = NUM_EPOCHS,callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Rebuild the network with a batch size of 1 for text generation and restore
# the most recent training checkpoint into it.
latest_checkpoint = tf.train.latest_checkpoint('./training_checkpoints')
if latest_checkpoint is None:
  # No checkpoint was found: stop here with a clear message, before `model`
  # is replaced, instead of failing inside load_weights.
  raise FileNotFoundError(
      "No checkpoint found in './training_checkpoints'; run the training cell first.")
model = build_model(vocab_len, embedding_dims, RNN_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_2 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_2 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [None]:
def generate_text(model,nums,temp,start):
  """Generate text one character at a time, starting from a seed string.

  The seed is converted to ids and fed to the model. One id is then sampled
  from the scores of the last position, converted back to a character, and
  fed in as the next input. This repeats until `nums` characters exist.

  Args:
    model: Model that is callable on a batch of id sequences and provides
      reset_states().
    nums: Number of characters to generate.
    temp: Temperature; the scores are divided by it before sampling, which
      adjusts the variability of the predictions. Must be greater than 0.
    start: Non-empty seed string. Every character must be a key of char2idx.

  Returns:
    `start` followed by the generated characters, as one string.

  Raises:
    ValueError: If `start` is empty or `temp` is not positive.
    KeyError: If `start` contains a character that is not in char2idx.
  """
  # Validate up front: an empty seed leaves no last position to sample from,
  # and a temperature of 0 or below would divide the scores by zero or flip
  # their order.
  if not start:
    raise ValueError("start must contain at least one character")
  if temp <= 0:
    raise ValueError("temp must be greater than 0")

  input_eval = [char2idx[s] for s in start] # string to numbers (vectorizing)
  input_eval = tf.expand_dims(input_eval, 0) # add the batch dimension
  text_generated = [] # list collecting the generated characters
  model.reset_states() # clear the hidden states in the RNN

  for _ in range(nums): # one iteration per generated character
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions, 0) # drop the batch dimension
    predictions /= temp
    # Sample an id for every position and keep the one for the last position.
    predicted_id = tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

    # The sampled id becomes the next input.
    input_eval = tf.expand_dims([predicted_id],0)
    text_generated.append(idx2char[predicted_id]) # id back to character

  return (start+"".join(text_generated))


In [None]:
# Show the architecture of the current `model` again before sampling.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_2 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_2 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [1]:
# Sample 500 characters at temperature 1, seeded with "Haneesh".
generated_text = generate_text(model, nums=500, temp=1, start="Haneesh")
print(generated_text)

NameError: ignored