<a href="https://colab.research.google.com/github/saihaneesh26/NLP-proj/blob/main/NLP_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import os

In [2]:
# Fetch the Shakespeare corpus; the returned value is used below as the path to open.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [26]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager guarantees the file handle is closed after reading.
with open(path_to_file, 'rb') as corpus_file:
  train_text = corpus_file.read().decode(encoding="utf-8")
# Preview the first 100 characters.
train_text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [4]:
# Total number of characters in the decoded corpus.
len(train_text)

1115394

In [5]:
# Build the vocabulary: the sorted list of distinct characters in the corpus.
# Iterating a str yields characters, so this is a character-level vocabulary,
# not a word-level one; the printed label says so.
vocab = sorted(set(train_text))
print("Unique characters :",len(vocab))

Unique words : 65


In [6]:
# Character-level tokenisation: give every vocabulary character an integer id.
char2idx = dict(zip(vocab, range(len(vocab))))  # character -> id
idx2char = np.array(vocab)  # id -> character (the array position is the id)
# Encode the whole corpus as an array of ids.
text_as_input = np.array([char2idx[char] for char in train_text])
print(text_as_input[:2])

[18 47]


In [7]:
# Build the dataset of fixed-length chunks.
# Every training input must have the same length: seq_len ids.
seq_len = 100
char_data = tf.data.Dataset.from_tensor_slices(text_as_input)
# Chunks hold seq_len + 1 ids so that each one can later be split into a
# seq_len input and a seq_len target; the trailing partial chunk is dropped.
seq = char_data.batch(batch_size=seq_len + 1, drop_remainder=True)

In [8]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  The input is every element except the last and the target is every element
  except the first, so target[i] is the element that follows input[i].
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = seq.map(split_input_target)


In [9]:
Buffersize = 10000  # size of the shuffle buffer
BATCHSIZE = 64  # sequences per training batch
# NOTE: `dataset` is rebound from itself here, so re-running only this cell
# would batch an already-batched dataset; re-run the cell that creates
# `dataset` first.
dataset = dataset.shuffle(Buffersize)
dataset = dataset.batch(BATCHSIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


In [10]:
# Model hyperparameters (passed to build_model).
vocab_len = len(vocab)  # number of distinct characters; Embedding input size and Dense output size
embedding_dims = 256  # output size of the Embedding layer
RNN_units = 1024  # number of units in the GRU layer

In [11]:
# Architecture:
#   Embedding - input layer; turns integer ids into embedding_dims-sized vectors.
#   GRU       - recurrent layer of RNN_units Gated Recurrent Units.
#   Dense     - output layer with vocab_len values per time step.

def build_model(vocab_len,embedding_dims,RNN_units,batch_size):
  """Assemble the Embedding -> GRU -> Dense network.

  Args:
    vocab_len: vocabulary size; used as the Embedding input dimension and as
      the number of Dense outputs.
    embedding_dims: size of each embedding vector.
    RNN_units: number of GRU units.
    batch_size: fixed batch size baked into the input shape
      ([batch_size, None], i.e. any sequence length).

  Returns:
    An uncompiled tf.keras.Sequential model.
  """
  model = tf.keras.Sequential()
  model.add(
      tf.keras.layers.Embedding(
          vocab_len,
          embedding_dims,
          batch_input_shape=[batch_size, None],
      )
  )
  # return_sequences=True emits an output at every time step; stateful=True
  # keeps the recurrent state between calls until reset_states() is invoked.
  model.add(
      tf.keras.layers.GRU(
          RNN_units,
          return_sequences=True,
          stateful=True,
          recurrent_initializer='glorot_uniform',
      )
  )
  model.add(tf.keras.layers.Dense(vocab_len))
  return model

In [12]:

# Instantiate the training model for batches of BATCHSIZE sequences and list its layers.
model = build_model(
    vocab_len=vocab_len,
    embedding_dims=embedding_dims,
    RNN_units=RNN_units,
    batch_size=BATCHSIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [13]:
#compile
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  from_logits=True because the model's final Dense layer has no activation.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# `metrics` is a flat list: the nested-list form is meant for assigning metrics
# per output of a multi-output model, and this model has a single output.
model.compile(optimizer='adam',loss=loss,metrics=['accuracy'])


In [14]:
# Callback handed to model.fit; saves weights only, to a path templated on the epoch number.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=os.path.join('./training_checkpoints', "ckpt_{epoch}"),
)

In [15]:
# Train the model; checkpoint_callback is invoked by Keras during training.
NUM_EPOCHS = 5  # author's note: 20 minimum (presumably for usable output quality)
history = model.fit(
    x=dataset,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
# Rebuild the network with batch size 1 for text generation and restore the
# trained weights from the most recent checkpoint.
latest_checkpoint = tf.train.latest_checkpoint('./training_checkpoints')
if latest_checkpoint is None:
  # latest_checkpoint returns None when no checkpoint exists; fail with a
  # clear message instead of passing None on to load_weights.
  raise FileNotFoundError(
      "No checkpoint found in './training_checkpoints'; run the training cell first."
  )
model = build_model(vocab_len, embedding_dims, RNN_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Text generation: vectorize the seed, sample one character at a time, feed each
# sampled character back in as the next input, and join the result into a string.

def generate_text(model,choice,nums,temp,start,max_chars=None):
  """Generate text from `model`, continuing the seed string `start`.

  Characters are sampled one at a time until `nums` units (words or
  sentences) have been completed.

  Args:
    model: model to sample from. It is called on a batch holding a single id
      sequence and must provide reset_states().
    choice: 1 to count words (a unit ends at each generated space) or
      2 to count sentences (a unit ends at each generated newline).
    nums: number of units to generate; nothing is generated if it is <= 0.
    temp: sampling temperature; the logits are divided by it before sampling,
      so it must be positive.
    start: non-empty seed string. Every character must be a key of char2idx.
    max_chars: optional upper bound on the number of generated characters,
      guarding against a model that never emits the stop character.
      None (the default) means no bound.

  Returns:
    `start` followed by the generated characters.

  Raises:
    ValueError: if `choice` is not 1 or 2, `temp` is not positive, or
      `start` is empty.
    KeyError: if `start` contains a character that is not in char2idx.
  """
  # Validate before touching the model: any other choice would never
  # decrement `nums` and the loop below would not terminate.
  if choice not in (1, 2):
    raise ValueError("choice must be 1 (words) or 2 (sentences), got %r" % (choice,))
  if temp <= 0:
    raise ValueError("temp must be positive, got %r" % (temp,))
  if not start:
    raise ValueError("start must be a non-empty string")

  stop_char = ' ' if choice == 1 else '\n'  # character that completes one unit

  input_eval = [char2idx[s] for s in start] # string to numbers (vectorizing)
  input_eval = tf.expand_dims(input_eval, 0) # add the batch dimension
  text_generated = [] # generated characters, joined at the end
  model.reset_states() # clear the hidden state left by earlier calls

  while nums > 0 and (max_chars is None or len(text_generated) < max_chars):
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions, 0) # drop the batch dimension
    predictions /= temp
    # Sample from the logits and keep the sample for the last time step.
    predicted_id = tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

    next_char = idx2char[predicted_id]
    text_generated.append(next_char)
    if next_char == stop_char:
      nums -= 1

    # Only the new character is fed next; the stateful model carries the context.
    input_eval = tf.expand_dims([predicted_id],0)

  return (start+"".join(text_generated))


In [21]:
# Show the layers of the current `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Interactive driver: ask what to generate, how much of it, and the seed text.
d = ["Words","Sentences"]
ch = int(input("1.words 2.Sentences\nEnter choice:"))
# Reject anything but 1 or 2 up front: d[ch-1] would raise IndexError for 3 or
# more and would silently select "Sentences" for 0 (negative index), and
# generate_text only counts units for choice 1 or 2.
if ch not in (1, 2):
  raise ValueError("Choice must be 1 (words) or 2 (Sentences)")
val = int(input("Enter number of "+str(d[ch-1])+" required:"))
st = input("Enter starting word:")
generated_text = generate_text(
                    model,
                    choice=ch,
                    nums=val,
                    temp=1,
                    start=st)
print(generated_text)

1.words 2.Sentences
Enter choice:2
Enter number of Sentences required:5
Enter starting word:hello
hellow us two
eraves and them by thus God here thy ears.

SICINIUS:
Nowng them and Prepore--

