### Prasad Rajesh Posture  
**Batch**: July 2022  
Artificial Intelligence  

**Task :** Create a deep learning model that generates meaningful text based on your input.

In [1]:
#Import Dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load data
# Read the whole input corpus from a plain-text file into one string.
# Project Gutenberg is where the data can be found.
# The `with` block closes the file handle as soon as the text has been read,
# and the explicit encoding keeps decoding independent of the machine's locale.
with open("../input/frankenstein/frankenstein.txt", encoding="utf-8") as corpus_file:
    file=corpus_file.read()

In [3]:
# Tokenization and standardization
# Tokenization is the process of breaking a stream of text up into words,
# phrases, symbols, or other meaningful elements.
def tokenize_words(input):
    """Lowercase text, split it into word tokens, and drop English stopwords.

    Args:
        input: The raw text to process, as a string.

    Returns:
        One string containing the remaining tokens joined by single spaces.
    """
    input=input.lower()
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(input)
    # Build the stopword set once up front. Calling stopwords.words('english')
    # inside the filter would re-evaluate it and scan the resulting list
    # linearly for every single token.
    english_stopwords=set(stopwords.words('english'))
    filtered=(token for token in tokens if token not in english_stopwords)
    return " ".join(filtered)
processed_inputs=tokenize_words(file)

In [4]:
# Characters to numbers: give every distinct character in the processed
# text an integer id, assigned in sorted order.
chars=sorted(set(processed_inputs))
char_to_num={character: number for number, character in enumerate(chars)}

In [5]:
# Sanity check: report the length of the processed text and the number of
# distinct characters found in it.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269566
Total vocab: 38


In [6]:
# Sequence length: how many characters make up one input window.
seq_length=100
# Containers for the encoded input windows and their target characters.
x_data, y_data = [], []

In [7]:
# Slide a window over the processed text: each run of seq_length characters
# becomes one input sequence, and the character right after it is the target.
# Start from empty lists here so that re-running this cell rebuilds the data
# instead of appending a second copy of every pattern.
x_data=[]
y_data=[]
for i in range(0,input_len-seq_length,1):
    in_seq=processed_inputs[i:i+seq_length]
    out_seq=processed_inputs[i+seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns=len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269466


In [8]:
# Turn the list of integer sequences into an array of shape
# (n_patterns, seq_length, 1), then divide by the vocabulary size to scale it.
encoded_sequences=numpy.array(x_data).reshape(n_patterns, seq_length, 1)
X=encoded_sequences/float(vocab_len)

In [9]:
# One-hot encode the target characters.
# num_classes pins the number of columns to the vocabulary size; without it
# the width is inferred from the largest id present in y_data, which can be
# smaller than the vocabulary if the highest-numbered character never occurs
# as a target.
y=np_utils.to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Create the model: three stacked 256-unit LSTM layers, each followed by
# dropout at rate 0.2, ending in a softmax layer with one unit per column of y.
model=Sequential([
    LSTM(256, input_shape=(X.shape[1],X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1],activation='softmax'),
])

2022-07-27 06:02:44.576114: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 06:02:44.697261: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 06:02:44.698100: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 06:02:44.699827: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [11]:
# Compile the model with the Adam optimizer and categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [12]:
# Checkpoint callback: save to `filepath` whenever the monitored training
# loss reaches a new minimum.
filepath='model_weights_saved.hdf5'
checkpoint=ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks=[checkpoint]

In [13]:
# Fit the model and let it train for 10 epochs with the checkpoint callback.
# Keep the returned History object: it stays available for inspecting the
# training run afterwards, and assigning it stops its repr from being echoed
# as the cell's output.
history=model.fit(X,y,epochs=10,batch_size=256,callbacks=desired_callbacks)

2022-07-27 06:02:48.444967: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10


2022-07-27 06:02:52.827949: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005



Epoch 00001: loss improved from inf to 2.78743, saving model to model_weights_saved.hdf5
Epoch 2/10

Epoch 00002: loss improved from 2.78743 to 2.45650, saving model to model_weights_saved.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.45650 to 2.20228, saving model to model_weights_saved.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.20228 to 2.02055, saving model to model_weights_saved.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.02055 to 1.90130, saving model to model_weights_saved.hdf5
Epoch 6/10

Epoch 00006: loss improved from 1.90130 to 1.82192, saving model to model_weights_saved.hdf5
Epoch 7/10

Epoch 00007: loss improved from 1.82192 to 1.76161, saving model to model_weights_saved.hdf5
Epoch 8/10

Epoch 00008: loss improved from 1.76161 to 1.71275, saving model to model_weights_saved.hdf5
Epoch 9/10

Epoch 00009: loss improved from 1.71275 to 1.67817, saving model to model_weights_saved.hdf5
Epoch 10/10

Epoch 00010: loss improved from 1.67817 to 1.64654, savin

<keras.callbacks.History at 0x7f5de37bb510>

In [14]:
# Load the saved weights back into the model, then compile it again with the
# same optimizer and loss as before.
filename='model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [15]:
# Reverse lookup: map each integer id back to its character so the model's
# output can be turned into text.
num_to_char=dict(enumerate(chars))

In [16]:
# Pick a random training sequence to seed text generation.
# numpy.random.randint excludes its upper bound, so len(x_data) is the bound
# that lets every pattern, including the last one, be selected.
start=numpy.random.randint(0,len(x_data))
# Work on a copy: `pattern` is modified during generation, and sharing the
# list object with x_data would silently alter the stored training sequence.
pattern=list(x_data[start])
print("Random Seed :")
print("\"",''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed :
" sh combined disdain malignity unearthly ugliness rendered almost horrible human eyes scarcely observ "


In [17]:
# Generate the text: predict 1000 characters one at a time, feeding each
# prediction back in as the newest element of the input window.
for i in range(1000):
    # Shape the current window like the training input and scale it the same way.
    x=numpy.reshape(pattern,(1,len(pattern),1))
    x=x/float(vocab_len)
    prediction=model.predict(x, verbose=0)
    # Take the most probable next character; store its id as a plain int.
    index=int(numpy.argmax(prediction))
    result=num_to_char[index]
    sys.stdout.write(result)
    # Slide the window forward by building a new list. Appending in place
    # would also modify the x_data entry that `pattern` may still refer to.
    pattern=list(pattern[1:])+[index]

ed see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see see s

The End