In [1]:
import os
print('Hello World')

Hello World


In [2]:
!pip install nltk scikeras -q

In [3]:
## Data Collection

import nltk
nltk.download('gutenberg')

from nltk.corpus import gutenberg

import pandas as pd

## loading data

data = gutenberg.raw('shakespeare-hamlet.txt')

## save as txt

with open('hamlet.txt','w') as f:
  f.write(data)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [4]:
## Data Preprocessomg

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split



In [5]:
with open('hamlet.txt','r') as f:
  data = f.read().lower()

## tokenize text

tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

total_words = len(tokenizer.word_index)+1
total_words

4818

In [6]:
# create input list

input_sequences = []

for line in data.split('\n'):
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1,len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)



In [7]:
input_sequences[:10]

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891]]

In [8]:
## pad sequences

max_sequnece_len = max(len(x) for x in input_sequences)
max_sequnece_len

14

In [9]:
input_sequences = np.array(pad_sequences(input_sequences,maxlen=max_sequnece_len,padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [10]:
## create predicitors and label

import tensorflow as tf

x,y = input_sequences[:,:-1],input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=24)

In [12]:
## train our LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

## Define Mode

model = Sequential()
model.add(Embedding(total_words,100,input_length=max_sequnece_len-1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100, return_sequences=True)) # Added return_sequences=True
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))

## Complile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [13]:
### Train the model

history = model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1)

Epoch 1/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 12ms/step - accuracy: 0.0299 - loss: 7.1615 - val_accuracy: 0.0325 - val_loss: 6.8268
Epoch 2/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0338 - loss: 6.5505 - val_accuracy: 0.0325 - val_loss: 6.8266
Epoch 3/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0386 - loss: 6.3844 - val_accuracy: 0.0395 - val_loss: 6.9061
Epoch 4/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0421 - loss: 6.3106 - val_accuracy: 0.0480 - val_loss: 6.9100
Epoch 5/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0469 - loss: 6.1631 - val_accuracy: 0.0460 - val_loss: 6.9094
Epoch 6/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0522 - loss: 6.0155 - val_accuracy: 0.0479 - val_loss: 6.9578
Epoch 7/50
[1m604

In [16]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    # Tokenize the input text
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Pad the sequence
    if len(token_list)>=max_sequence_len:
        token_list = token_list[:max_sequence_len]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicated = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicated, axis=1)
    for word, index in tokenizer.word_index.items():
      if index==predicted_word_index:
        return word
    return None


In [20]:
input_text = "You come most"
max_sequnece_len= model.input_shape[1]+1
nex_word = predict_next_word(model, tokenizer, input_text, max_sequnece_len)
print(nex_word)

course


In [22]:
#
model.save('nex_word_lstm.h5')

import pickle

with open('tokenizer.pkl','wb') as f:
  pickle.dump(tokenizer,f, protocol=pickle.HIGHEST_PROTOCOL)



In [23]:
!pip freeze > requirements.txt

In [24]:
# prompt: how to remove libararies from the requirements.txt which is not goona use in this file

import os
import nltk
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import pickle

# ... (rest of your code)

!pip freeze > requirements.txt
!pip install -r requirements.txt  # Reinstall using the new requirements.txt

Collecting cudf-cu12@ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.10.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (from -r requirements.txt (line 59))
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.10.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (24.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 (from -r requirements.txt (line 89))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
[?25hProcessing /colabtools/dist/google_colab-1