In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import re
import string
from string import digits

In [2]:
df = pd.read_csv("/kaggle/input/newdata33/newdata.txt", sep='\t', header=None)
df.head()

Unnamed: 0,0
0,They enjoyed a lazy afternoon at home.
1,He learned to play the piano in his spare time.
2,The birds sang sweetly in the early morning.
3,They visited an art museum to see the new exhi...
4,He went to the store to buy some fresh produce.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48320 entries, 0 to 48319
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       48320 non-null  object
dtypes: object(1)
memory usage: 377.6+ KB


In [4]:

# Lowercase all characters
df[0] = df[0].apply(lambda x: x.lower())

# Remove single and double quotes
df[0] = df[0].apply(lambda x: re.sub(r"[\"']", '', x))

# Set of all special characters
exclude = set(string.punctuation)
# Remove all the special characters
df[0] = df[0].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
df[0] = df[0].apply(lambda x: x.translate(remove_digits))

# Remove extra spaces
df[0] = df[0].apply(lambda x: x.strip())
df[0] = df[0].apply(lambda x: re.sub(" +", " ", x))


In [5]:
df.head()

Unnamed: 0,0
0,they enjoyed a lazy afternoon at home
1,he learned to play the piano in his spare time
2,the birds sang sweetly in the early morning
3,they visited an art museum to see the new exhi...
4,he went to the store to buy some fresh produce


In [6]:
# Assuming the text data is in the first column
text_data = df[0].astype(str).tolist()  # Convert column to a list of strings

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
print(f"Number of unique words: {len(tokenizer.word_index)}")

Number of unique words: 19122


In [None]:
tokenizer.word_index

In [7]:
# Save tokenizer
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)

In [8]:
# Assuming text data is in the first column of the DataFrame (df[0])
text_data = df[0].astype(str).tolist()  # Convert the column to a list of strings

# Generate subsequences
input_sequences = []
for sentence in text_data:
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        input_sequences.append(tokenized_sentence[:i+1])

print(f"Number of input sequences: {len(input_sequences)}")

Number of input sequences: 437518


In [None]:
input_sequences

In [9]:
#max length of sentence
max_len = max([len(x) for x in input_sequences])
max_len

21

In [10]:
#zero padding
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')
padded_input_sequences

array([[  0,   0,   0, ...,   0,   5,  30],
       [  0,   0,   0, ...,   5,  30,   2],
       [  0,   0,   0, ...,  30,   2, 597],
       ...,
       [  0,   0,   0, ...,  32,  10,   1],
       [  0,   0,   0, ...,  10,   1,  98],
       [  0,   0,   0, ...,   1,  98, 534]], dtype=int32)

In [11]:
# Prepare X and y using CountVectorizer
X = padded_input_sequences[:, :-1]
y = padded_input_sequences[:, -1]
print(X.shape, y.shape)

(437518, 20) (437518,)


In [12]:
# Convert y to categorical
vocab_size = len(tokenizer.word_index) + 1
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
vocab_size

19123

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

In [22]:
model = Sequential()
model.add(Input(shape=(X.shape[1],)))
model.add(Embedding(vocab_size, 100))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))

#compile
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [23]:
# Define the batch generator function
def generate_batch(X, y, batch_size=64):
    '''Generate a batch of data for training.'''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, X.shape[1]), dtype='float32')
            decoder_target_data = np.zeros((batch_size, y.shape[1]), dtype='float32')
            for i, (input_seq, target_seq) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                encoder_input_data[i] = input_seq
                decoder_target_data[i] = target_seq
            yield encoder_input_data, decoder_target_data

In [24]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint_callback = ModelCheckpoint(
    filepath='best_model.keras',  # File path to save the model
    monitor='val_loss',        # Metric to monitor (e.g., validation loss)
    save_best_only=True,       # Save only the best model
    mode='min',                # Mode for val_loss: minimize it
    verbose=1                  # Print messages when saving the model
)

In [25]:
# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',  # You can also use 'val_loss' if you have validation data
    patience=3,      # Stop training after 5 epochs with no improvement
    restore_best_weights=True  # Restore the best weights once training stops
)

In [26]:
batch_sizes = 128
steps_per_epochs = len(X) // batch_sizes

history = model.fit(
    generate_batch(X, y, batch_size=batch_sizes),
    epochs=60,
    steps_per_epoch=steps_per_epochs,
    verbose=1,
    callbacks=[checkpoint_callback,early_stopping]
)

Epoch 1/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 30ms/step - accuracy: 0.1017 - loss: 6.6479
Epoch 2/60
[1m   3/3418[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:42[0m 30ms/step - accuracy: 0.2144 - loss: 1.0251

  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.1937 - loss: 5.4411
Epoch 3/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.2446 - loss: 4.9853
Epoch 4/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.2724 - loss: 4.6950
Epoch 5/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.2864 - loss: 4.5006
Epoch 6/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.2937 - loss: 4.3234
Epoch 7/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.3001 - loss: 4.1866
Epoch 8/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.3058 - loss: 4.0600
Epoch 9/60
[1m3418/3418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 30ms/step - accuracy: 0.3122 - loss: 3.9535
Epoch 10/60

In [27]:
# Function to predict next top 3 words with probabilities
def predict_next_words(model, tokenizer, text, max_len, top_n=3):
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    padded_token_text = pad_sequences([tokenized_text], maxlen=max_len, padding='pre')
    predictions = model.predict(padded_token_text, verbose=0)[0]
    
    # Get the top N predictions
    top_indices = np.argsort(predictions)[-top_n:][::-1]
    top_words = [(word, predictions[index]) for word, index in tokenizer.word_index.items() if index in top_indices]
    return top_words

In [34]:
# Predict multiple words
text = "lets go"
for i in range(5):
    top_predictions = predict_next_words(model, tokenizer, text, max_len, top_n=3)
    print(f"Current text: '{text}'")
    print(f"Top predictions: {top_predictions}")
    next_word = top_predictions[0][0]  # Choose the word with the highest probability
    text += " " + next_word

Current text: 'lets go'
Top predictions: [('to', 0.19974668), ('on', 0.17169979), ('out', 0.21576026)]
Current text: 'lets go to'
Top predictions: [('the', 0.66113836), ('a', 0.020017266), ('visit', 0.140714)]
Current text: 'lets go to the'
Top predictions: [('beach', 0.38402003), ('farmers', 0.054403014), ('nearest', 0.09970752)]
Current text: 'lets go to the beach'
Top predictions: [('the', 0.018844157), ('to', 0.6291719), ('enjoying', 0.27504632)]
Current text: 'lets go to the beach the'
Top predictions: [('birds', 0.2208275), ('wind', 0.16478634), ('weather', 0.23065408)]


In [29]:
# Save model
model.save('model/model.h5')

# Save tokenizer
with open('model/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)
