In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import pickle
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.models import Sequential
import tensorflow as tf

In [None]:

# Read the raw training corpus into a list with one entry per line of the file.
# The encoding is passed explicitly so decoding does not depend on the runtime's
# locale default.
# NOTE(review): assumes train.en is UTF-8 encoded - confirm against the dataset.
with open('/content/drive/MyDrive/Natural-Language-Processing/train.en', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

In [None]:
sentences = sentences[:3000]

In [None]:
import re

def preprocess(sentence):
    """Normalise one raw line of text for tokenisation.

    The text is lower-cased, runs of digits are removed, every character that
    is neither an ASCII letter nor whitespace is removed (deleted, not
    replaced by a space, so "three-day" becomes "threeday"), and leading and
    trailing whitespace is stripped.

    Args:
        sentence: The raw string to clean.

    Returns:
        The cleaned string.
    """
    lowered = sentence.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_digits)
    return letters_only.strip()

sentences = [preprocess(sentence) for sentence in sentences]

In [None]:
def adjust_sentence_length(sentences, sequence_length):
    """Re-chunk a list of sentences into strings of `sequence_length` words.

    All sentences are treated as one continuous stream of whitespace-separated
    words, ignoring the original sentence boundaries. The stream is cut into
    consecutive groups of `sequence_length` words and each group is joined
    with single spaces.

    Args:
        sentences: Iterable of strings.
        sequence_length: Number of words per returned string; must be >= 1.

    Returns:
        A list of strings. Every string holds exactly `sequence_length` words
        except possibly the last one, which holds the leftover words. The list
        is empty when the input contains no words.

    Raises:
        ValueError: If `sequence_length` is smaller than 1. Without this check
            the chunking could never make progress.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # Flatten first: chunk boundaries depend only on the word stream, not on
    # where the original sentences started or ended.
    words = [word for sentence in sentences for word in sentence.split()]

    # Every chunk but the last is already full, so there is never room to merge
    # a short trailing chunk into its predecessor; no post-processing is needed.
    return [
        ' '.join(words[start:start + sequence_length])
        for start in range(0, len(words), sequence_length)
    ]

sentences = adjust_sentence_length(sentences, 20)

In [None]:
sentences[:20]

['the plot of the movie revolves around the life of two cancer patients kizie and manny the same as on',
 'the stone oommen chandy mullappally ramachandran ramesh chennithala and pk many were arrested and jailed what has happened they are',
 'doing politics the back houses a rear camera was it sometime during his prehuman existence at his birth as a',
 'human at his baptism or upon his being resurrected they didnt talk the driver immediately informed the police they have',
 'subsequently been expelled from the party to that end we can reflect on some past examples of courage it was',
 'the first time something like that was happening in my life i will be sending a letter to the cm',
 'in this regard he said the security personnel were caught off guard where is achhe din which way will you',
 'go in it became part of the mission san gabriel arcngel and then the rancho san gorgonio he was speaking',
 'at the inaugural session of the threeday conference and exhibition titled aurangzeb and

In [None]:
def generate_vocab_map(d, sentences):
    """Add every unseen word of `sentences` to the vocabulary dict `d`, in place.

    Words are the whitespace-separated pieces of each sentence. Each new word
    receives the next free positive integer, in order of first appearance.
    Index 0 is never assigned.

    Args:
        d: Dict mapping word -> integer index; mutated in place. May already
            contain entries, which are left unchanged.
        sentences: Iterable of strings to scan.

    Returns:
        None. The result is the mutated `d`.
    """
    # Continue after the largest index already in use so that calling this on
    # a pre-populated map cannot hand out an index that is already taken.
    next_index = max(d.values(), default=0) + 1
    for sentence in sentences:
        for word in sentence.split():
            if word not in d:
                d[word] = next_index
                next_index += 1

vocab_mapping = {}
generate_vocab_map(vocab_mapping, sentences)

In [None]:
def convert_int_to_text(int_sequences, vocab_map):
    """Decode sequences of integer tokens back into space-joined text.

    Args:
        int_sequences: Iterable of token sequences (each an iterable of ints).
        vocab_map: Dict mapping word -> integer index.

    Returns:
        A list with one string per input sequence. Tokens that have no entry
        in `vocab_map` are silently dropped.
    """
    # Invert the word -> index mapping once, up front.
    index_to_word = {idx: word for word, idx in vocab_map.items()}

    return [
        ' '.join(index_to_word[token] for token in seq if token in index_to_word)
        for seq in int_sequences
    ]

In [None]:
sequences = []

def generate_tokens(tokens_list, sentences, vocab_map):
    """Encode each sentence as a list of vocabulary indices.

    One list is appended to `tokens_list` per sentence, in order. Words are
    the whitespace-separated pieces of the sentence; words missing from
    `vocab_map` are skipped.

    Args:
        tokens_list: Output list, extended in place.
        sentences: Iterable of strings to encode.
        vocab_map: Dict mapping word -> integer index.

    Returns:
        None. The result is the mutated `tokens_list`.
    """
    for text in sentences:
        encoded = [vocab_map[w] for w in text.split() if w in vocab_map]
        tokens_list.append(encoded)

generate_tokens(sequences, sentences, vocab_mapping)

In [None]:
sequences[:10]

[[1, 2, 3, 1, 4, 5, 6, 1, 7, 3, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16],
 [1,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  12,
  24,
  25,
  26,
  27,
  12,
  28,
  29,
  30,
  31,
  32,
  33],
 [34,
  35,
  1,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  45,
  49,
  15,
  38],
 [50,
  48,
  45,
  51,
  52,
  53,
  45,
  54,
  55,
  32,
  56,
  57,
  1,
  58,
  59,
  60,
  1,
  61,
  32,
  62],
 [63,
  64,
  65,
  66,
  1,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  16,
  74,
  75,
  76,
  3,
  77,
  42,
  41],
 [1, 78, 79, 80, 81, 69, 41, 82, 83, 84, 7, 85, 86, 87, 88, 38, 89, 68, 1, 90],
 [83,
  91,
  92,
  93,
  94,
  1,
  95,
  96,
  26,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  86,
  106],
 [107,
  83,
  42,
  108,
  109,
  3,
  1,
  110,
  111,
  112,
  113,
  12,
  114,
  1,
  115,
  111,
  116,
  93,
  41,
  117],
 [48,
  1,
  118,
  119,
  3,
  1,
  120,
  121,
  12,
  122,
  123,
  124,
  12,
  125,
  126,
  38,
  127,
  3,
 

In [None]:
# Shallow copy of the per-chunk token lists.
tokens_list = list(sequences)

# Flatten the chunks into one continuous stream of token ids.
tokens = []
for word_seq in tokens_list:
    tokens.extend(word_seq)

tokens[:10]

[1, 2, 3, 1, 4, 5, 6, 1, 7, 3]

In [None]:
# Slide a window of `sequence_length` tokens over the flat token stream: each
# window is one training input and the token right after it is its target.
sequence_length = 20
input_sequence, targets = [], []

for i, next_token in enumerate(tokens[sequence_length:]):
    input_sequence.append(tokens[i:i + sequence_length])
    targets.append(next_token)

In [None]:
input_sequence[:5]

[[1, 2, 3, 1, 4, 5, 6, 1, 7, 3, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16],
 [2, 3, 1, 4, 5, 6, 1, 7, 3, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16, 1],
 [3, 1, 4, 5, 6, 1, 7, 3, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16, 1, 17],
 [1, 4, 5, 6, 1, 7, 3, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16, 1, 17, 18],
 [4, 5, 6, 1, 7, 3, 8, 9, 10, 11, 12, 13, 1, 14, 15, 16, 1, 17, 18, 19]]

In [None]:
targets[:10]

[1, 17, 18, 19, 20, 21, 22, 23, 12, 24]

In [None]:
X = np.array(input_sequence)
y = np.array(targets)

In [None]:
X

array([[   1,    2,    3, ...,   14,   15,   16],
       [   2,    3,    1, ...,   15,   16,    1],
       [   3,    1,    4, ...,   16,    1,   17],
       ...,
       [6568, 5568, 6569, ...,  156,  253, 1044],
       [5568, 6569, 6570, ...,  253, 1044,   38],
       [6569, 6570, 6571, ..., 1044,   38, 2036]])

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the next-word targets.
# - num_classes is fixed to the vocabulary size plus one (index 0 is never
#   assigned to a word), which is the width of the model's softmax layer; the
#   width no longer depends on which token ids happen to occur as targets.
# - Encoding from `targets` rather than from `y` itself keeps this cell
#   idempotent: re-running it does not one-hot encode an already encoded array.
y = to_categorical(targets, num_classes=len(vocab_mapping) + 1)

In [None]:
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Restore a previously saved model from Drive (the same path a later cell saves to).
# NOTE(review): the next cell rebinds `model` to a freshly built network, so when
# all cells are run in order the model loaded here is discarded. Run either this
# cell or the build/fit cells, not both.
model = tf.keras.models.load_model('/content/drive/MyDrive/Natural-Language-Processing/LSTM_Generator2')

In [None]:

# Size shared by the embedding table and the softmax output: one slot per
# vocabulary word plus one for index 0, which is never assigned to a word.
vocab_size = len(vocab_mapping) + 1

# Two stacked LSTM layers followed by a dense head; every stage is followed by
# batch normalisation and dropout. The final softmax yields one probability per
# vocabulary index.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(128),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(units=vocab_size, activation='softmax'),
])

model.compile(
    loss="categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
model.fit(X, y, batch_size = 32, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7a9431c990c0>

In [None]:
model.save('/content/drive/MyDrive/Natural-Language-Processing/LSTM_Generator2')

In [None]:
def predict_next_word(input_text):
    """Predict the word most likely to follow `input_text`.

    The text is cleaned with `preprocess` (the same normalisation applied to
    the training sentences), split on whitespace and encoded with
    `vocab_mapping`. Words that are not in the vocabulary are skipped, the
    same way `generate_tokens` skips them.

    Args:
        input_text: Context string.

    Returns:
        The word whose index has the highest predicted probability, or an
        empty string if that index has no word in `vocab_mapping`.

    Raises:
        ValueError: If no word of `input_text` is in the vocabulary, so there
            is nothing to feed the model.
    """
    word_tokens = preprocess(input_text).split()
    int_tokens = [vocab_mapping[token] for token in word_tokens if token in vocab_mapping]
    if not int_tokens:
        raise ValueError("input_text contains no words from the training vocabulary")

    # Explicit batch of one sequence as an array; verbose=0 keeps Keras from
    # printing a progress bar for every generated word.
    prediction = model.predict(np.array([int_tokens]), verbose=0)
    prediction_idx = np.argmax(prediction[0])
    return convert_int_to_text([[prediction_idx]], vocab_mapping)[0]


def generate_text(input_text, n_words, context_length=20):
    """Extend `input_text` by `n_words` words predicted one at a time.

    Each prediction is made from a sliding window holding the most recent
    `context_length` words (seed words plus words generated so far).

    Args:
        input_text: Seed text; split on whitespace.
        n_words: Number of words to generate.
        context_length: Maximum number of trailing words passed to
            `predict_next_word`. Defaults to 20, the window length used to
            build the training inputs in this notebook.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces.

    Raises:
        ValueError: If `context_length` is smaller than 1.
    """
    if context_length < 1:
        raise ValueError("context_length must be a positive integer")

    word_sequence = input_text.split()
    # Trim the seed up front: a seed longer than the window would otherwise
    # stay over-long forever, since only one word is dropped per word added.
    context = word_sequence[-context_length:]
    for _ in range(n_words):
        prediction = predict_next_word(' '.join(context))
        word_sequence.append(prediction)
        context.append(prediction)
        # Slide the window forward.
        context = context[-context_length:]

    return ' '.join(word_sequence)


In [None]:
generate_text("everyone is living happy", 10)



'everyone is living happy with you and how it has come even my mother'

In [None]:
generate_text("everyone is living happy", 30)



'is living happy with you and how it has come even my mother said that may be away your sight and hearing allah has come over there will come about this beat she'

In [None]:
generate_text("everyone is living happy today since", 50)



'everyone is living happy today since you past how does our difference you do what we will go about so bengal if it modi eyes they fabricated it then wont they had done so mumbai does his stand for bible israel along with pure jehovahs discussions to meet his daughter report on the accident of car'

In [None]:
generate_text("doing politics the back houses a rear camera was it sometime during his prehuman existence at his birth as a", 100)



'doing politics the back houses a rear camera was it sometime during his prehuman existence at his birth as a human at his baptism or upon his being resurrected they didnt talk the driver immediately informed the police they have subsequently been expelled from the party to that end we can reflect on some past examples of courage it was the first time something like that was happening in my life i will be sending a letter to the cm in this regard he said the security personnel were caught off guard where is achhe din which way will you go in it became part of the mission san gabriel arcngel and then the rancho san gorgonio he was speaking'

In [None]:
generate_text("The people around the world gathered", 200)



'The people around the world gathered on the day of judgment hindi film actor mahesh anand passes away another passenger suffered injuries in the mishap and why are despoiling and violence in front of me and why does quarreling occur and why is strife carried this approach has to change then there was violence meditation will be helpful to have peace of mind the couple are parents to an eightyear old daughter police is investigating into the matter the central delhi has by social media the issue has given its district and any lord i turn away from the students the students has not released on this hospital for beneficial for the country the construction of these roads is underway the turbopetrol is likely to get a sevenspeed dct auto option a decision on this will be taken in our next meeting enter your details that is our thinking his voice was wobbling they have perfected that urgent investigation you make the night to enter into the day and you make the day to enter into the night i