### Next Word Prediction Model Training Pipeline
This notebook trains an LSTM-based next-word prediction model on a cleaned article dataset.

In [None]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import Sequence
import os

In [None]:
# Read the pickled article DataFrame from disk and report how many rows it holds
df = pd.read_pickle('final_nlp_data.pkl')
row_total = len(df)
print(f"Loaded {row_total} rows")

Loaded 192363 rows


In [None]:
# Fit tokenizer on a sample of the data for efficiency

# The output directory must exist before anything is written under models/;
# creating it first keeps the cell runnable on a fresh checkout.
os.makedirs('models', exist_ok=True)

# Fixed random_state keeps the 100-row sample reproducible across runs
sample_df = df.sample(100, random_state=42)

# Save the sampled rows to disk
sample_df.to_pickle('models/sample_df.pkl')

# Split every article into sentences and keep only those with more than
# three whitespace-separated words.
sentences = []
for text in sample_df['clean_text']:
    sentences.extend([s for s in sent_tokenize(text) if len(s.split()) > 3])

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Save tokenizer
with open('models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [5]:
# Data generator for large-scale training
class NGramSequenceGenerator(Sequence):
    """Keras ``Sequence`` that yields padded n-gram batches for next-word training.

    On construction the pickled DataFrame at ``df_path`` is loaded, its
    ``clean_text`` column is split into sentences, and every sentence is
    expanded into all of its token prefixes of length >= 2. Each batch pads
    those prefixes to ``max_seq_len`` and splits them into inputs (all but the
    last token) and a one-hot target (the last token).

    Args:
        df_path: Path to a pickled DataFrame with a ``clean_text`` column.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        max_seq_len: Length every n-gram is padded to, target included.
        batch_size: Number of n-grams per batch.
        **kwargs: Forwarded to the Keras base class (e.g. ``workers``,
            ``use_multiprocessing`` on Keras versions that accept them).
    """

    def __init__(self, df_path, tokenizer, max_seq_len=30, batch_size=128, **kwargs):
        # Initialise the Keras base class; recent Keras releases warn when a
        # Sequence/PyDataset subclass skips this call.
        super().__init__(**kwargs)
        self.df = pd.read_pickle(df_path)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.sentences = self._prepare_sentences()
        self.ngrams = self._create_ngram_sequences()

    def _prepare_sentences(self):
        """Return all sentences with more than three whitespace-separated words."""
        sentences = []
        for text in self.df['clean_text']:
            sentences.extend([s for s in sent_tokenize(text) if len(s.split()) > 3])
        return sentences

    def _create_ngram_sequences(self):
        """Expand each tokenised sentence into its prefixes of length 2..len."""
        sequences = []
        for line in self.sentences:
            token_list = self.tokenizer.texts_to_sequences([line])[0]
            # A prefix needs at least one input token plus one target token,
            # hence the start at 1 (slice end i + 1 >= 2).
            for i in range(1, len(token_list)):
                sequences.append(token_list[:i+1])
        return sequences

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.ngrams) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)``.

        ``X`` has ``max_seq_len - 1`` columns (the left-padded context) and
        ``y`` is the one-hot encoding of the final token over
        ``len(tokenizer.word_index) + 1`` classes.
        """
        batch = self.ngrams[idx * self.batch_size:(idx + 1) * self.batch_size]
        # Left-pad so the target token always sits in the last column
        input_padded = pad_sequences(batch, maxlen=self.max_seq_len, padding='pre')
        X = input_padded[:, :-1]
        y = input_padded[:, -1]
        y = tf.keras.utils.to_categorical(y, num_classes=len(self.tokenizer.word_index)+1)
        return X, y

In [None]:
# Build and train the model
import contextlib
import warnings

warnings.filterwarnings("ignore")

# Length of every padded n-gram, target token included; the network itself
# sees MAX_SEQ_LEN - 1 context tokens.
MAX_SEQ_LEN = 30
# Same cap as the tokenizer's num_words, so every emitted index fits the embedding
EMBEDDING_VOCAB = 10000
# Sample file written by the tokenizer-fitting cell
SAMPLE_PATH = 'models/sample_df.pkl'


def build_model(output_classes, context_len):
    """Return the uncompiled two-layer LSTM next-word classifier.

    Args:
        output_classes: Size of the softmax output layer.
        context_len: Number of context tokens per input sample.
    """
    return Sequential([
        Embedding(input_dim=EMBEDDING_VOCAB, output_dim=128, input_length=context_len),
        LSTM(256, return_sequences=True),
        LSTM(128),
        Dense(128, activation='relu'),
        Dense(output_classes, activation='softmax')
    ])


# Pin construction to the first GPU when one is visible; otherwise leave
# device placement to TensorFlow's default.
if tf.config.list_physical_devices('GPU'):
    device_scope = tf.device('/GPU:0')
else:
    device_scope = contextlib.nullcontext()

with device_scope:
    model = build_model(len(tokenizer.word_index) + 1, MAX_SEQ_LEN - 1)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

train_gen = NGramSequenceGenerator(SAMPLE_PATH, tokenizer, max_seq_len=MAX_SEQ_LEN)

# Keep only the weights with the lowest training loss seen so far
checkpoint = ModelCheckpoint('models/nextword_model.h5', monitor='loss', save_best_only=True)
model.fit(train_gen, epochs=100, callbacks=[checkpoint])

In [None]:
def greedy_sample(preds):
    """Return the index of the largest entry in ``preds`` (greedy decoding)."""
    scores = np.asarray(preds)
    best_index = scores.argmax()
    return best_index

def generate_next_words(seed_text, model, tokenizer, max_seq_len, num_words=10):
    """Extend ``seed_text`` with up to ``num_words`` greedily predicted words.

    Args:
        seed_text: Text to continue.
        model: Model whose ``predict`` returns one probability vector per
            input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        max_seq_len: Sequence length used during training, target included;
            the context is padded to ``max_seq_len - 1`` tokens.
        num_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words, space-separated.
    """
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        next_index = greedy_sample(predicted_probs)
        next_word = tokenizer.index_word.get(next_index, '')
        if not next_word:
            # The predicted index has no word (e.g. the padding index), so the
            # seed stays unchanged. Greedy decoding on an unchanged seed would
            # give the same prediction on every remaining iteration, so stop
            # here instead of repeating the model call.
            break
        seed_text += " " + next_word
    return seed_text

In [None]:
# Reload the best checkpoint and the fitted tokenizer from disk
model = tf.keras.models.load_model('models/nextword_model.h5')
# NOTE: unpickling can execute arbitrary code; only load tokenizer files
# produced by this notebook. The context manager closes the file handle.
with open('models/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
# Must equal the max_seq_len the generator used during training
max_seq_len = 30

In [None]:
# Example usage: continue a seed phrase with the default number of predicted words
generated_text = generate_next_words("Artificial intelligence is", model, tokenizer, max_seq_len)
print(generated_text)