In [None]:
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pexpect.replwrap import python
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the first 300,000 rows of the headlines CSV.
df = pd.read_csv('abcnews-date-text.csv', nrows=300000)

# Drop missing headlines *before* casting to str: astype(str) would otherwise
# turn NaN into the literal text "nan", which the cleaning step would then
# keep as an ordinary word in the training corpus.
text_data = df['headline_text'].dropna().astype(str).tolist()

In [None]:
# Make sure the NLTK corpora used below are installed so the notebook runs on
# a fresh environment. nltk.download skips resources that are already present
# and up to date, so re-running this cell is cheap.
for resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

# English stop words as a set for O(1) membership tests inside clean_text.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused for every headline.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw string into a space-separated string of lemmatized tokens.

    The text is lower-cased, every character that is not a-z or whitespace is
    replaced by a space, and whitespace runs are collapsed. Tokens that are a
    single character long or appear in the module-level ``stop_words`` set are
    dropped; the remaining tokens are passed through the module-level
    ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when nothing survives).
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())
    normalized = re.sub(r"\s+", " ", letters_only).strip()

    kept_tokens = []
    for token in normalized.split():
        # Skip stop words and one-letter leftovers.
        if token in stop_words or len(token) <= 1:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)


In [None]:
# Clean every headline, then tokenize the non-empty results into word lists
# (the list-of-token-lists form is what Word2Vec is trained on below).
cleaned_texts = list(map(clean_text, text_data))
sentences = [headline.split() for headline in cleaned_texts if headline.strip()]
print(f"exmple sentences: {sentences[0]}")

exmple sentences: ['aba', 'decides', 'community', 'broadcasting', 'licence']


In [None]:
# Train word embeddings on the tokenized headlines.
w2v_model = word2vec.Word2Vec(
    sentences=sentences,
    # Model shape / vocabulary
    vector_size=100,   # embedding dimensionality, reused later as embedding_dim
    min_count=5,       # words seen fewer than 5 times get no vector
    # Training objective (sg=1 selects skip-gram per the gensim docs)
    sg=1,
    window=5,
    negative=10,
    sample=1e-4,
    # Training run
    epochs=10,
    workers=4,
)

In [None]:
# Fit the word -> integer vocabulary on the cleaned headlines.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(cleaned_texts)
# One extra slot on top of the fitted vocabulary (used for the embedding rows
# and the size of the softmax output layer).
vocab_size = 1 + len(tokenizer.word_index)

# Every headline contributes each of its prefixes of length >= 2, so that the
# last token of a prefix can serve as the prediction target for the tokens
# before it. Blank headlines tokenize to an empty list and add no prefixes.
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(cleaned_texts)
    for end in range(2, len(tokens) + 1)
]

max_sequence_length = max(len(seq) for seq in input_sequences)

# Left-pad to a common length, then split off the final column as the label.
x = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
x_train, y_train = x[:, :-1], x[:, -1]

print(x_train.shape)
print(len(tokenizer.word_index))


(1256468, 9)
34448


In [None]:
# Build the initial embedding table: one row per tokenizer index, filled with
# the Word2Vec vector when the word has one and left as zeros otherwise.
embedding_dim = w2v_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))

word_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]

print(embedding_matrix.shape)

(34449, 100)


In [None]:
# Next-word model: embedded context tokens -> LSTM -> softmax over the vocabulary.
model_next_word = keras.models.Sequential([
    # Each sample is the padded context, i.e. a sequence minus its final token.
    layers.Input(shape=(max_sequence_length - 1,)),
    # Start from the Word2Vec vectors and keep fine-tuning them during training.
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    ),
    layers.LSTM(128),
    layers.Dense(vocab_size, activation='softmax'),
])

In [None]:
# The labels in y_train are integer word indices (not one-hot vectors), hence
# the sparse variant of categorical cross-entropy.
model_next_word.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_next_word.summary()

In [None]:

# EarlyStopping comes from tensorflow.keras (imported in the notebook's import
# cell) so that the callback and the model are built from the same Keras
# package instead of mixing in the standalone `keras` distribution.
# Training halts once val_loss has not improved for 5 epochs, and the weights
# from the best epoch are restored.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Keep the returned History object so the per-epoch metrics stay available
# for inspection or plotting after training.
history = model_next_word.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=128,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)


Epoch 1/30
[1m8835/8835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 16ms/step - accuracy: 0.0352 - loss: 7.7404 - val_accuracy: 0.0984 - val_loss: 6.6480
Epoch 2/30
[1m8835/8835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 16ms/step - accuracy: 0.1072 - loss: 6.2628 - val_accuracy: 0.1124 - val_loss: 6.4494
Epoch 3/30
[1m8835/8835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 15ms/step - accuracy: 0.1297 - loss: 5.8155 - val_accuracy: 0.1193 - val_loss: 6.4199
Epoch 4/30
[1m8835/8835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 15ms/step - accuracy: 0.1461 - loss: 5.5226 - val_accuracy: 0.1188 - val_loss: 6.4583
Epoch 5/30
[1m8835/8835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 15ms/step - accuracy: 0.1597 - loss: 5.3075 - val_accuracy: 0.1182 - val_loss: 6.5254
Epoch 6/30
[1m8835/8835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 15ms/step - accuracy: 0.1739 - loss: 5.1275 - val_accuracy: 0.1182 - val_loss: 6.598

<keras.src.callbacks.history.History at 0x7c7f401d93d0>

In [None]:
def predict_next_words(model, tokenizer, max_sequence_len, seed_text, num_words_to_predict=1):
    """Greedily extend ``seed_text`` with the model's most probable next words.

    On each step the text generated so far is cleaned with ``clean_text``,
    tokenized, left-padded/truncated to ``max_sequence_len - 1`` tokens and fed
    to ``model``; the word with the highest predicted probability is appended.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-index probabilities for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (and, when available, ``index_word``).
        max_sequence_len: Length of the full training sequences; the model
            input is one token shorter.
        seed_text: Text to start from.
        num_words_to_predict: Number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words. Generation stops early
        (with a printed message) when the cleaned text contains no tokens or
        when the predicted index maps to no word, e.g. the padding index 0.
    """
    # Resolve the index -> word table once per call instead of scanning the
    # whole word_index dictionary for every generated word.
    index_to_word = getattr(tokenizer, "index_word", None) or {
        index: word for word, index in tokenizer.word_index.items()
    }

    generated_text = seed_text
    for _ in range(num_words_to_predict):
        cleaned_seed = clean_text(generated_text)
        token_list = tokenizer.texts_to_sequences([cleaned_seed])[0]

        if not token_list:
            # Message: "No valid words were found in the input text."
            print("لم يتم العثور على كلمات صالحة في النص المدخل .")
            return generated_text

        padded_token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted_probs = model.predict(padded_token_list, verbose=0)[0]
        # Cast to a plain int so the dictionary lookup does not depend on the
        # numpy integer type returned by argmax.
        predicted_index = int(np.argmax(predicted_probs))

        predicted_word = index_to_word.get(predicted_index, "")
        if not predicted_word:
            # Message: "No predicted word was found."
            print("لم يتم العثور على كلمة متوقعة.")
            break

        generated_text += " " + predicted_word

    return generated_text



def _show_prediction(seed_text, n_words):
    """Extend ``seed_text`` by ``n_words`` predicted words, print the pair, and return the result."""
    completed = predict_next_words(
        model_next_word,
        tokenizer,
        max_sequence_length,
        seed_text,
        num_words_to_predict=n_words,
    )
    # Printed line reads: "Original text: '<seed>' -> Prediction: '<completed>'".
    print(f"النص الأصلي: '{seed_text}' -> التوقع: '{completed}'")
    return completed


seed_1 = "police investigate"
predicted_1 = _show_prediction(seed_1, 3)

seed_2 = "government announce"
predicted_2 = _show_prediction(seed_2, 2)

seed_3 = "australia"
predicted_3 = _show_prediction(seed_3, 5)

seed_4 = "global warming"
predicted_4 = _show_prediction(seed_4, 4)

seed_5 = "egypt"
predicted_5 = _show_prediction(seed_5, 3)

النص الأصلي: 'police investigate' -> التوقع: 'police investigate fatal car crash'
النص الأصلي: 'government announce' -> التوقع: 'government announce new law'
النص الأصلي: 'australia' -> التوقع: 'australia take lead world cup final'
النص الأصلي: 'global warming' -> التوقع: 'global warming may help ease water'
النص الأصلي: 'egypt' -> التوقع: 'egypt train crash kill'
