# Natural Language Processing Assignment :
## Predictive Text :

- Name : Sharanya Dasgupta
- Roll N0. : CS2320

In [2]:
# Imports
import nltk
import json
nltk.download('punkt')
import numpy as np
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers import RMSprop

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Step 1 : Data Preprocessing

In [31]:
# News Article Data : https://www.kaggle.com/datasets/jillanisofttech/fake-or-real-news
dataframe = pd.read_csv("fake_or_real_news.csv")
dataframe.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [33]:
text_data = list(dataframe.text.values)
# Full raw text
merged_text = "".join(text_data) 

# Subset of data
sliced_text = merged_text[:10000]

# List of Sentences
corpus = nltk.sent_tokenize(sliced_text.lower())
len(corpus)

110

In [34]:
# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
total_words

719

# Step 2 : Create sequences of tokens from corpus for model training

In [35]:
# Creating input sequences of words
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Padding sequences to have equal length
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Splitting sequences into input and output
X = input_sequences[:,:-1]
y = input_sequences[:,-1]
y = np.eye(total_words)[y]

# Step 3 : Build the LSTM model

In [51]:
# Model
model = Sequential()
model.add(Embedding(total_words, 100))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(200))
model.add(Dense(total_words, activation='softmax'))

# Compiling
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training
model.fit(X, y, batch_size=128, epochs=100, shuffle=True)

# Saving
model.save("next-word-prediction.keras")
model = load_model("next-word-prediction.keras")

Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 199ms/step - accuracy: 0.0228 - loss: 6.4763
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 197ms/step - accuracy: 0.0517 - loss: 5.9302
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - accuracy: 0.0550 - loss: 5.8475
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 200ms/step - accuracy: 0.0547 - loss: 5.8053
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 196ms/step - accuracy: 0.0475 - loss: 5.8569
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 187ms/step - accuracy: 0.0505 - loss: 5.8503
Epoch 7/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 186ms/step - accuracy: 0.0555 - loss: 5.8065
Epoch 8/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 190ms/step - accuracy: 0.0578 - loss: 5.7787
Epoch 9/100
[1m13/13[0m [32m━

# Step 4: Predict the next word

In [52]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """
    This function takes a text, tokenizes it, and predicts the top 5 most probable next words
    based on the model.
    """

    token_list = tokenizer.texts_to_sequences([text])[0]

    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

    predicted_probs = model.predict(token_list, verbose=0)[0]

    top_5_indices = np.argsort(predicted_probs)[-5:][::-1]

    top_5_words = [tokenizer.index_word[i] for i in top_5_indices]

    return top_5_words

def interactive_lstm_prediction():
    """
    This function allows the user to type words interactively, and the LSTM model will
    predict the next word based on the current context. The user can select a predicted
    word or type their own custom word.
    """
    context = []
    print("Start typing! Type 'STOP' to exit.")

    user_input = input("Enter the first word to start: ")

    context.append(user_input.lower())

    while True:

        input_text = " ".join(context)
        print("\nCurrent context:", input_text)

        predictions = predict_next_word(model, tokenizer, input_text, max_sequence_len)
        print("Top 5 predictions:", predictions)

        user_input = input(f"Choose a word from list or type your own ({', '.join(predictions)}): ")

        if user_input.lower() == "stop":
            print("Exiting the prediction session.")
            break

        context.append(user_input)

interactive_lstm_prediction()

Start typing! Type 'STOP' to exit.

Current context: he
Top 5 predictions: ['accused', 'was', 'an', 'had', 'may']

Current context: he was
Top 5 predictions: ['a', 'on', 'appearing', 'previously', 'an']

Current context: he was a

Current context: he was a surreal
Top 5 predictions: ['moment', 'president', 'wound', 'about', 'himself']

Current context: he was a surreal president
Top 5 predictions: ['in', 'at', 'to', 'and', 'went']

Current context: he was a surreal president and
Top 5 predictions: ['the', 'his', 'media', 'a', 'time']

Current context: he was a surreal president and his
Top 5 predictions: ['nominee', '“principled”', 'house', 'sigh', 'clintons']

Current context: he was a surreal president and his nominee
Top 5 predictions: ['of', 'and', 'it', 'that', 'with']

Current context: he was a surreal president and his nominee of
Top 5 predictions: ['the', 'a', 'his', 'fighting', 'an']

Current context: he was a surreal president and his nominee of the
Top 5 predictions: ['kgb',