### Importing libraries

In [1]:
import random
import pickle
import heapq

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import RegexpTokenizer

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop



In [2]:
df = pd.read_csv(r"C:\Users\Owner\Desktop\code\NLP\fake_or_real_news_sent.csv")

In [3]:
df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df.shape

(6335, 4)

In [5]:
text = list(df.text.values)

joined = " ".join(text)

In [6]:
partial = joined[:200000]

In [7]:
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial.lower())

In [8]:
unique_tokens = np.unique(tokens)
unique_tokens_index = {token: idx for idx, token in enumerate(unique_tokens)}

In [9]:
len(unique_tokens_index)

5903

### Words to consider

In [10]:
n_words = 10
input_words = []
next_words = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i : i + n_words])
    next_words.append(tokens[i + n_words])

### Bag of Words for X and y

In [11]:
X = np.zeros((len(input_words),n_words,len(unique_tokens)),dtype=bool)
y = np.zeros((len(input_words),len(unique_tokens)),dtype=bool)

In [12]:
X

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, Fal

In [13]:
y

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [14]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        X[i, j, unique_tokens_index[word]] = 1
    y[i, unique_tokens_index[next_words[i]]] = 1

### Training the model

In [15]:
model = Sequential()
model.add(LSTM(128,input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

  super().__init__(**kwargs)


In [16]:
model.compile(loss="categorical_crossentropy", optimizer=RMSprop(learning_rate=0.01), metrics=["accuracy"])
model.fit(X,y,batch_size=128,epochs=30,shuffle=True)

Epoch 1/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 203ms/step - accuracy: 0.0503 - loss: 7.2283
Epoch 2/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 186ms/step - accuracy: 0.0633 - loss: 6.8301
Epoch 3/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 182ms/step - accuracy: 0.0750 - loss: 6.6004
Epoch 4/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 188ms/step - accuracy: 0.0937 - loss: 6.3135
Epoch 5/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 198ms/step - accuracy: 0.1101 - loss: 6.0883
Epoch 6/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 176ms/step - accuracy: 0.1238 - loss: 5.8405
Epoch 7/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 166ms/step - accuracy: 0.1452 - loss: 5.5382
Epoch 8/30
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 158ms/step - accuracy: 0.1591 - loss: 5.2943
Epoch 9/30
[1m2

<keras.src.callbacks.history.History at 0x17b43c59f10>

In [17]:
model.save("mymodel.h5")



In [18]:
model = load_model("mymodel.h5")



In [19]:
def next_word_predict(input_text, n_best):
    input_text = input_text.lower()
    X = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        X[0, i , unique_tokens_index[word]] = 1
    
    predictions = model.predict(X)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [29]:
possible = next_word_predict("What is going to be the president of the country",5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [30]:
possible

array([ 502, 5710, 2606, 4753, 5764], dtype=int64)

In [31]:
print([unique_tokens[idx] for idx in possible])

['at', 'was', 'i', 'should', 'where']


In [32]:
possible = next_word_predict("The current voting trends tell us",5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [33]:
possible

array([ 747,  430, 2618, 4420, 1479], dtype=int64)

In [35]:
print([unique_tokens[idx] for idx in possible])

['both', 'are', 'if', 'republicans', 'democrats']
