In [36]:
import random
import pickle

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [37]:
#import the dataset and make it into a list
text_df = pd.read_csv("./dataset/fake_or_real_news.csv")
text = list(text_df.text.values)
joined_text = " ".join(text)

In [67]:
#splitting and resulting individual word in a list
partial_text = joined_text[:1000000]
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_text.lower())

In [68]:
#removing duplicate word
unique_tokens = np.unique(tokens)
unique_tokens_index = {token: idx for idx, token in enumerate(unique_tokens)}

In [69]:
n_words = 10
input_words = []
next_words = []

for i in range(len(tokens) - n_words):
    input_words.append(tokens[i: + n_words])
    next_words.append(tokens[i + n_words])

In [70]:
x = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

In [71]:
for i, words in enumerate(input_words):
    for j, word in enumerate(words):
        x[i, j, unique_tokens_index[word]] = 1
    y[i, unique_tokens_index[next_words[i]]] = 1

In [72]:
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_tokens)))
model.add(Activation("softmax"))

  super().__init__(**kwargs)


In [44]:
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(x, y, batch_size=128, epochs=10, shuffle=True).history

Epoch 1/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m425s[0m 312ms/step - accuracy: 0.0514 - loss: 7.4496
Epoch 2/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 348ms/step - accuracy: 0.0540 - loss: 7.2937
Epoch 3/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m436s[0m 329ms/step - accuracy: 0.0525 - loss: 7.3112
Epoch 4/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m469s[0m 353ms/step - accuracy: 0.0504 - loss: 7.3280
Epoch 5/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 335ms/step - accuracy: 0.0508 - loss: 7.3518
Epoch 6/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m470s[0m 354ms/step - accuracy: 0.0522 - loss: 7.3440
Epoch 7/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m479s[0m 361ms/step - accuracy: 0.0515 - loss: 7.3542
Epoch 8/10
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 333ms/step - accuracy: 0.0515 - loss:

In [45]:
history = model.fit(x, y, batch_size=128, epochs=5, shuffle=True).history

Epoch 1/5
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 312ms/step - accuracy: 0.0501 - loss: 7.3810
Epoch 2/5
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 312ms/step - accuracy: 0.0497 - loss: 7.3954
Epoch 3/5
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 305ms/step - accuracy: 0.0496 - loss: 7.4004
Epoch 4/5
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 293ms/step - accuracy: 0.0486 - loss: 7.4023
Epoch 5/5
[1m1326/1326[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 296ms/step - accuracy: 0.0488 - loss: 7.4125


In [73]:
model.save("text_gen_model2.h5")
with open("history2.p", "wb") as f:
    pickle.dump(history, f)



In [74]:
model = load_model("text_gen_model2.h5")
history = pickle.load(open("history2.p", "rb"))



In [75]:
def predict_next_word(input_text, n_best):
    input_text = input_text.lower()
    x = np.zeros((1, n_words, len(unique_tokens)))
    for i, word in enumerate(input_text.split()):
        x[0, i, unique_tokens_index[word]] = 1
    
    predictions = model.predict(x)[0]
    return np.argpartition(predictions, -n_best)[-n_best:]

In [76]:
input_text = "The beauty of city"
input_words = input_text.lower().split()
for word in input_words:
    if word not in unique_tokens_index:
        print(f"Word '{word}' not in unique_tokens_index")
possible = predict_next_word(input_text, 5)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 660ms/step


In [80]:
for idx in possible:
    print(unique_tokens[idx])

recordings
podcast
aamaq
hub
geller


In [78]:
def generate_text(input_text, n_words, creativity=3):
    word_sequence = input_text.split()
    current = 0
    for _ in range(n_words):
        sub_sequence = " ".join(tokenizer.tokenize(" ".join(word_sequence).lower())[current:current+n_words])
        try:
            choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
        except:
            choice  = random.choice(unique_tokens)
        word_sequence.append(choice)
        current += 1
    return " ".join(word_sequence)

In [79]:
generate_text("I will have to look into this thing because I", 100, 10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

'I will have to look into this thing because I hindered feminism bin fund pundit dealerships todd guard guard happier civilians devices bombshell ritchie nor amanda mystery aggressive weapons unwanted examining jfk jfk withhold aloft exclaims milk hitler reset rejecting operates ll length prioritize foregone robin predictor prey pug underwriters ran hailing goto subatomic crowning reflection reinforcements http schizoid appoint boycott http respectful hacks baines leigh baines colleen heaviest weaver grey scrambles swedish scrambles rural shocked boasted newman band scrambles replacement landscape asking noble types proxy togetherness proponents uber wives lesbian doers lesbian christ christ tumultuous realist lifespans simmering 500 simmering methodical fairness hurry diagnosis undercut thermal application megyn light'