In [4]:
import random
import pickle
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

# Load the news dataset from a CSV file located in the current working directory.
text_df = pd.read_csv("fake_or_real_news.csv")

In [5]:
# Preview the loaded DataFrame.
text_df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [6]:
# Collect every article body into a list, then merge the articles into one
# space-separated string that serves as the raw corpus.
text = text_df["text"].tolist()
joined_text = " ".join(text)

In [51]:
# Keep only the first 100,000 characters of the corpus; the tokens built in the
# next cell come from this slice.
# NOTE(review): "partial_teext" is a misspelling of "partial_text". The
# tokenizing cell reads the variable under this spelling, so rename both together.
partial_teext = joined_text[:100000]


In [52]:
# Tokenize the lower-cased text into runs of word characters (\w+).
# Anything that is not a word character acts as a separator, so a word such as
# "it's" becomes the two tokens "it" and "s".
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(partial_teext.lower())



In [18]:
# Inspect the tokens.
# NOTE(review): this renders the entire list; a slice such as tokens[:20]
# would keep the notebook output short.
tokens

['daniel',
 'greenfield',
 'a',
 'shillman',
 'journalism',
 'fellow',
 'at',
 'the',
 'freedom',
 'center',
 'is',
 'a',
 'new',
 'york',
 'writer',
 'focusing',
 'on',
 'radical',
 'islam',
 'in',
 'the',
 'final',
 'stretch',
 'of',
 'the',
 'election',
 'hillary',
 'rodham',
 'clinton',
 'has',
 'gone',
 'to',
 'war',
 'with',
 'the',
 'fbi',
 'the',
 'word',
 'unprecedented',
 'has',
 'been',
 'thrown',
 'around',
 'so',
 'often',
 'this',
 'election',
 'that',
 'it',
 'ought',
 'to',
 'be',
 'retired',
 'but',
 'it',
 's',
 'still',
 'unprecedented',
 'for',
 'the',
 'nominee',
 'of',
 'a',
 'major',
 'political',
 'party',
 'to',
 'go',
 'war',
 'with',
 'the',
 'fbi',
 'but',
 'that',
 's',
 'exactly',
 'what',
 'hillary',
 'and',
 'her',
 'people',
 'have',
 'done',
 'coma',
 'patients',
 'just',
 'waking',
 'up',
 'now',
 'and',
 'watching',
 'an',
 'hour',
 'of',
 'cnn',
 'from',
 'their',
 'hospital',
 'beds',
 'would',
 'assume',
 'that',
 'fbi',
 'director',
 'james',
 'c

In [53]:
# Vocabulary: the sorted array of distinct tokens. A token's position in this
# array is its integer id.
unique_tokens = np.unique(tokens)
# Reverse lookup table: token -> id.
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

unique_token_index

{'0': 0,
 '000': 1,
 '1': 2,
 '10': 3,
 '100': 4,
 '106': 5,
 '10th': 6,
 '11': 7,
 '12': 8,
 '12pm': 9,
 '13': 10,
 '14': 11,
 '15': 12,
 '16': 13,
 '160': 14,
 '17': 15,
 '187': 16,
 '19': 17,
 '1939': 18,
 '1960': 19,
 '197': 20,
 '1971': 21,
 '1972': 22,
 '1983': 23,
 '1984': 24,
 '1985': 25,
 '1990s': 26,
 '1992': 27,
 '1994': 28,
 '1996': 29,
 '1998': 30,
 '1999': 31,
 '2': 32,
 '20': 33,
 '200': 34,
 '2000': 35,
 '20001': 36,
 '2001': 37,
 '2002': 38,
 '2003': 39,
 '2004': 40,
 '2006': 41,
 '2007': 42,
 '2008': 43,
 '2009': 44,
 '2010': 45,
 '2011': 46,
 '2012': 47,
 '2013': 48,
 '2014': 49,
 '2015': 50,
 '2016': 51,
 '2020': 52,
 '21': 53,
 '2117': 54,
 '21st': 55,
 '21wire': 56,
 '22': 57,
 '227': 58,
 '23': 59,
 '24': 60,
 '241': 61,
 '27': 62,
 '2nd': 63,
 '3': 64,
 '30': 65,
 '300': 66,
 '31': 67,
 '33': 68,
 '3pm': 69,
 '4': 70,
 '40': 71,
 '400': 72,
 '42': 73,
 '49': 74,
 '5': 75,
 '50': 76,
 '500': 77,
 '51': 78,
 '52': 79,
 '57': 80,
 '5pm': 81,
 '6': 82,
 '60': 83,
 '

In [54]:
# Length of the context window, in tokens.
n_words = 10

# Slide the window over the token stream one token at a time:
#   input_words[k] is the window that starts at token k
#   next_word[k]   is the token that immediately follows that window
n_samples = len(tokens) - n_words
input_words = [tokens[start:start + n_words] for start in range(n_samples)]
next_word = [tokens[start + n_words] for start in range(n_samples)]


In [55]:
# Inspect the target words.
# NOTE(review): this renders the entire list; consider next_word[:20].
next_word

['is',
 'a',
 'new',
 'york',
 'writer',
 'focusing',
 'on',
 'radical',
 'islam',
 'in',
 'the',
 'final',
 'stretch',
 'of',
 'the',
 'election',
 'hillary',
 'rodham',
 'clinton',
 'has',
 'gone',
 'to',
 'war',
 'with',
 'the',
 'fbi',
 'the',
 'word',
 'unprecedented',
 'has',
 'been',
 'thrown',
 'around',
 'so',
 'often',
 'this',
 'election',
 'that',
 'it',
 'ought',
 'to',
 'be',
 'retired',
 'but',
 'it',
 's',
 'still',
 'unprecedented',
 'for',
 'the',
 'nominee',
 'of',
 'a',
 'major',
 'political',
 'party',
 'to',
 'go',
 'war',
 'with',
 'the',
 'fbi',
 'but',
 'that',
 's',
 'exactly',
 'what',
 'hillary',
 'and',
 'her',
 'people',
 'have',
 'done',
 'coma',
 'patients',
 'just',
 'waking',
 'up',
 'now',
 'and',
 'watching',
 'an',
 'hour',
 'of',
 'cnn',
 'from',
 'their',
 'hospital',
 'beds',
 'would',
 'assume',
 'that',
 'fbi',
 'director',
 'james',
 'comey',
 'is',
 'hillary',
 's',
 'opponent',
 'in',
 'this',
 'election',
 'the',
 'fbi',
 'is',
 'under',
 '

In [None]:
# Inspect the context windows.
# NOTE(review): this renders every window; consider input_words[:5].
input_words

In [95]:
# Allocate the one-hot training arrays, initially all False:
#   x[sample, position, token_id] will flag the token at each position of a context window
#   y[sample, token_id]           will flag the word that follows that window
# Memory grows with samples * n_words * vocabulary size, so a larger text slice
# or vocabulary makes x expensive quickly.
x = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
y = np.zeros((len(next_word), len(unique_tokens)), dtype=bool)


In [29]:
# Inspect the input array (NumPy abbreviates the display of large arrays).
x

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, Fal

In [28]:
# Inspect the target array (NumPy abbreviates the display of large arrays).
y

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [96]:
# One-hot encode the training data: for every sample, switch on the vocabulary
# id of each word in its context window (x) and of the word that follows it (y).
for sample_idx, (window, target) in enumerate(zip(input_words, next_word)):
  for position, token in enumerate(window):
    x[sample_idx, position, unique_token_index[token]] = True
  y[sample_idx, unique_token_index[target]] = True

In [97]:
# Two stacked LSTM layers followed by a softmax over the vocabulary.
# The first LSTM returns its full output sequence so the second LSTM receives
# one vector per time step; the second returns only its final output.
model = Sequential([
  LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
  LSTM(128),
  Dense(len(unique_tokens)),
  Activation("softmax"),
])

In [59]:
# Configure training: categorical cross-entropy against the one-hot targets,
# optimized with RMSprop, tracking accuracy.
model.compile(
  loss="categorical_crossentropy",
  optimizer=RMSprop(learning_rate=0.01),
  metrics=["accuracy"],
)
# Train for 30 shuffled epochs in mini-batches of 128. fit() is the last
# expression of the cell, so its History object is what the notebook displays.
model.fit(
  x,
  y,
  batch_size=128,
  epochs=30,
  shuffle=True,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x79d7161734c0>

In [117]:
# Persist the trained model to pretrained_model/mymodel.h5 (relative to the
# current working directory) so it can be reloaded without retraining.
model.save("pretrained_model/mymodel.h5")



In [146]:
# Reload the saved model from disk, replacing the in-memory `model`.
model = load_model("pretrained_model/mymodel.h5")



In [110]:
from posixpath import split
def predict_next_word(input_text, n_best):
  """Return the vocabulary ids of the most likely next words.

  The text is lower-cased and tokenized with the same ``tokenizer`` that built
  the training vocabulary, one-hot encoded into an array of shape
  ``(1, n_words, len(unique_tokens))`` and passed to ``model``.

  Args:
    input_text: Context text. Only its last ``n_words`` tokens are used.
      Tokens that are not in the vocabulary are encoded as an all-zero row.
    n_best: Number of candidates to return. Must be at least 1; values larger
      than the vocabulary size are capped at the vocabulary size.

  Returns:
    1-D NumPy array of indices into ``unique_tokens``, ordered from the most
    to the least probable candidate.

  Raises:
    ValueError: If ``n_best`` is smaller than 1.
  """
  if n_best < 1:
    raise ValueError("n_best must be at least 1")

  # Tokenize exactly like the training data so punctuation is handled the same
  # way ("don't" -> "don", "t"), and keep only the most recent n_words tokens
  # so the encoding can never index past the end of the window.
  context = tokenizer.tokenize(input_text.lower())[-n_words:]

  x = np.zeros((1, n_words, len(unique_tokens)))
  for i, word in enumerate(context):
    idx = unique_token_index.get(word)
    # Unknown words keep an all-zero row instead of raising KeyError.
    if idx is not None:
      x[0, i, idx] = 1

  # verbose=0 silences the per-call progress bar, which otherwise floods the
  # output when this function is called in a loop.
  prediction = model.predict(x, verbose=0)[0]

  n_best = min(n_best, len(prediction))
  top = np.argpartition(prediction, -n_best)[-n_best:]
  # argpartition leaves the selected entries in arbitrary order; rank them by
  # predicted probability, highest first.
  return top[np.argsort(prediction[top])[::-1]]


In [149]:
# Ask the model for 5 candidate next-word ids after a short prompt.
possible = predict_next_word("He will run because he", 5)



In [150]:
# Show the returned vocabulary ids.
possible

array([2904, 1041, 1835, 3661, 3094])

In [151]:
# Translate the candidate ids back into their words.
print([unique_tokens[idx] for idx in possible])

['screengrabs', 'doesn', 'label', 'work', 'spheres']


In [114]:
def generate_text(input_text, text_length,creativity=3):
  """Extend ``input_text`` with ``text_length`` generated words.

  At every step the most recent ``n_words`` tokens of the running text are
  passed to ``predict_next_word`` and the next word is drawn uniformly at
  random from the ``creativity`` candidates it returns.

  Args:
    input_text: Seed text. Its whitespace-separated words start the result
      unchanged.
    text_length: Number of words to append.
    creativity: Number of top candidates to sample from at each step; larger
      values give more varied output.

  Returns:
    The seed words followed by the generated words, joined by single spaces.
  """
  word_sequence = input_text.split()
  # Tokenize the seed once. Generated words come straight from the vocabulary,
  # so they are already single lower-case tokens and can be appended as-is
  # instead of re-tokenizing the whole text on every step.
  context_tokens = tokenizer.tokenize(input_text.lower())

  for _ in range(text_length):
    # Always condition on the latest n_words tokens. A window anchored at the
    # start of the text would ignore the newest words for long seeds and never
    # grow to full size for short ones.
    sub_sequence = " ".join(context_tokens[-n_words:])
    try:
      choice = unique_tokens[random.choice(predict_next_word(sub_sequence, creativity))]
    except (KeyError, IndexError, ValueError):
      # Best-effort fallback when the context cannot be encoded or the
      # requested number of candidates is invalid: pick any vocabulary word.
      # Other errors (e.g. a missing model) are deliberately not swallowed.
      choice = random.choice(unique_tokens)

    word_sequence.append(choice)
    context_tokens.append(choice)
  return " ".join(word_sequence)

In [122]:
# Generate 10 additional words from a seed sentence, using the default creativity of 3.
generate_text("I don't know what to do when ", 10)



"I don't know what to do when highlighted afghanistan labeling reportedly committing pt machines pt arrangement arrangement"

In [144]:
# Reload the saved model and request 5 candidate next-word ids for a 10-word prompt.
# NOTE(review): "posible" is a misspelling of "possible"; the following print
# cell reads the variable under this spelling, so rename both together.
model = load_model("pretrained_model/mymodel.h5")
posible = predict_next_word("He have to run to his room and he want", 5)






In [145]:
# Translate the candidate ids back into their words.
print([unique_tokens[idx] for idx in posible])

['52', 'uniforms', 'amongst', 'classroom', 'officers']


In [157]:
# Generate 10 additional words, sampling from 20 candidates at each step.
# NOTE(review): the prompt contains the misspelling "posible", which is
# presumably not in the vocabulary — verify, since generate_text falls back to a
# random word whenever the lookup fails.
generate_text("i want to see what is posible in this world", 10, 20)



'i want to see what is posible in this world community unusually intangible facts teammates staffers 227 10 lesser targeting'