# **Imports**

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import random
import pickle

from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Activation
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.models import load_model





# **Read csv and extract relevant text**

In [None]:
# Load the news dataset from a CSV file given by a relative path.
text_df = pd.read_csv(filepath_or_buffer='fake_or_real_news.csv')

In [None]:
# Display the loaded frame to check its columns (id, title, text, label).
text_df

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [None]:
# Collect the article bodies from the `text` column into a plain Python list.
text = text_df['text'].tolist()

In [None]:
# Concatenate every article into one long string, separated by single spaces.
joined_text = " ".join(text)

In [None]:
# Work with only the first 100,000 characters of the combined text.
partial_text = joined_text[:100_000]

# **Tokenising Words**

In [None]:
# Tokenizer that extracts runs of word characters (letters, digits, underscore).
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [None]:
# Lower-case the text before tokenising so tokens are case-insensitive.
tokens = tokenizer.tokenize(text=partial_text.lower())

In [None]:
# Sorted array of the distinct tokens; a token's position is its vocabulary id.
unique_tokens = np.unique(np.asarray(tokens))

In [None]:
# Display the vocabulary array (NumPy abbreviates long arrays in the output).
unique_tokens

array(['0', '000', '1', ..., 'zarif', 'zero', 'zhou'], dtype='<U18')

# **Mapping tokens to indices**

In [None]:
# Reverse lookup: token string -> its position in `unique_tokens`.
token_with_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [None]:
# Display the token -> index mapping (prints every entry, so the output is long).
token_with_index

{'0': 0,
 '000': 1,
 '1': 2,
 '10': 3,
 '100': 4,
 '106': 5,
 '10th': 6,
 '11': 7,
 '12': 8,
 '12pm': 9,
 '13': 10,
 '14': 11,
 '15': 12,
 '16': 13,
 '160': 14,
 '17': 15,
 '187': 16,
 '19': 17,
 '1939': 18,
 '1960': 19,
 '197': 20,
 '1971': 21,
 '1972': 22,
 '1983': 23,
 '1984': 24,
 '1985': 25,
 '1990s': 26,
 '1992': 27,
 '1994': 28,
 '1996': 29,
 '1998': 30,
 '1999': 31,
 '2': 32,
 '20': 33,
 '200': 34,
 '2000': 35,
 '20001': 36,
 '2001': 37,
 '2002': 38,
 '2003': 39,
 '2004': 40,
 '2006': 41,
 '2007': 42,
 '2008': 43,
 '2009': 44,
 '2010': 45,
 '2011': 46,
 '2012': 47,
 '2013': 48,
 '2014': 49,
 '2015': 50,
 '2016': 51,
 '2020': 52,
 '21': 53,
 '2117': 54,
 '21st': 55,
 '21wire': 56,
 '22': 57,
 '227': 58,
 '23': 59,
 '24': 60,
 '241': 61,
 '27': 62,
 '2nd': 63,
 '3': 64,
 '30': 65,
 '300': 66,
 '31': 67,
 '33': 68,
 '3pm': 69,
 '4': 70,
 '40': 71,
 '400': 72,
 '42': 73,
 '49': 74,
 '5': 75,
 '50': 76,
 '500': 77,
 '51': 78,
 '52': 79,
 '57': 80,
 '5pm': 81,
 '6': 82,
 '60': 83,
 '

# **Splitting into X and y**

In [None]:
# Build (context window -> next word) training pairs.
# The windows must slide over `tokens` (the corpus in reading order). Slicing
# `unique_tokens` instead would walk the sorted vocabulary, so every sample
# would just be ten alphabetically consecutive words followed by the eleventh.
n_words = 10  # number of context words per training sample
input_words = []
next_words = []

for i in range(len(tokens) - n_words):
  input_words.append(tokens[i:i + n_words])  # context: n_words consecutive tokens
  next_words.append(tokens[i + n_words])     # target: the token right after them

In [None]:
# Allocate the one-hot tensors, initially all False:
#   X -> (number of samples, n_words, vocabulary size)
#   y -> (number of samples, vocabulary size)
X = np.zeros(shape=(len(input_words), n_words, len(unique_tokens)), dtype='bool')
y = np.zeros(shape=(len(next_words), len(unique_tokens)), dtype='bool')

In [None]:
# One-hot encode every sample: set the vocabulary slot of each context word in X
# and the slot of the following word in y.
for sample, (window, target) in enumerate(zip(input_words, next_words)):
  for position, token in enumerate(window):
    X[sample, position, token_with_index[token]] = 1
  y[sample, token_with_index[target]] = 1


In [None]:
# Display both tensors to eyeball the one-hot encoding.
X, y

(array([[[ True, False, False, ..., False, False, False],
         [False,  True, False, ..., False, False, False],
         [False, False,  True, ..., False, False, False],
         ...,
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False]],
 
        [[False,  True, False, ..., False, False, False],
         [False, False,  True, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         ...,
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False]],
 
        [[False, False,  True, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         [False, False, False, ..., False, False, False],
         ...,
         [False, False, False, ..., False, False, False],
         [False, False, 

# **Model Compilation and Training**

In [None]:
# Convert the boolean one-hot arrays to float32 before training.
X = X.astype(dtype='float32')
y = y.astype(dtype='float32')

In [None]:
# Network: two stacked bidirectional LSTM layers, each followed by dropout,
# ending in a softmax over the vocabulary.
#  - Each Bidirectional wrapper runs its LSTM over the input window in both
#    directions and combines the two results.
#  - Dropout(0.2) randomly zeroes 20% of the activations during training to
#    reduce overfitting.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(len(unique_tokens), activation='softmax'))


In [None]:
# One-hot targets with a softmax output -> categorical cross-entropy loss.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)


In [None]:
# Train for 35 epochs in shuffled mini-batches of 64 samples.
history = model.fit(x=X, y=y, batch_size=64, epochs=35, shuffle=True)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [None]:
# Write the trained model to disk.
model.save(filepath='nextword.h5')


  saving_api.save_model(


In [None]:
# Read the saved model back from disk.
model = load_model(filepath='nextword.h5')

# **Model Predictions**

In [None]:
def predict_next_word(input_words, n_best):
  """Return the vocabulary indices of the most likely next words.

  Args:
    input_words: Context text. It is lower-cased and split with the same
      `tokenizer` that produced the vocabulary, so punctuation is handled the
      way it was for the training tokens. Only the last `n_words` tokens are
      used; tokens missing from the vocabulary are left as all-zero rows.
    n_best: Number of candidates to return (clamped to the range
      0..vocabulary size).

  Returns:
    A 1-D integer array of indices into `unique_tokens`, ordered from the most
    to the least probable candidate.
  """
  # Keep only the most recent n_words tokens so the encoding never overflows
  # the (1, n_words, vocabulary) input tensor.
  context = tokenizer.tokenize(input_words.lower())[-n_words:]

  # Contexts shorter than n_words are left-aligned; the remaining rows stay zero.
  encoded = np.zeros((1, n_words, len(unique_tokens)))
  for i, word in enumerate(context):
    index = token_with_index.get(word)
    if index is not None:  # skip out-of-vocabulary words instead of raising
      encoded[0, i, index] = 1

  predictions = model.predict(encoded)[0]

  # Clamp n_best: `[-0:]` would return the whole array, and argpartition
  # rejects a k larger than the array.
  n_best = max(0, min(int(n_best), len(predictions)))
  if n_best == 0:
    return np.array([], dtype=int)

  # argpartition selects the top n_best in arbitrary order; sort just those.
  top = np.argpartition(predictions, -n_best)[-n_best:]
  return top[np.argsort(predictions[top])[::-1]]

In [None]:
def generate_text(input_words, text_len, choice=3):
  """Extend a seed phrase by repeatedly sampling a predicted next word.

  Args:
    input_words: Seed text; it is lower-cased and split on whitespace.
    text_len: Number of words to append to the seed.
    choice: How many top candidates from `predict_next_word` to sample from
      at each step.

  Returns:
    The lower-cased seed followed by the generated words, joined by spaces.
  """
  words = input_words.lower().split()
  # Tokenise the seed once. Every generated word comes from the vocabulary, so
  # it is already a single token and can be appended without re-tokenising.
  context = tokenizer.tokenize(" ".join(words))

  for _ in range(text_len):
    # Always condition on the most recent n_words tokens, whatever the seed
    # length (a fixed start offset is only right for seeds of exactly n_words).
    sub_seq = " ".join(context[-n_words:])
    try:
      candidates = list(predict_next_word(sub_seq, choice))
      word = str(unique_tokens[random.choice(candidates)])
    except (KeyError, IndexError, ValueError):
      # Best-effort fallback when no candidate can be produced for this
      # context; other errors (e.g. from the model) are left to propagate.
      word = str(random.choice(list(unique_tokens)))
    words.append(word)
    context.append(word)

  return " ".join(words)


In [None]:
# Ask for the five most likely continuations of a ten-word prompt.
possible = predict_next_word(
  input_words='He came back at the house with his family at',
  n_best=5,
)



In [None]:
# Translate each predicted vocabulary index back into its word.
for token_id in possible:
  print(unique_tokens[token_id])

stories
museum
peddle
ending
kindertransports
