In [None]:
import os
import pandas as pd
import numpy as np
import re
import random
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from keras.layers import Dense, Embedding, Dropout, GlobalAveragePooling1D
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, SGD, RMSprop

In [None]:
# Single source of truth for every seeded operation in this notebook.
SEED = 42

# Seed the stdlib and NumPy global generators so random sampling cells are
# repeatable across "Restart & Run All". TensorFlow's own generator is not
# seeded here.
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the two news corpora (one CSV per class) in a single pass.
fake, true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    )
)

# The stop-word file is read without a header row, so its single column is
# labelled 0.
stopwords = pd.read_csv('../input/stopwords/stopwords.txt', header=None)

In [None]:
# Flatten the one-column frame into a plain list of words.
# The isinstance guard keeps the cell idempotent: on a second run `stopwords`
# is already a list, and indexing it with [0] would silently replace the
# whole list with the characters of its first word.
if isinstance(stopwords, pd.DataFrame):
    stopwords = list(stopwords[0])

In [None]:
# Binary target: 0 marks rows from the fake corpus, 1 rows from the true one.
fake['label'], true['label'] = 0, 1

In [None]:
# Columns that are not used as model input.
drop = ['subject', 'date']

# Re-bind instead of mutating in place, and ignore already-missing columns,
# so re-running this cell does not raise KeyError.
fake = fake.drop(columns=drop, errors='ignore')
true = true.drop(columns=drop, errors='ignore')

In [None]:
# Stack both corpora into one frame with a fresh 0..n-1 index.
data = pd.concat(
    [fake, true],
    sort=False,
    ignore_index=True,
)

In [None]:
# Preview the first five rows of the combined frame.
data.head(n=5)

In [None]:
# Features are the raw article text; the target is the 0/1 label column.
X, y = data['text'], data['label']

In [None]:
def normalize(text):
    """Replace noise in ``text`` with single spaces.

    Each match of the pattern below becomes one space: @mentions, #hashtags,
    runs of non-ASCII characters, individual digits, ``scheme://...`` URLs,
    any single character that is neither a word character nor whitespace,
    and runs of whitespace.
    """
    noise = re.compile(r'(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^\x00-\x7F]+)|([0-9])|(\w+:\/\/\S+)|([^\w\s])|(\s+)')
    return noise.sub(' ', text)

# Run the noise-stripping step over every document.
X = X.map(normalize)

In [None]:
# Lower-case every document so later token comparisons are case-insensitive.
X = X.str.lower()

In [None]:
def filter_length(text, max_len=3):
    """Delete every whole word of at most ``max_len`` characters from ``text``.

    Only the word characters are removed; the whitespace around a deleted
    word is left in place.

    Parameters
    ----------
    text : str
        Input string.
    max_len : int, default 3
        Longest word length that is still removed. Values below 1 leave
        ``text`` unchanged.

    Returns
    -------
    str
        ``text`` with the short words removed.
    """
    if max_len < 1:
        return text
    # Require at least one word character so the pattern never produces
    # zero-width matches at word boundaries.
    return re.sub(r'\b\w{1,%d}\b' % max_len, '', text)

# Drop the short words from every document.
X = X.map(filter_length)

In [None]:
def whitespace_LT(text):
    """Return ``text`` without its leading and trailing whitespace."""
    trimmed = text.strip()
    return trimmed

# Trim both ends of every document.
X = X.map(whitespace_LT)

In [None]:
def multispace(text):
    """Collapse each run of whitespace in ``text`` into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Squeeze repeated whitespace in every document.
X = X.map(multispace)

In [None]:
# Show one randomly chosen cleaned document as a sanity check.
# randrange excludes len(X), whereas randint(0, len(X)) could return len(X)
# and fail the lookup. .iloc makes the lookup positional, so it does not
# depend on the index labels.
X.iloc[random.randrange(len(X))]

In [None]:
from nltk.tokenize import word_tokenize

def word_tokenize_wrapper(text):
    """Return the result of NLTK's ``word_tokenize`` applied to ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Turn every document into its token sequence.
X = X.map(word_tokenize_wrapper)

In [None]:
def sw_removal(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words, in order.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stopwords``
        collection is used.

    Returns
    -------
    list of str
        The surviving tokens.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build a set once per call: membership tests become O(1) instead of a
    # full scan of the stop-word list for every token.
    stop_set = set(stop_words)
    return [word for word in words if word not in stop_set]

# Filter the stop words out of every token list.
X = X.map(sw_removal)

In [None]:
# Display the series for a visual check after stop-word removal.
X

In [None]:
import ast
def join(texts):
    """Concatenate the strings in ``texts`` with a single space between them."""
    return ' '.join(texts)

# Re-assemble every token list into one space-separated string.
X = X.map(join)

In [None]:
# Display the series for a visual check after re-joining the tokens.
X

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Hold out one third of the documents, keeping the label ratio the same in
# both parts; the shuffle is fixed by SEED.
Xt, Xv, yt, yv = train_test_split(
    X,
    y,
    test_size=1/3,
    shuffle=True,
    stratify=y,
    random_state=SEED,
)

In [None]:
# Learn the vocabulary from the training documents only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Xt)
sequences = tokenizer.texts_to_sequences(Xt)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# Encode the held-out documents with the SAME tokenizer. A tokenizer fitted
# on Xv assigns its own word -> id mapping, so its ids would point at
# unrelated (or out-of-range) rows of the embedding trained on `word_index`.
test_sequences = tokenizer.texts_to_sequences(Xv)

# A tokenizer fitted on the held-out documents is kept only so that
# test_word_index / test_vocab_size stay available for inspection; it must
# not be used to encode model input.
test_tokens = Tokenizer()
test_tokens.fit_on_texts(Xv)
test_word_index = test_tokens.word_index
test_vocab_size = len(test_word_index)

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Right-pad the training sequences to the longest training document.
Xt = pad_sequences(sequences, padding='post')

# Pad (or truncate) the held-out sequences to the same width as the training
# matrix. The model averages over the whole time axis, so a different amount
# of padding would change the scale of the pooled features between training
# and evaluation.
Xv = pad_sequences(test_sequences, padding='post', maxlen=Xt.shape[1])

In [None]:
# Stop once the validation loss has not improved for 4 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=4, mode='min', verbose=1)

# Embedding -> mean over the sequence axis -> dense stack -> sigmoid.
# The input dimension is vocab_size + 1 because tokenizer ids start at 1 and
# id 0 is used for padding.
# NOTE(review): the Dense(32)/Dense(64)/Dense(128) layers are given no
# activation argument, so they appear to be plain linear maps - confirm that
# this is intended.
model = Sequential([
    Embedding(vocab_size+1, 256),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dense(32),
    Dense(64),
    Dense(128),
    Dropout(.2),
    Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by current Keras releases.
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam(learning_rate=0.0005))
model.summary()

In [None]:
# Train for at most 32 epochs; 10% of the training matrix is held back for
# the validation loss that the early-stopping callback monitors.
history = model.fit(
    Xt,
    yt,
    batch_size=128,
    epochs=32,
    validation_split=.1,
    callbacks=[es],
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-epoch accuracy on the training data and on the validation split.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
# The second curve comes from validation_split, not from the held-out test
# set, so it is labelled accordingly.
plt.legend(['train', 'validation'])
plt.show()

In [None]:
# Per-epoch loss on the training data and on the validation split.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('loss')
plt.xlabel('epoch')
plt.ylabel('loss')
# The second curve comes from validation_split, not from the held-out test
# set, so it is labelled accordingly.
plt.legend(['train', 'validation'])
plt.show()

In [None]:
# Loss and accuracy on the held-out third of the data.
model.evaluate(x=Xv, y=yv)

In [None]:
# Sequential.predict_classes no longer exists in current Keras. For a single
# sigmoid output it was equivalent to thresholding the predicted probability
# at 0.5, which is done explicitly here. The result keeps the (n, 1) shape.
y_pred = (model.predict(Xv) > 0.5).astype('int32')

In [None]:
# Flatten the predictions into a plain list of ints. ravel() accepts both
# the (n, 1) model output and an already-flat list, so re-running this cell
# does not fail.
y_pred = np.asarray(y_pred).ravel().tolist()

In [None]:
# Report each classification metric on the held-out set, one per line.
for template, metric in (
    ('Accuracy  : %.4f', accuracy_score),
    ('F1 Score  : %.4f', f1_score),
    ('Precision : %.4f', precision_score),
    ('Recall    : %.4f', recall_score),
):
    print(template % metric(yv, y_pred))