In [None]:
# Supress warnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Reading the datasets
import pandas as pd

# Load the real and the fake news articles, in that order
df_real, df_fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/fake-and-real-news-dataset/True.csv",
        "../input/fake-and-real-news-dataset/Fake.csv",
    )
)

# Preview both frames
display(df_real, df_fake)

In [None]:
# Information about the datasets

# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in display() only added a stray "None" to the cell output.
df_real.info()
df_fake.info()

In [None]:
# Dropping publishers

def remove_publisher(text):
    """Strip everything up to and including the first " - " separator.

    Args:
        text: Article text as a string.

    Returns:
        The part of ``text`` after the first " - "; ``text`` unchanged when
        the separator does not occur.
    """
    _prefix, separator, remainder = text.partition(" - ")
    return remainder if separator else text

# Strip the leading "<publisher> - " part from every article body.
# NOTE(review): remove_publisher cuts at the first " - " wherever it occurs;
# presumably only some articles carry a publisher prefix, so confirm that no
# genuine leading content is being discarded (especially in df_fake).
for frame in (df_real, df_fake):
    frame["text"] = frame["text"].apply(remove_publisher)

display(df_real, df_fake)

In [None]:
# Remove html
from bs4 import BeautifulSoup

def remove_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Remove punctuation
import string

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Args:
        text: Input string.

    Returns:
        A new string containing the remaining characters in their original
        order.
    """
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

In [None]:
# Tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"\w+")
# Private alias used by word_tokenizer(). The module-level name `tokenizer`
# is rebound to a Keras Tokenizer further down in this notebook; reading the
# global at call time would make word_tokenizer() fail if the cleaning cells
# are re-run after that point.
_word_regex_tokenizer = tokenizer


def word_tokenizer(text):
    """Lower-case ``text`` and split it into ``\\w+`` tokens.

    Args:
        text: Input string.

    Returns:
        List of lower-cased word tokens produced by the RegexpTokenizer.
    """
    return _word_regex_tokenizer.tokenize(text.lower())

In [None]:
# Remove stopwords
from nltk.corpus import stopwords

# Lazily-built cache of the NLTK English stop-word list
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Filter stop words out of a token sequence.

    Args:
        text: Iterable of word tokens.
        stop_words: Optional iterable of words to drop. Defaults to the NLTK
            English stop-word list.

    Returns:
        List of the tokens that are not stop words, in their original order.
    """
    # The original rebuilt the full stop-word list for every single token;
    # resolve it once per call and use a set for constant-time membership tests.
    if stop_words is None:
        blocked = _english_stopwords()
    else:
        blocked = frozenset(stop_words)
    return [w for w in text if w not in blocked]

In [None]:
# Lemmatizing
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Lemmatize every token of ``text`` with the shared WordNetLemmatizer.

    Args:
        text: Iterable of word tokens.

    Returns:
        List with the lemmatized form of each token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, text))

In [None]:
# Data cleaning

def clean_data(text):
    """Run the full cleaning pipeline on one raw string.

    The steps are applied in order: HTML removal, punctuation removal,
    tokenization, stop-word removal and lemmatization.

    Args:
        text: Raw input string.

    Returns:
        The output of the last step (word_lemmatizer).
    """
    pipeline = (
        remove_html,
        remove_punctuation,
        word_tokenizer,
        remove_stopwords,
        word_lemmatizer,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean the title and the body of every article in both frames
for frame in (df_real, df_fake):
    for column in ("title", "text"):
        frame[column] = frame[column].apply(clean_data)

display(df_real, df_fake)

In [None]:
# Adding label and merging data

# Label the samples: 1 = real, 0 = fake
df_real["label"] = 1
df_fake["label"] = 0

# ignore_index=True gives every row of the merged frame a unique label.
# Both source frames carry their own index, so without it labels repeat and
# label-based operations (e.g. DataFrame.drop) hit rows from both frames.
data = pd.concat([df_real, df_fake], ignore_index=True)
print(data.shape)

display(data)

In [None]:
# Building corpus

# Flatten the title + text token lists of all samples into one token list
corpus = [token for tokens in data["title"] + data["text"] for token in tokens]

print(len(corpus))

In [None]:
# Building vocabulary

# Distinct tokens of the corpus
vocab = {*corpus}
print("%d unique words" % len(vocab))

In [None]:
# Calculate tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# Bag-of-words counts followed by tf-idf re-weighting.
# NOTE(review): `corpus` is a flat list of single tokens and CountVectorizer
# treats every list element as one document, so these scores are computed
# over one-token "documents" rather than whole articles - confirm intended.
count_vect = CountVectorizer(stop_words="english")
sf = count_vect.fit_transform(corpus)

tfidf_trans = TfidfTransformer()
transformed_weights = tfidf_trans.fit_transform(sf)

# Average weight of every term over all rows of the tf-idf matrix
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(count_vect, "get_feature_names_out"):
    feature_terms = count_vect.get_feature_names_out()
else:
    feature_terms = count_vect.get_feature_names()

df_weights = pd.DataFrame({"term": feature_terms, "weight": weights})
df_weights.sort_values(by="weight", ascending=False).head(10)

In [None]:
# Build word-tfidf dictionary

# Map every term to its averaged tf-idf weight
tfidf_score = dict(zip(df_weights["term"], df_weights["weight"]))

print(len(tfidf_score))

In [None]:
# Set mean tfidf score

# Mean of all term weights; used as the cut-off in drop_words()
mean_tfidf = df_weights.loc[:, "weight"].mean()
print("%.10f" % mean_tfidf)

In [None]:
# Function to drop words based on tfidf score

def drop_words(lst):
    """Keep only the tokens whose tf-idf score reaches the mean score.

    Args:
        lst: Iterable of word tokens.

    Returns:
        List of the tokens that are present in ``tfidf_score`` with a score
        of at least ``mean_tfidf``, in their original order.
    """
    return [
        token
        for token in lst
        if token in tfidf_score and tfidf_score[token] >= mean_tfidf
    ]

In [None]:
# Filter data

# Apply the tf-idf filter to the title and text token lists
for column in ("title", "text"):
    data[column] = data[column].apply(drop_words)

display(data)

In [None]:
# Length of each sample, measured as the character count of the string form
# of its combined title + text token list (not the number of words)

length = [len(str(tokens)) for tokens in data["title"] + data["text"]]
data["length"] = length
data.head()

In [None]:
# Get minimum, maximum, average length

print("Minimum sentence length = ", data["length"].min())
print("Maximum sentence length = ", data["length"].max())

# Rounded mean of the "length" column
avg_sent_len = round(sum(data["length"]) / len(data))
print("Average sentence length = ", avg_sent_len)

In [None]:
# How many samples have a "length" value below 500

print(int((data["length"] < 500).sum()))

In [None]:
# Drop the outliers

# Keep only the samples whose length is at least 500.
# A boolean mask selects rows by position, so the result is correct even
# when index labels are not unique; drop() by label removes every row that
# shares a label with a short sample.
data = data[data["length"] >= 500].copy()
print("Minimum sentence length = ", min(data["length"]))
print("Maximum sentence length = ", max(data["length"]))
avg_sent_len = round(sum(data["length"])/len(data["length"]))
print("Average sentence length = ", avg_sent_len)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the "length" column.
# distplot() is deprecated in seaborn; histplot(kde=True, stat="density") is
# its documented replacement for a normalised histogram with a KDE overlay.
fig, ax = plt.subplots()
sns.histplot(data["length"], kde=True, stat="density", ax=ax)
ax.set_xlim(1, 5000)
ax.set_title("Distribution of sample length")
plt.show()

In [None]:
# Dropping unnecessary features

# Concatenate the title and text token lists of each sample, then remove the
# columns that are not used from here on
data["text"] = data["title"] + data["text"]
data.drop(columns=["title", "subject", "date", "length"], inplace=True)

display(data)

In [None]:
# Corpus size and vocabulary size

# Flatten the merged token lists and collect the distinct tokens
corpus = [token for tokens in data["text"] for token in tokens]
vocab = set(corpus)

print("Corpus size: ", len(corpus))
print("Vocabulary size: ", len(vocab))

In [None]:
# Importing libraries and setting parameters

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Tokenizer settings
vocab_size = len(vocab)
oov_tok = "<OOV>"

# Sequence settings
padding_type = "post"
trunc_type = "post"
# NOTE(review): avg_sent_len is derived from len(str(tokens)), i.e. a character
# count of the token-list repr - presumably a token count is wanted here; confirm.
max_length = avg_sent_len

# Embedding width
embedding_dim = 100

In [None]:
# Train test split

from sklearn.model_selection import train_test_split
# Stratified 70/30 split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=data["label"],
)

for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

In [None]:
# Tokenize

# Fit the Keras tokenizer on the training split only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
vocab_size = len(word_index)

# padding_type was configured but never passed to pad_sequences, so padding
# silently fell back to the library default ("pre"); pass it explicitly.
tokenized_train = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(
    tokenized_train,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

tokenized_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(
    tokenized_test,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Load the pre-trained GloVe vectors and build the embedding matrix

embeddings_index = {}

# Read with an explicit encoding so parsing does not depend on the platform's
# default text encoding.
with open("../input/glove6b100d/glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        # Each line: the word followed by its vector components
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

# One row per tokenizer index (plus row 0); rows of words without a
# pre-trained vector stay all-zero.
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [None]:
# Make numpy arrays

# Convert all four splits to numpy arrays
x_train, y_train, x_test, y_test = map(
    np.array, (x_train, y_train, x_test, y_test)
)

In [None]:
# Build the model
import tensorflow as tf
 
# Frozen pre-trained embedding -> two stacked bidirectional LSTMs -> dense head
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        vocab_size+1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=False,
    )
)
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.2)
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.2)))
model.add(tf.keras.layers.Dense(64, activation="relu"))
# Single sigmoid unit for the binary label
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Stop when val_loss has not improved by at least 1e-3 for 5 epochs and
# restore the best weights seen
callback = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode="auto",
    restore_best_weights=True,
)
model.summary()

In [None]:
# Train the model

# NOTE(review): the test split is passed as validation data, so early stopping
# and best-weight restoration are driven by the test set - confirm intended.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    callbacks=[callback],
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``history`` mapping holds per-epoch values
            under the keys ``string`` and ``"val_" + string``.
        string: Name of the metric to plot.
    """
    val_key = "val_" + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
  
# One figure per tracked metric
for metric in ("accuracy", "loss"):
    plot_graphs(history, metric)