# Fake news detection

In [None]:
# Importing standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns

# Train / Test split
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

#!pip install wordcloud
from wordcloud import WordCloud,STOPWORDS

# Import the natural language toolkit library 
#!pip install nltk
import nltk
#nltk.download("punkt")
#nltk.download("stopwords")
from nltk.corpus import stopwords

# Text tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Importing Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Loading the model and plotting its architecture.
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import plot_model

# Plotting the confusion matrix
from sklearn.metrics import confusion_matrix

**Importing the ISOT dataset files**

In [None]:
%matplotlib inline

# Load both halves of the ISOT corpus: reliable articles and fake ones.
d_true, d_fake = (
    pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv"),
    pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv"),
)

### 1. EDA & DATA PREPROCESSING

In [None]:
# Peek at the first rows of the frame loaded from True.csv.
d_true.head()

In [None]:
# Summary statistics, as reported by DataFrame.describe(), for the True.csv frame.
d_true.describe()

In [None]:
# Peek at the first rows of the frame loaded from Fake.csv.
d_fake.head()

In [None]:
# Summary statistics, as reported by DataFrame.describe(), for the Fake.csv frame.
d_fake.describe()

In [None]:
# Tag every row with its class: 1 marks reliable news, 0 marks fake news.

d_true["label"], d_fake["label"] = 1, 0

**True Dataset's WordCloud**

In [None]:
# Word cloud of the reliable articles, capped at the 500 most frequent words.

plt.figure(figsize=(15, 15))
wc = WordCloud(
    max_words=500,
    width=1000,
    height=500,
    background_color="rgba(255, 255, 255, 0)",
    stopwords=STOPWORDS,
)
# generate() is fed every article body joined into one long string.
wc = wc.generate(" ".join(d_true.text))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
#plt.savefig("../True dataset's world cloud.png", bbox_inches='tight')

**Fake Dataset's WordCloud**

In [None]:
# Plot the fake dataset's word cloud using the top 500 words.
# All fake article bodies are joined into a single string before being handed
# to WordCloud; common English stopwords (STOPWORDS) are excluded.

plt.figure(figsize = (15,15))
wc = WordCloud(max_words = 500 , width = 1000 , height = 500 , background_color="rgba(255, 255, 255, 0)", stopwords = STOPWORDS).generate(" ".join(d_fake.text))
# Same bilinear interpolation as the true-news cloud so both figures render alike.
plt.imshow(wc , interpolation = 'bilinear')
plt.axis('off')
#plt.savefig("../Fake dataset's world cloud.png", bbox_inches='tight')

In [None]:
# Prepend each headline to its article body, separated by a single space.

d_true['text'] = d_true['title'].str.cat(d_true['text'], sep=" ")
d_fake['text'] = d_fake['title'].str.cat(d_fake['text'], sep=" ")

In [None]:
# Only the text and its label are kept; the remaining columns are removed in place.

unused_columns = ["title", "subject", "date"]
d_true.drop(columns=unused_columns, inplace=True)
d_fake.drop(columns=unused_columns, inplace=True)

In [None]:
# Stack the reliable and fake frames into one frame, then shuffle the rows so
# the two classes are interleaved. The shuffle is seeded (same seed as the
# train/test split) so that Restart & Run All reproduces the same row order.

data = pd.concat([d_true, d_fake], axis=0, ignore_index = True)
data = shuffle(data, random_state=25)

# Discard the pre-shuffle index so rows are numbered 0..n-1 in their new order.
data = data.reset_index(drop= True)
data.head()

In [None]:
# Count the missing values in each column.

data.isna().sum()

In [None]:
# Dimensions of the combined frame as a (rows, columns) tuple.
data.shape

In [None]:
# Bar chart of how many articles fall into each class.

data["label"].value_counts().plot.bar(color=['b', 'g'])

In [None]:
# Article count per class (label 0 = fake, label 1 = true).

data["label"].value_counts()

In [None]:
# Side-by-side histograms of the whitespace-separated word count per article.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 10))

# Left panel: articles labelled 0 (fake).
text_len = data.loc[data['label'] == 0, 'text'].str.split().map(len)
ax1.hist(text_len, color='SkyBlue')
ax1.set_title('Fake news texts')

# Right panel: articles labelled 1 (real).
text_len = data.loc[data['label'] == 1, 'text'].str.split().map(len)
ax2.hist(text_len, color='PeachPuff')
ax2.set_title('Real news texts')

fig.suptitle('Number of Words in texts')
plt.show()

#### TEXT PROCESSING WITH THE NLTK LIBRARY

In [None]:
# Text-cleaning helper built on nltk: lowercasing, tokenising and filtering.
stop_words = set(stopwords.words('english'))
def process(text):
    """Return `text` lowercased, without stopwords or non-alphanumeric tokens.

    The lowercased text is split with nltk's word tokenizer. A token is kept
    only when it is alphanumeric and absent from `stop_words`; the surviving
    tokens are joined back together with single spaces.
    """
    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        token for token in tokens if token.isalnum() and token not in stop_words
    )

In [None]:
# Clean the text and separate the target (label) from the feature (text).
# The shuffle is seeded (same seed as the train/test split) so the row order,
# and therefore every downstream result, is reproducible on a fresh kernel.
data = shuffle(data, random_state=25)

# Apply the nltk-based cleaning function to every article.
data['text'] = data['text'].apply(process)
# X and Y are single-column DataFrames that share data's (shuffled) index.
X = data['text'].to_frame()
Y = data['label'].to_frame()

In [None]:
# Histogram of per-article word counts once the text has been cleaned.
text_len = X['text'].str.split().map(len)
plt.hist(text_len, color='SkyBlue')
plt.title('number of words')

In [None]:
# Two corpus statistics reused further down:
#   Avg_len    - mean word count per text, rounded; used to set the sequence length.
#   Vocab_size - number of distinct words; handed to the tensorflow tokenizer.

Avg_len = round(text_len.mean())

# Accumulate distinct words directly in a set, one article at a time.
lst = set()
for article in X['text']:
    lst.update(article.split())
Vocab_size = len(lst)

print("the average number of words in the texts is : ", Avg_len)
print("the texts contains", Vocab_size, "unique words")

### 2. TEXT TOKENIZATION

In [None]:
# Fit a Keras tokenizer on the cleaned texts and turn each text into a list of
# integer word ids.
# Keras keeps only the (num_words - 1) most frequent words, because id 0 is
# reserved. Passing Vocab_size + 1 therefore retains the whole vocabulary
# instead of silently dropping the rarest word.
tokenizer = Tokenizer(num_words=Vocab_size + 1)
tokenizer.fit_on_texts(X['text'])
sequences = tokenizer.texts_to_sequences(X['text'])

# word -> integer id mapping learned by the tokenizer (ids start at 1).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every token sequence to the same length: the average word count plus 2.
# Shorter sequences are padded at the end and longer ones are cut at the end.

pad_options = dict(maxlen=Avg_len + 2, padding='post', truncating='post')
data = pad_sequences(sequences, **pad_options)

### 3. Splitting the data into train / test sets

In [None]:
# Hold out 25% of the padded sequences (and their labels) for testing; the
# split is seeded so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data, Y, random_state=25, test_size=0.25
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

### 4. Using GloVe for Word Embeddings

In [None]:
# Load pre-trained GloVe word vectors and build the embedding matrix.
# GloVe is an unsupervised learning algorithm for obtaining vector
# representations for words.

# word -> float32 vector, parsed from the GloVe text file. Each line holds the
# word followed by its coefficients, separated by whitespace.
embeddings_index = {}
with open("/kaggle/input/glove6b100dtxt/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# Length of the last parsed vector, i.e. the embedding dimension.
print(len(coefs))

# One row per tokenizer id, plus row 0 for the reserved padding id. The matrix
# is sized from word_index, the same mapping that supplies the row indices
# below and the Embedding layer's input size, so the shapes always agree.
embeddings_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words that are missing from GloVe keep their all-zero row.
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector


In [None]:
# Show the embedding matrix dimensions as (rows, vector length).
print(embeddings_matrix.shape)

### 5. Building the architecture of the model

In [None]:
# Network definition: frozen pre-trained embedding -> bidirectional LSTM ->
# small dense layer -> single sigmoid unit.

model = tf.keras.Sequential()
# Embedding weights come from embeddings_matrix and are not updated in training.
model.add(
    tf.keras.layers.Embedding(
        len(word_index)+1, 100, weights=[embeddings_matrix], trainable=False
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

### 6. Model training

In [None]:
# Compile and train the model.
# An early-stopping callback halts training once the validation loss has not
# improved for 2 consecutive epochs and restores the best weights seen.

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# The model's final Dense layer already applies a sigmoid, so its outputs are
# probabilities, not logits. from_logits must therefore be False; with True the
# loss would apply a second sigmoid on top of the probabilities.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

# 10% of the training data is held out for validation (feeds val_loss above).
history = model.fit(X_train, Y_train, epochs=10, validation_split=0.1, batch_size=32, shuffle=True, callbacks=[early_stop])

##### Save the model

In [None]:
# Persist the trained model to an HDF5 file.
# NOTE(review): confirm that /kaggle/output exists and is writable in the
# target environment before relying on this path.
model.save("/kaggle/output/model1.h5")

In [None]:
# Render the layer graph, annotated with tensor shapes, to a PNG file.

plot_model(model, '/kaggle/output/model_schema.png', show_shapes=True)

### 7. Model Evaluation

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(x=X_test, y=Y_test)

In [None]:
# Turn the model outputs for the test set into 0/1 labels with a 0.5 threshold.
Y_prob = model.predict(X_test)
Y_pred = (Y_prob >= 0.5).astype("int")

In [None]:
# Accuracy, precision and recall on the test set, computed with scikit-learn.

accuracy = accuracy_score(list(Y_test['label']), Y_pred)
precision, recall = precision_score(Y_test, Y_pred), recall_score(Y_test, Y_pred)

for caption, value in (
    ("Model Accuracy : ", accuracy),
    ('Precision on testing set:', precision),
    ('Recall on testing set:', recall),
):
    print(caption, value)

In [None]:
# Plot the confusion matrix with seaborn and save it as a PNG.
# In scikit-learn's confusion_matrix, rows are the true labels and columns are
# the predicted labels.

graph = confusion_matrix(Y_test, Y_pred)
plt.figure(figsize=(12, 10))
ax= plt.subplot()
# fmt='d' prints the cell counts as plain integers; seaborn's default format
# ('.2g') would show large counts in scientific notation (e.g. 5.3e+03).
sns.heatmap(graph, annot=True, fmt='d', ax = ax)
ax.xaxis.set_ticklabels(['Fake','True'], size=15)
ax.yaxis.set_ticklabels(['Fake','True'], size=15)
plt.savefig("../Confusion_matrix.png", bbox_inches='tight')