In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data viz
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, filename) for filename in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read both CSV files of the dataset; each becomes its own DataFrame
# (fake_df from Fake.csv, true_df from True.csv).
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fake-and-real-news-dataset/Fake.csv',
        '/kaggle/input/fake-and-real-news-dataset/True.csv',
    )
)

In [None]:
# Preview the first rows loaded from Fake.csv to check the available columns.
fake_df.head()

In [None]:
# Preview the first rows loaded from True.csv to check the available columns.
true_df.head()

In [None]:
# Add the binary target column to each source frame:
# rows from True.csv get label 1, rows from Fake.csv get label 0.
for frame, class_id in ((true_df, 1), (fake_df, 0)):
    frame['label'] = class_id

In [None]:
# Stack the two labelled frames (true rows first, then fake rows) into a single
# dataset; ignore_index=True discards the original indexes and renumbers the rows.
news_df = pd.concat([true_df,fake_df],ignore_index=True)

Reference: `pandas.concat` documentation — https://pandas.pydata.org/docs/reference/api/pandas.concat.html

In [None]:
# Preview the first rows of the combined frame.
news_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the combined frame.
news_df.info()

In [None]:
# Bar chart of how many rows carry each label value, to check class balance.
sns.set_theme(style='darkgrid')
fig, ax = plt.subplots()
sns.countplot(x='label', data=news_df, ax=ax)
ax.set_xlabel('Label (0 = False; 1 = True)')
plt.show()

In [None]:
# Fold subject, title and body into a single space-separated `text` column, then
# drop the columns that are no longer needed (including `date`, which is unused).
# NOTE(review): if `subject` values differ systematically between the True and
# Fake files, prepending it may leak the label into the features; confirm.
news_df = news_df.assign(
    text=news_df['subject'] + ' ' + news_df['title'] + ' ' + news_df['text']
).drop(columns=['subject', 'title', 'date'])

In [None]:
# Display the merged text of the row with index label 0 as a spot check.
news_df.text[0]

<h2>Text Pre-processing</h2>
<ul>
    <li> Convert letters to lowercase </li>
    <li> Remove punctuation </li>
    <li> Remove stopwords (commonly used words that do not contribute meaning)</li>
</ul>

In [None]:
import nltk
from nltk.corpus import stopwords

# Load the English stop-word list used by the text-cleaning step.
# The corpus is not bundled with the nltk package itself, so on a fresh
# environment the first lookup raises LookupError; fetch it once on demand
# so the notebook survives "Restart & Run All".
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
print(stop_words)

In [None]:
def text_cleaning(text, stopwords_to_remove=None):
    """Normalise a raw article string into a list of lowercase tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete URLs (``http://...``, ``https://...`` and ``www....``);
      3. replace every non-word character with a space;
      4. split on whitespace and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stopwords_to_remove : iterable of str, optional
        Words to filter out. Defaults to the notebook-level ``stop_words``
        list when omitted.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stopwords_to_remove is None:
        stopwords_to_remove = stop_words
    # A set gives O(1) membership tests instead of scanning a list per token.
    excluded = frozenset(stopwords_to_remove)

    text = text.lower()
    # `https?` matches both http and https; the previous pattern had a literal
    # '>' in place of '?', so URLs were never removed.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\W', ' ', text)

    # str.split() with no argument already collapses runs of whitespace.
    return [word for word in text.split() if word not in excluded]

In [None]:
# Sanity-check the cleaner on a single article: prints the resulting token list.
print(text_cleaning(news_df.text[0]))

In [None]:
# Replace every raw article string with its cleaned token list.
news_df['text'] = news_df['text'].apply(text_cleaning)

<h2> Create Model </h2>

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news_df['text'],
    news_df['label'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Vocabulary size and fixed sequence length shared by the tokenizer and the model.
max_vocab = 10000
max_len = 256

# Build the vocabulary from the training split only, so nothing from the test
# split influences the word index. `num_words` is tied to `max_vocab` (it was
# previously a separate hard-coded 10000) so the token ids always stay within
# the size later passed to the model as the Embedding `input_dim`.
tokenizer = Tokenizer(num_words=max_vocab, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert token lists to integer id sequences, then pad with zeros at the end
# (padding='post') up to `max_len`. Longer sequences are cut using the
# pad_sequences default truncation mode.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post', maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding='post', maxlen=max_len)

In [None]:
def create_model(max_vocab, max_len, embedding_dim=100, train_embeddings=True):
    """Build the (uncompiled) binary text classifier.

    Architecture: Embedding -> BiLSTM(64, returns the full sequence) ->
    BiLSTM(32) -> Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Parameters
    ----------
    max_vocab : int
        Size of the vocabulary; used as the Embedding ``input_dim``.
    max_len : int
        Length of each padded input sequence.
    embedding_dim : int, default 100
        Size of each embedding vector.
    train_embeddings : bool, default True
        Whether the embedding weights are updated during training. No
        pre-trained vectors are loaded here, so freezing the layer would
        leave it at its random initialisation; pass ``False`` only when
        pre-trained weights are set on the layer afterwards.

    Returns
    -------
    Sequential
        The assembled Keras model, ready to be compiled.
    """
    return Sequential([
        Embedding(
            input_dim=max_vocab,
            output_dim=embedding_dim,
            input_length=max_len,
            trainable=train_embeddings,
        ),
        # First recurrent layer emits one vector per timestep for the next LSTM.
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(32, activation='relu'),
        Dropout(0.2),
        # Single sigmoid unit: output is a score in [0, 1] for the positive class.
        Dense(1, activation="sigmoid")
    ])

In [None]:
# Instantiate the network with the tokenizer's vocabulary size and sequence
# length, then print a layer-by-layer overview.
tf_model = create_model(max_vocab=max_vocab, max_len=max_len)
tf_model.summary()

In [None]:
# Configure the model for binary classification and track accuracy.
tf_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 2 epochs in mini-batches of 16, holding out 10% of the training
# data for validation; the returned History object keeps the per-epoch metrics.
history = tf_model.fit(
    x=X_train,
    y=y_train,
    epochs=2,
    batch_size=16,
    validation_split=0.1,
)

<h2> Classification Report </h2>

In [None]:
# Score the held-out test set, threshold the sigmoid outputs at 0.5 to get
# class predictions, and print per-class precision, recall and F1.
y_pred = tf_model.predict(X_test)
predicted_positive = y_pred >= 0.5
print(classification_report(y_test, predicted_positive))

Uncomment the cell below to plot the training and validation accuracy curves.

In [None]:
# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']
# loss = history.history['loss']
# val_loss = history.history['val_loss']
# epochs = range(1, len(acc) + 1)

# plt.plot(epochs, acc, 'b', label='Training acc')
# plt.plot(epochs, val_acc, 'g', label='Validation acc')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.title('Training and validation accuracy')
# plt.legend()