In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from keras import layers
from keras import callbacks
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two halves of the dataset: genuine articles and fabricated ones.
true_csv = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake_csv = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

# Target encoding: 1 = row came from True.csv, 0 = row came from Fake.csv.
# A scalar is broadcast to every row, so no per-row list is needed.
true_csv['label'] = 1
fake_csv['label'] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both source frames contribute the same labels 0, 1, 2, ... and the index
# contains duplicates, which makes label-based selection/alignment ambiguous.
csv = pd.concat([true_csv, fake_csv], ignore_index=True)

csv.head()

# **Class Distribution**

In [None]:
# Number of rows for each value of the 'label' column (1 and 0 were assigned when loading).
csv['label'].value_counts()

In [None]:
# Number of rows for each distinct value of the 'subject' column.
csv['subject'].value_counts()

# **Data Cleaning**
1. Lowercasing the text and removing punctuation (any character that is not a letter, digit, underscore or whitespace).
2. Removing stop words.

In [None]:
# One or more consecutive characters that are neither word characters
# (letters, digits, underscore) nor whitespace. Raw string so that \w and \s
# reach the regex engine instead of being parsed as (invalid) string escapes.
_NON_WORD_RE = re.compile(r'[^\w\s]+')


def clean(text, stop_words=None):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, every character that is not a word character or
    whitespace is removed, and the remaining whitespace-separated tokens are
    filtered against a stop-word collection and re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN produced by a
        missing cell) is treated as empty text.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to the ``STOP_WORDS`` set imported by the
        notebook.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing remains.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); return empty text rather than crash.
        return ''
    if stop_words is None:
        stop_words = STOP_WORDS

    stripped = _NON_WORD_RE.sub('', text.lower())
    return ' '.join(word for word in stripped.split() if word not in stop_words)

In [None]:
# Join headline and body with a single space, keep that raw combination,
# and store a cleaned version of it alongside.
title_and_body = csv['title'] + ' ' + csv['text']
csv['text_combined'] = title_and_body
csv['text_cleaned'] = title_and_body.apply(clean)

# Preview raw vs. cleaned text next to the target label.
preview_columns = ['text_combined', 'text_cleaned', 'label']
csv[preview_columns].head()

# **Machine Learning**
**Feature Extraction**

In [None]:
# Turn the cleaned articles into a TF-IDF document-term matrix.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so test documents contribute to the vocabulary
# and IDF weights.
corpus = csv['text_cleaned']
vect = TfidfVectorizer()
features = vect.fit_transform(corpus)

# (number of documents, vocabulary size)
features.shape

**Train-Test Split**

In [None]:
# Shuffle rows and labels together. The shuffle is seeded as well: an unseeded
# shuffle feeds train_test_split a different row order on every run, so the
# fixed random_state below would not make the partition reproducible.
features_shuffled, labels_shuffled = shuffle(features, csv['label'], random_state=42)

# 80/20 split, stratified so both parts keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(features_shuffled,
                                                    labels_shuffled,
                                                    stratify=labels_shuffled,
                                                    random_state=42,
                                                    test_size=0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

**Model Building**

In [None]:
# Fit a linear SVM on the TF-IDF training features (fit() returns the estimator).
model = LinearSVC().fit(X_train, y_train)

# Score on the same data the model was trained on.
model.score(X_train, y_train)

**Model Evaluation**

In [None]:
# Per-class precision / recall / F1 of the linear SVM on the held-out split.
test_predictions = model.predict(X_test)
print(classification_report(y_test, test_predictions))

In [None]:
# plot_confusion_matrix() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Build the matrix explicitly and render it with ConfusionMatrixDisplay, the
# same approach the deep-learning evaluation further down uses.
svc_cm = confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=svc_cm,
                       display_labels=model.classes_).plot(cmap=plt.cm.Blues);

# **Deep Learning**

**Feature Extraction and Preprocessing**

In [None]:
# Learn a word -> integer index from the cleaned articles; words outside the
# learned vocabulary are mapped to the 'OOV' token.
encoder = Tokenizer(oov_token='OOV')
encoder.fit_on_texts(csv['text_cleaned'])
word_to_index = encoder.word_index

# One extra slot on top of the learned words (Keras word indices start at 1,
# leaving 0 free for padding).
vocab_size = len(word_to_index) + 1

# Encode every article as a list of word indices, then pad at the end so all
# rows share the same length.
token_sequences = encoder.texts_to_sequences(csv['text_cleaned'])
x = pad_sequences(token_sequences, padding='post')

In [None]:
# Shape of the padded sequence array returned by pad_sequences above.
x.shape

In [None]:
# Shuffle padded sequences and labels together. The shuffle is seeded as well:
# an unseeded shuffle feeds train_test_split a different row order on every
# run, so the fixed random_state below would not make the split reproducible.
features_shuffled, labels_shuffled = shuffle(x, csv['label'], random_state=42)

# 80/20 split, stratified so both parts keep the same label proportions.
X_train1, X_test1, y_train1, y_test1 = train_test_split(features_shuffled,
                                                    labels_shuffled,
                                                    stratify=labels_shuffled,
                                                    random_state=42,
                                                    test_size=0.20)
X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape

# **Basic Neural Network**

1. The first layer is an Embedding layer with 10 dimensions. Only 10 dimensions are used to keep the model small and reduce training time.
2. The second layer is a Global Average Pooling layer that averages each embedding dimension across all token positions in the sequence.
3. The third and fourth layers are dense (fully connected) layers with ReLU and sigmoid activations, respectively.

In [None]:
# Small feed-forward text classifier:
# embedding -> average over the sequence -> hidden dense -> sigmoid output.
model1 = Sequential([
    layers.Embedding(input_dim=vocab_size,
                     output_dim=10,
                     input_length=X_train1.shape[1]),
    layers.GlobalAveragePooling1D(),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy matches the single sigmoid output unit; track accuracy
# while training with the Adam optimizer.
model1.compile(
    optimizer='adam',
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)
model1.summary()

# **Model Training**

In [None]:
# Stop training once the validation loss has not improved for 3 epochs in a row.
callback = callbacks.EarlyStopping(monitor='val_loss', patience=3)

# The callback must be handed to fit() to have any effect.
# NOTE(review): the test split doubles as validation data here, so the
# evaluation below is not on data that was fully unseen during training.
history = model1.fit(X_train1, y_train1,
                     epochs=10,
                     validation_data=(X_test1, y_test1),
                     callbacks=[callback])

# **Model Evaluation**

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at 0.5.
y_pred1 = (model1.predict(X_test1) > 0.5).astype('int32')
print(classification_report(y_test1, y_pred1))

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at 0.5.
y_pred1 = (model1.predict(X_test1) > 0.5).astype('int32')

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test1, y_pred1)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[0, 1])

In [None]:
# Draw the confusion matrix built in the previous cell, with cell counts shown.
disp = disp.plot(
    cmap=plt.cm.Blues,
    include_values=True,
    xticks_rotation='horizontal',
)
plt.show()