In [3]:
# Mount Google Drive so files stored there can be read from this Colab runtime
# (skip this step if the data is not on Drive).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Check if GPU is available and use it.
# Memory growth asks TensorFlow to allocate GPU memory on demand rather than
# reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("No GPU available. Using CPU instead.")
else:
    # Apply the same setting to every visible GPU, not only the first one.
    for gpu in physical_devices:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # Raised when TensorFlow has already initialised the GPU (for
            # example when this cell runs after other TensorFlow work in the
            # same kernel). The setting can no longer be changed, but the GPU
            # is still usable, so report it and carry on instead of aborting.
            print(f'Could not enable memory growth for {gpu}: {err}')
    print(f'GPU {physical_devices[0]} available: True')

# Download NLTK resources (each call is a no-op when the package is up to date)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Define a function for text preprocessing (stemming and cleaning)

# Lazily-built defaults shared by every preprocess_text call, so the NLTK
# stop-word list and the stemmer are created once instead of per word / per row.
_PREPROCESS_DEFAULTS = {}


def _default_stop_words():
    """Return NLTK's English stop words as a frozenset, built on first use."""
    if 'stop_words' not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_DEFAULTS['stop_words']


def _default_stemmer():
    """Return a shared PorterStemmer instance, created on first use."""
    if 'stemmer' not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['stemmer'] = PorterStemmer()
    return _PREPROCESS_DEFAULTS['stemmer']


def preprocess_text(text, stop_words=None, stemmer=None):
    """Clean, lowercase, stop-word-filter and stem a piece of text.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and split on whitespace, stop words are dropped, and
    each remaining token is stemmed. The stems are joined with single spaces.

    Args:
        text: The raw string to process.
        stop_words: Optional container of lowercase words to drop. Defaults to
            NLTK's English stop-word list (requires the 'stopwords' corpus).
        stemmer: Optional object exposing ``stem(word) -> str``. Defaults to a
            shared ``PorterStemmer``.

    Returns:
        The processed text as a single space-separated string; an empty string
        when no token survives.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    if stemmer is None:
        stemmer = _default_stemmer()

    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Load dataset (adjust DATASET_CSV as per your file location) and replace
# missing values with empty strings so the string concatenation below works.
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/Dataset/fakeNewsData.csv'
news_dataset = pd.read_csv(DATASET_CSV).fillna('')

# Build the model's text field: "<author> <title>", then clean and stem it.
raw_content = news_dataset['author'] + ' ' + news_dataset['title']
news_dataset['content'] = raw_content.apply(preprocess_text)

# Hold out 35% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X, y = news_dataset['content'], news_dataset['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=1,
)

# Vectorization using CountVectorizer.
# The vocabulary (capped at the 5000 most frequent terms) is learned from the
# training split only and then reused to transform the test split.
# dtype=np.float32 replaces the default int64 counts: the dense arrays created
# by .toarray() take half the memory, and the count values themselves are
# unchanged (small integers are exactly representable in float32).
cv = CountVectorizer(max_features=5000, dtype=np.float32)
X_train_cv = cv.fit_transform(X_train).toarray()
X_test_cv = cv.transform(X_test).toarray()

# Define Dense model (removing Conv1D)

# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation and dropout masks are repeatable across runs. This is
# best-effort: some GPU kernels may still be non-deterministic.
SEED = 1
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Feed-forward binary classifier over the bag-of-words vector:
# input -> Dense(128, relu) -> Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5)
# -> Dense(1, sigmoid), whose output is a probability in [0, 1].
input_layer = Input(shape=(X_train_cv.shape[1],))  # Shape for CountVectorizer input
dense_layer_1 = Dense(128, activation='relu')(input_layer)
dropout_layer_1 = Dropout(0.5)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(0.5)(dense_layer_2)
output_layer = Dense(1, activation='sigmoid')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer=Adam(learning_rate=0.005), loss='binary_crossentropy', metrics=['accuracy'])

print("\nTraining Dense Network with CountVectorizer Features...")
# The held-out test split is passed as validation data, so the per-epoch
# validation metrics reported during training are test-set metrics.
history = model.fit(
    X_train_cv,
    y_train,
    validation_data=(X_test_cv, y_test),
    batch_size=64,
    epochs=20,
    verbose=1,
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GPU PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') available: True


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Training Dense Network with CountVectorizer Features...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [4]:
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Loss and accuracy on the test split, in the order configured at compile time
evaluation = model.evaluate(X_test_cv, y_test)
loss, accuracy = evaluation

print(f"\nTest Accuracy: {accuracy}")
print(f"\nLoss: {loss}")

# Sigmoid outputs (probabilities) for every test row
y_pred_prob = model.predict(X_test_cv)

# Threshold the probabilities: strictly above 0.5 becomes class 1, otherwise 0
is_positive = y_pred_prob > 0.5
y_pred = is_positive.astype(int)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

# AUC Score (computed from the predicted probabilities, not the 0/1 labels)
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"AUC Score: {auc_score:.4f}")

# Unpack the 2x2 matrix row by row: the first row holds the actual-0 counts
# (true negatives, false positives), the second the actual-1 counts
# (false negatives, true positives).
(tn, fp), (fn, tp) = cm

# Derive every remaining metric first, then report them in a fixed order.
sensitivity = tp / (tp + fn)  # recall: share of actual positives that were found
specificity = tn / (tn + fp)  # share of actual negatives that were found
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")


Test Accuracy: 0.9883241653442383

Loss: 0.09376787394285202
Confusion Matrix:
[[3567   52]
 [  33 3628]]
AUC Score: 0.9985
Sensitivity (Recall): 0.9910
Specificity: 0.9856
Precision: 0.9859
F1 Score: 0.9884
Matthews Correlation Coefficient (MCC): 0.9767
