In [None]:
# Mount Google Drive at /content/drive. The dataset CSV loaded further down in
# this cell is read from a path under this mount point, so the mount must
# succeed before the load step runs. Colab-specific: relies on `google.colab`.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Dropout, Input, Attention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Look for GPUs; when at least one exists, enable on-demand memory growth on
# the first device, otherwise report that execution falls back to the CPU.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print(f'GPU {physical_devices[0]} available: True')
else:
    print("No GPU available. Using CPU instead.")

# Fetch the NLTK resources (tokenizer models and stop-word lists)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Define a function for text preprocessing (stemming and cleaning)
# Cache for stop-word sets, keyed by language. Filled lazily on first use so
# that the NLTK corpus only has to be available when text is actually processed.
_STOPWORDS_BY_LANGUAGE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_BY_LANGUAGE['english']


def preprocess_text(text):
    """Clean, lowercase, stop-word-filter and stem a piece of text.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lowercase and split on whitespace.
      3. Drop tokens that are NLTK English stop words.
      4. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives filtering).
    """
    # The stop-word set is fetched once per call from a module-level cache.
    # Previously it was rebuilt from the corpus for every single token.
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()

    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Read the dataset from Drive (adjust the path to your file location) and
# replace missing values with empty strings
news_dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dataset/fakeNewsData.csv').fillna('')

# Build the model input: author and title joined by a space, then preprocessed
news_dataset['content'] = (news_dataset['author'] + ' ' + news_dataset['title']).apply(preprocess_text)

# Features / labels and an 80/20 train/test split with a fixed random state
X, y = news_dataset['content'], news_dataset['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit Word2Vec on the whitespace-tokenised training texts only
sentences = list(map(str.split, X_train))
wv_model = Word2Vec(sentences=sentences, min_count=2, vector_size=200, window=5, workers=4)

# Function to get Word2Vec embeddings
def get_word2vec_embeddings(wv_model, texts, max_len):
    """Turn whitespace-tokenised texts into a zero-padded embedding tensor.

    Parameters
    ----------
    wv_model : object
        Model exposing ``vector_size`` and a ``wv`` mapping that supports
        ``token in wv`` and ``wv[token]``.
    texts : sized iterable of str
        Texts to embed; each is split on whitespace.
    max_len : int
        Number of token positions kept per text; longer texts are truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), max_len, wv_model.vector_size)``.
        Position ``j`` of row ``i`` holds the vector of the ``j``-th token of
        ``texts[i]``; tokens missing from ``wv`` and unused trailing
        positions stay all-zero.
    """
    keyed_vectors = wv_model.wv
    embeddings = np.zeros((len(texts), max_len, wv_model.vector_size))
    for row, text in enumerate(texts):
        # Slicing keeps at most max_len tokens; positions are preserved so an
        # unknown token leaves a zero vector at its own index.
        for col, token in enumerate(text.split()[:max_len]):
            if token in keyed_vectors:
                embeddings[row, col] = keyed_vectors[token]
    return embeddings

max_len = 200  # Adjusted max length to manage memory usage
X_train_word2vec = get_word2vec_embeddings(wv_model, X_train, max_len)
X_test_word2vec = get_word2vec_embeddings(wv_model, X_test, max_len)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across Restart & Run All (same value as the split).
tf.keras.utils.set_random_seed(42)

# Define CNN with Attention.
# Input is one (max_len, vector_size) matrix of word vectors per sample.
input_layer = Input(shape=(X_train_word2vec.shape[1], X_train_word2vec.shape[2]))
# Two parallel convolutions over the same input: the kernel-5 branch is used as
# the attention query and the kernel-3 branch as the attention value.
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(input_layer)
attention_data = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
attention_layer = Attention()([conv_layer, attention_data])
# Collapse the sequence axis by taking the per-filter maximum.
flatten_layer = GlobalMaxPooling1D()(attention_layer)
dropout_layer = Dropout(0.5)(flatten_layer)
dense_layer_1 = Dense(64, activation='relu')(dropout_layer)
# Single sigmoid unit: probability of the positive label.
output_layer = Dense(1, activation='sigmoid')(dense_layer_1)

model_attention = Model(inputs=input_layer, outputs=output_layer)

model_attention.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

print("\nTraining CNN with Attention and Word2Vec Features...")
# Validation uses a slice of the TRAINING data (last 10%) rather than the test
# set, so the test set stays untouched until the final evaluation. Labels are
# converted to a NumPy array because `validation_split` requires array inputs.
history_attention = model_attention.fit(
    X_train_word2vec,
    np.asarray(y_train),
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
No GPU available. Using CPU instead.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Training CNN with Attention and Word2Vec Features...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

# Loss and accuracy on the held-out test set
loss_attention, accuracy_attention = model_attention.evaluate(X_test_word2vec, y_test)

print(f"\nAttention CNN Test Accuracy: {accuracy_attention}")
print(f"\n Loss Attention: {loss_attention}")

# Predict probabilities for test set. The model emits one column per sample;
# flatten it so every metric below receives a 1-D array.
y_pred_prob = model_attention.predict(X_test_word2vec).ravel()

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_prob > 0.5).astype(int)

# Confusion Matrix. Labels are pinned to [0, 1] so the matrix is always 2x2,
# even if one class is missing from the labels or the predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(f"Confusion Matrix:\n{cm}")

# AUC Score (uses the raw probabilities, not the thresholded predictions)
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"AUC Score: {auc_score:.4f}")

# Calculate True Positives, True Negatives, False Positives, False Negatives
tn, fp, fn, tp = cm.ravel()

# Sensitivity (Recall): share of actual positives predicted positive.
# Reported as 0.0 when the test set contains no positives.
sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
print(f"Sensitivity (Recall): {sensitivity:.4f}")

# Specificity: share of actual negatives predicted negative.
# Reported as 0.0 when the test set contains no negatives.
specificity = tn / (tn + fp) if (tn + fp) else 0.0
print(f"Specificity: {specificity:.4f}")

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")


Attention CNN Test Accuracy: 0.957932710647583

 Loss Attention: 0.1323414295911789
Confusion Matrix:
[[1980  152]
 [  23 2005]]
AUC Score: 0.9864
Sensitivity (Recall): 0.9887
Specificity: 0.9287
Precision: 0.9295
F1 Score: 0.9582
Matthews Correlation Coefficient (MCC): 0.9177
