# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [None]:
!pip3 install -r ../requirements.txt

In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


# Data Preparation (Loading CSV)

Load the processed_data `csv` file into pandas DataFrames
- `processed_data.csv` is loaded into `data` DataFrame (stemming has been performed to reduce processing time.)

In [2]:
data = pd.read_csv('../processed_data.csv')

In [3]:
data.info()
print("Dataframe Shape:", data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63860 entries, 0 to 63859
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   63860 non-null  int64 
 1   full_content            63860 non-null  object
 2   processed_full_content  63860 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.5+ MB
Dataframe Shape: (63860, 3)


In [4]:
data.head()

Unnamed: 0,label,full_content,processed_full_content
0,1,No comment is expected from Barack Obama Membe...,no comment expect barack obama member fyf911 f...
1,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,1,"Now, most of the demonstrators gathered last n...",demonstr gather last night exercis constitut p...
3,0,A dozen politically active pastors came here f...,dozen polit activ pastor came privat dinner fr...
4,1,"The RS-28 Sarmat missile, dubbed Satan 2, will...",rs-28 sarmat missil dub satan 2 replac ss-18 f...


In [5]:
data.isnull().sum()

label                     0
full_content              0
processed_full_content    0
dtype: int64

In [6]:
data['label'].value_counts()

label
0    34779
1    29081
Name: count, dtype: int64

In [7]:
# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('all')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/dariusng2103/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dariusng2103/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dariusng2103/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/dariusng2103/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/dariusng2103/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/dariusng2103/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!


True

### Basic Convolutional Neural Network (Tokenizer + Embedding Layer) + 5 Fold Cross-Validation + L2 Regularization

In [8]:
import tensorflow as tf
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Tokenization and Padding Parameters
max_words = 10000  # Max vocabulary size
max_sequence_length = 300  # Max length of sequences

# Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Define the CNN Model with L2 Regularization
def create_basic_cnn():
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))  # Add dropout for regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # Binary classification

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
all_fold_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

for train_index, val_index in kf.split(X, y):
    print(f"\nTraining fold {fold}...")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = create_basic_cnn()
    history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=1)
    
    # Predict and evaluate
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    accuracy = accuracy_score(y_val, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
    
    # Store metrics for this fold
    all_fold_metrics['accuracy'].append(accuracy)
    all_fold_metrics['precision'].append(precision)
    all_fold_metrics['recall'].append(recall)
    all_fold_metrics['f1'].append(f1)
    
    print(f"Fold {fold} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    fold += 1

# Calculate and print average metrics across all folds
avg_accuracy = np.mean(all_fold_metrics['accuracy'])
avg_precision = np.mean(all_fold_metrics['precision'])
avg_recall = np.mean(all_fold_metrics['recall'])
avg_f1 = np.mean(all_fold_metrics['f1'])

print("\nAverage Evaluation Metrics across 5 folds:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

2024-11-06 14:03:17.371651: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 14:03:17.486219: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730872997.530685     747 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730872997.543159     747 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 14:03:17.653495: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr


Training fold 1...


I0000 00:00:1730873018.871822     747 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9557 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/10


I0000 00:00:1730873019.811247    4171 service.cc:148] XLA service 0x7f1280014fd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730873019.811470    4171 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4070, Compute Capability 8.9
2024-11-06 14:03:39.842899: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1730873019.945921    4171 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 94/799[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.5845 - loss: 1.8652

I0000 00:00:1730873020.949878    4171 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8193 - loss: 0.7628 - val_accuracy: 0.9577 - val_loss: 0.2331
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9584 - loss: 0.2266 - val_accuracy: 0.9643 - val_loss: 0.2051
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9704 - loss: 0.1922 - val_accuracy: 0.9665 - val_loss: 0.1916
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9776 - loss: 0.1711 - val_accuracy: 0.9669 - val_loss: 0.1842
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9831 - loss: 0.1562 - val_accuracy: 0.9663 - val_loss: 0.1827
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9855 - loss: 0.1450 - val_accuracy: 0.9669 - val_loss: 0.1759
Epoch 7/10
[1m799/799[0m [32m━━━━━━━



[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8315 - loss: 0.7543 - val_accuracy: 0.9594 - val_loss: 0.2321
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9610 - loss: 0.2247 - val_accuracy: 0.9636 - val_loss: 0.2072
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9716 - loss: 0.1899 - val_accuracy: 0.9667 - val_loss: 0.1928
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9789 - loss: 0.1699 - val_accuracy: 0.9678 - val_loss: 0.1847
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9830 - loss: 0.1550 - val_accuracy: 0.9676 - val_loss: 0.1807
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9869 - loss: 0.1440 - val_accuracy: 0.9675 - val_loss: 0.1765
Epoch 7/10
[1m799/799[0m [32m━━━━━━━



[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8305 - loss: 0.7702 - val_accuracy: 0.9579 - val_loss: 0.2282
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9585 - loss: 0.2270 - val_accuracy: 0.9640 - val_loss: 0.2019
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9708 - loss: 0.1910 - val_accuracy: 0.9663 - val_loss: 0.1882
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9784 - loss: 0.1707 - val_accuracy: 0.9675 - val_loss: 0.1810
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9828 - loss: 0.1561 - val_accuracy: 0.9674 - val_loss: 0.1759
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9854 - loss: 0.1453 - val_accuracy: 0.9681 - val_loss: 0.1711
Epoch 7/10
[1m799/799[0m [32m━━━━━━━



[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8281 - loss: 0.7717 - val_accuracy: 0.9543 - val_loss: 0.2351
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9601 - loss: 0.2263 - val_accuracy: 0.9574 - val_loss: 0.2113
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9714 - loss: 0.1913 - val_accuracy: 0.9627 - val_loss: 0.1964
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9798 - loss: 0.1698 - val_accuracy: 0.9644 - val_loss: 0.1901
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9831 - loss: 0.1551 - val_accuracy: 0.9635 - val_loss: 0.1837
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9869 - loss: 0.1433 - val_accuracy: 0.9648 - val_loss: 0.1788
Epoch 7/10
[1m799/799[0m [32m━━━━━━━



Epoch 1/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8437 - loss: 0.7758 - val_accuracy: 0.9565 - val_loss: 0.2355
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9599 - loss: 0.2276 - val_accuracy: 0.9644 - val_loss: 0.2051
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9714 - loss: 0.1906 - val_accuracy: 0.9646 - val_loss: 0.1926
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9795 - loss: 0.1691 - val_accuracy: 0.9677 - val_loss: 0.1856
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9830 - loss: 0.1554 - val_accuracy: 0.9691 - val_loss: 0.1793
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9864 - loss: 0.1443 - val_accuracy: 0.9679 - val_loss: 0.1767
Epoch 7/10
[1m799/799[0m 

### Convolutional Neural Network + TF-IDF Vectorizer

Using TF-IDF vectorizer along with CNN led to a drastic fall in performance. Below are some reasons why we should not use TF-IDF vectorizer along with a CNN or other neural networks.

#### Lack of Spatial Structure:

TF-IDF vectors are sparse and non-sequential representations where each position in the vector represents a word, not a spatial pattern.
CNNs are designed to detect patterns in sequential or spatially structured data (e.g., images or sentences), so they might struggle to find meaningful patterns in TF-IDF vectors.

#### High-Dimensional Sparse Data:

TF-IDF vectors, especially with a high max_features value (like 10,000), result in a high-dimensional but sparse input.
CNNs are generally not well-suited for such high-dimensional sparse data; they perform better with dense embeddings where words have contextually meaningful dimensions.

#### Mismatch Between Input Type and CNN Architecture:

CNNs are typically effective when applied to word embeddings (like GloVe or Word2Vec) because embeddings maintain semantic relationships and neighborhood structures.
TF-IDF, however, does not capture word order or semantic relationships, which means the convolution operation might not yield meaningful feature maps.


In [None]:
import tensorflow as tf
import numpy as np
import random

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Reshape, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Apply TF-IDF Vectorization
max_features = 10000  # Limit TF-IDF to top 10,000 features
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
X_tfidf = tfidf_vectorizer.fit_transform(data['processed_full_content']).toarray()

# Convert the labels
y = data['label'].values  # Target labels

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Step 3: Define the CNN Model for TF-IDF Input
def create_cnn_with_tfidf():
    inputs = Input(shape=(max_features,))
    x = Reshape((max_features, 1))(inputs)  # Reshape TF-IDF output to be compatible with Conv1D

    # Convolutional layer
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    
    # Fully connected layer
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)  # Dropout for regularization
    outputs = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Train the Model
model = create_cnn_with_tfidf()
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Step 5: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Epoch 1/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 24ms/step - accuracy: 0.5353 - loss: 0.6893 - val_accuracy: 0.5483 - val_loss: 0.6825
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.5550 - loss: 0.6836 - val_accuracy: 0.5729 - val_loss: 0.6786
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.5565 - loss: 0.6815 - val_accuracy: 0.5727 - val_loss: 0.6772
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.5610 - loss: 0.6801 - val_accuracy: 0.5727 - val_loss: 0.6752
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.5593 - loss: 0.6793 - val_accuracy: 0.5710 - val_loss: 0.6766
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.5620 - loss: 0.6795 - val_accuracy: 0.5711 - val_loss: 0.6749
Epoch 7/10
[1m799

### Convolutional Neural Networks + Count Vectorization (conversion to sequences) + Stratified 5-Fold CV + L2 Regularization

In [9]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.regularizers import l2

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Step 1: Text Vectorization using CountVectorizer
max_features = 10000  # Max vocabulary size for CountVectorizer
vectorizer = CountVectorizer(max_features=max_features)
X_counts = vectorizer.fit_transform(data['processed_full_content'])
word_index = vectorizer.vocabulary_

# Convert CountVectorizer output to sequences
index_to_word = {i: word for word, i in word_index.items()}

def counts_to_sequences(X_counts):
    sequences = []
    for i in range(X_counts.shape[0]):
        indices = X_counts[i].nonzero()[1]
        words = [index_to_word[idx] for idx in indices]
        seq = [word_index[word] + 1 for word in words]  # +1 because 0 is reserved for padding
        sequences.append(seq)
    return sequences

sequences = counts_to_sequences(X_counts)
max_sequence_length = 300  # Adjust to your needs
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Define the Basic CNN Model with L2 Regularization
def create_basic_cnn_with_l2():
    model = Sequential()
    
    # Embedding layer with random initialization
    model.add(Embedding(input_dim=max_features + 1, output_dim=128))
    
    # Convolutional layer with L2 regularization
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(GlobalMaxPooling1D())
    
    # Fully connected layer with L2 regularization
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))  # Add dropout for regularization
    
    # Output layer with L2 regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # Binary classification
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Stratified 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = create_basic_cnn_with_l2()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)
    
    # Evaluate the model
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Step 5: Print Cross-Validation Results
print("\nCross-Validation Metrics:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")


Epoch 1/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7685 - loss: 0.8773 - val_accuracy: 0.9291 - val_loss: 0.2726
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9301 - loss: 0.2740 - val_accuracy: 0.9330 - val_loss: 0.2610
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9425 - loss: 0.2451 - val_accuracy: 0.9389 - val_loss: 0.2481
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9531 - loss: 0.2220 - val_accuracy: 0.9424 - val_loss: 0.2419
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9605 - loss: 0.2073 - val_accuracy: 0.9360 - val_loss: 0.2567
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9662 - loss: 0.1931 - val_accuracy: 0.9419 - val_loss: 0.2467
Epoch 7/10
[1m639/639[0m 

### Convolutional Neural Network + Custom-trained Word2Vec Embeddings + 5-Fold Cross Validation + L2 Regularization

#### Why do we use word embedding over other preprocessing techniques (eg. tf-idf, count vectorizer), for our task of fake news classification?


##### 1. Word embeddings capture the semantic relationships between words in a dense, low-dimensional space.
Fake news often uses subtle language, and word embeddings like GloVe can capture the semantic context of words, allowing the model to understand relationships between words that simple vectorizers would miss. This helps in detecting nuanced differences in language use between real and fake news.

##### 2. Word embeddings produce dense, low-dimensional vectors (e.g., 100-300 dimensions) that capture rich word information.
Pre-trained embeddings are built on large corpora like Wikipedia and news articles, giving our model external knowledge that’s useful for distinguishing between real news and fake news. This boosts the model's ability to generalize on unseen test data from our web scraping.

##### 3. Efficient Representation of Semantics
Words in fake news can appear in different contexts, but with similar underlying meanings (e.g., "hoax" and "lie"). GloVe embeddings represent these similar words in close proximity in the vector space, helping the model recognize fake news patterns more effectively than TF-IDF or Count Vectorizer.

##### 4. Handling Synonyms and Rare Words:
Fake news often uses alternative phrases or rare terminology. Pre-trained embeddings like GloVe can handle these rare words because they’ve seen a broad variety of language during training, making our model more robust against unusual vocabulary choices in fake news.

## Cross Validation 
We use Stratified K-Fold Cross-Validation with n_splits=5 to evaluate the model on different splits of the data. 
For each fold, we store the metrics (accuracy, precision, recall, and F1 score) and then calculate the average metrics across all folds for a robust evaluation.

Cross-validation helps us understand the model’s performance more robustly by testing it on multiple splits of the data. This approach gives a more reliable estimate of model performance and helps reduce the risk of overfitting to any single train-test split.

In [14]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Tokenization parameters
max_words = 5000
max_sequence_length = 300

# Tokenize and create sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values

# Step 2: Train Word2Vec Embeddings
sentences = [text.split() for text in data['processed_full_content']]
embedding_dim = 100
custom_word2vec = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=2, workers=4)

# Step 3: Create Embedding Matrix from Custom Word2Vec
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < max_words:
        if word in custom_word2vec.wv:
            embedding_matrix[i] = custom_word2vec.wv[word]
        else:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Define CNN Model with Custom Word2Vec Embeddings and L2 Regularization
def create_cnn_with_l2():
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=True)(input_layer)

    x = Conv1D(filters=64, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01))(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Implement Stratified 5-Fold Cross-Validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X, y):
    X_train, X_val = X[train_index], X[test_index]
    y_train, y_val = y[train_index], y[test_index]

    # Create and train the CNN model
    model = create_cnn_with_l2()
    model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_val, y_val), verbose=1)

    # Evaluate model on validation set
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    accuracy = accuracy_score(y_val, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')

    # Collect metrics
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate and display average metrics
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1_score = np.mean(f1_scores)

print("\nAggregated Evaluation Metrics:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1_score:.4f}")

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.7829 - loss: 1.7065 - val_accuracy: 0.9429 - val_loss: 0.6274
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9409 - loss: 0.5485 - val_accuracy: 0.9536 - val_loss: 0.3495
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9572 - loss: 0.3277 - val_accuracy: 0.9627 - val_loss: 0.2546
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9650 - loss: 0.2447 - val_accuracy: 0.9689 - val_loss: 0.2093
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9701 - loss: 0.2028 - val_accuracy: 0.9693 - val_loss: 0.1872
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9728 - loss: 0.1777 - val_accuracy: 0.9720 - val_loss: 0.1726
Epoch 7/10
[1m200/200[0m 

### Convolutional Neural network + GloVe word embeddings (100D) + 5-Fold Cross Validation + L2 regularization

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Tokenize the already preprocessed text in 'processed_full_content' column
tokenizer = Tokenizer(num_words=5000)  # Limit to top 5000 words
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])

# Padding sequences to ensure uniform length
max_sequence_length = 300
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Labels for fake news (0 for real, 1 for fake)

# Load pre-trained GloVe embeddings
embeddings_index = {}
with open('../glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Prepare embedding matrix for the words in your dataset
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define CNN model with pre-trained embeddings and L2 regularization using the functional API
def create_model():
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=True)(input_layer)

    convs = []
    for kernel_size in [3, 4, 5]:
        conv = Conv1D(filters=64, kernel_size=kernel_size, activation='relu',
                      kernel_regularizer=l2(0.01))(embedding_layer)
        pool = GlobalMaxPooling1D()(conv)
        convs.append(pool)

    merged = Concatenate()(convs)
    dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(merged)
    drop = Dropout(0.5)(dense)
    output = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(drop)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Perform Stratified K-Fold Cross Validation with 5 folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = dict(enumerate(class_weights))

    # Create and train the CNN model
    model = create_model()
    model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_test, y_test),
              class_weight=class_weights_dict, verbose=1)

    # Predict and calculate metrics for this fold
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

    # Store metrics for this fold
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1_score = np.mean(f1_scores)

# Print the classification report
print("\nAggregated Report:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1_score:.4f}")

Epoch 1/10
[1m194/200[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.7540 - loss: 2.5671




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.7575 - loss: 2.5328 - val_accuracy: 0.9443 - val_loss: 0.5582
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9451 - loss: 0.4801 - val_accuracy: 0.9608 - val_loss: 0.2999
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9592 - loss: 0.2874 - val_accuracy: 0.9648 - val_loss: 0.2272
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9653 - loss: 0.2228 - val_accuracy: 0.9661 - val_loss: 0.2001
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9686 - loss: 0.1977 - val_accuracy: 0.9651 - val_loss: 0.1902
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9728 - loss: 0.1844 - val_accuracy: 0.9660 - val_loss: 0.1843
Epoch 7/10
[1m200/200[0m [32m━━━━━━

### Convolutional Neural network + GloVe word embeddings (300D) + 5-Fold Cross Validation + L2 regularization

L2 Regularization discourages large weights by adding a penalty to the loss function. It encourages the model to find simpler solutions by minimizing both the original loss and the weight magnitudes.

*Benefit*: Reduces overfitting by making the model less sensitive to noise and preventing it from memorizing specific data points.

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Tokenize the already preprocessed text in 'processed_full_content' column
tokenizer = Tokenizer(num_words=5000)  # Limit to top 5000 words
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])

# Padding sequences to ensure uniform length
max_sequence_length = 300
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Labels for fake news (0 for real, 1 for fake)

# Load pre-trained GloVe embeddings
embeddings_index = {}
with open('../glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Prepare embedding matrix for the words in your dataset
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define CNN model with pre-trained embeddings and L2 regularization using the functional API
def create_model():
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=True)(input_layer)

    convs = []
    for kernel_size in [3, 4, 5]:
        conv = Conv1D(filters=64, kernel_size=kernel_size, activation='relu',
                      kernel_regularizer=l2(0.01))(embedding_layer)
        pool = GlobalMaxPooling1D()(conv)
        convs.append(pool)

    merged = Concatenate()(convs)
    dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(merged)
    drop = Dropout(0.5)(dense)
    output = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(drop)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Perform Stratified K-Fold Cross Validation with 5 folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = dict(enumerate(class_weights))

    # Create and train the CNN model
    model = create_model()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test),
              class_weight=class_weights_dict, verbose=1)

    # Predict and calculate metrics for this fold
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

    # Store metrics for this fold
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1_score = np.mean(f1_scores)

# Print the classification report
print("\nAggregated Report:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1_score:.4f}")


ValueError: could not broadcast input array from shape (300,) into shape (100,)

### Convolutional Neural network + Custom-trained word2vec word embeddings + 5-Fold Cross Validation + L2 Regularization + GridSearchCV


In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2

# Seed for reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Tokenize the text
max_words = 10000  # Vocabulary size
max_sequence_length = 300  # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Labels

# Train Custom Word2Vec 
sentences = [text.split() for text in data['processed_full_content']]
embedding_dim = 100
custom_word2vec = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=2, workers=4)

# Create Embedding Matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < max_words:
        if word in custom_word2vec.wv:
            embedding_matrix[i] = custom_word2vec.wv[word]
        else:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Define Model Function for KerasClassifier
def create_model(filters=128, kernel_size=5, dense_units=64, dropout_rate=0.5, l2_reg=0.01):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=False)(input_layer)

    x = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', kernel_regularizer=l2(l2_reg))(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    
    x = Dense(dense_units, activation='relu', kernel_regularizer=l2(l2_reg))(x)
    x = Dropout(dropout_rate)(x)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(l2_reg))(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model with KerasClassifier
model = KerasClassifier(build_fn=create_model, verbose=1)

# Define Grid of Hyperparameters
param_grid = {
    'filters': [64, 128],
    'kernel_size': [3, 5],
    'dense_units': [32, 64],
    'dropout_rate': [0.2, 0.5],
    'l2_reg': [0.01, 0.001],
    'epochs': [5, 10],
    'batch_size': [32, 64]
}

# Define F1 Score as Scoring Metric
f1_scorer = make_scorer(f1_score, average='binary')

# Perform Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=StratifiedKFold(n_splits=5), verbose=1)
grid_result = grid.fit(X, y)

# Print Best Score and Parameters
print("Best F1 Score: {:.4f}".format(grid_result.best_score_))
print("Best Parameters: ", grid_result.best_params_)


ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'