In [1]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [2]:
# !pip3 install -r ../requirements.txt

In [3]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


# Data Preparation (Loading CSV)

Load the processed_data `csv` file into pandas DataFrames
- `processed_data.csv` is loaded into `data` DataFrame (stemming has been performed to reduce processing time.)

In [4]:
data = pd.read_csv('./processed_data_filtered.csv')

In [5]:
data['label'].value_counts()

label
0    34030
1    26461
Name: count, dtype: int64

In [6]:
data.info()
print("Dataframe Shape:", data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
 3   word_count              60491 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.8+ MB
Dataframe Shape: (60491, 4)


In [7]:
# # Ensure required NLTK data is downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('all')

### Basic Convolutional Neural Network (Tokenizer + Embedding Layer) + 5 Fold Cross-Validation + L2 Regularization

In [8]:
import tensorflow as tf
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Tokenization and Padding Parameters
max_words = 10000  # Max vocabulary size
max_sequence_length = 300  # Max length of sequences

# Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Define the CNN Model with L2 Regularization
def create_basic_cnn():
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.2))  # Add dropout for regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # Binary classification

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
all_fold_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

for train_index, val_index in kf.split(X, y):
    print(f"\nTraining fold {fold}...")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = create_basic_cnn()
    history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=1)
    
    # Predict and evaluate
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    accuracy = accuracy_score(y_val, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
    
    # Store metrics for this fold
    all_fold_metrics['accuracy'].append(accuracy)
    all_fold_metrics['precision'].append(precision)
    all_fold_metrics['recall'].append(recall)
    all_fold_metrics['f1'].append(f1)
    
    print(f"Fold {fold} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    fold += 1

# Calculate and print average metrics across all folds
avg_accuracy = np.mean(all_fold_metrics['accuracy'])
avg_precision = np.mean(all_fold_metrics['precision'])
avg_recall = np.mean(all_fold_metrics['recall'])
avg_f1 = np.mean(all_fold_metrics['f1'])

print("\nAverage Evaluation Metrics across 5 folds:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")


Training fold 1...
Epoch 1/10




[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.8405 - loss: 0.7487 - val_accuracy: 0.9555 - val_loss: 0.2279
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 21ms/step - accuracy: 0.9619 - loss: 0.2136 - val_accuracy: 0.9564 - val_loss: 0.2090
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 21ms/step - accuracy: 0.9740 - loss: 0.1784 - val_accuracy: 0.9579 - val_loss: 0.2013
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - accuracy: 0.9815 - loss: 0.1587 - val_accuracy: 0.9559 - val_loss: 0.1995
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9851 - loss: 0.1452 - val_accuracy: 0.9553 - val_loss: 0.1979
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9881 - loss: 0.1352 - val_accuracy: 0.9582 - val_loss: 0.1904
Epoch 7/10
[1m757/757[0m 



[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 31ms/step - accuracy: 0.8293 - loss: 0.7502 - val_accuracy: 0.9604 - val_loss: 0.2178
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step - accuracy: 0.9596 - loss: 0.2150 - val_accuracy: 0.9651 - val_loss: 0.1953
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 37ms/step - accuracy: 0.9717 - loss: 0.1823 - val_accuracy: 0.9655 - val_loss: 0.1859
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 44ms/step - accuracy: 0.9788 - loss: 0.1643 - val_accuracy: 0.9675 - val_loss: 0.1776
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 42ms/step - accuracy: 0.9834 - loss: 0.1498 - val_accuracy: 0.9669 - val_loss: 0.1737
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 39ms/step - accuracy: 0.9866 - loss: 0.1391 - val_accuracy: 0.9680 - val_loss: 0.1694
Epoch 7/10
[1m757/757[0m 



[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 31ms/step - accuracy: 0.8344 - loss: 0.7535 - val_accuracy: 0.9536 - val_loss: 0.2321
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 31ms/step - accuracy: 0.9616 - loss: 0.2109 - val_accuracy: 0.9612 - val_loss: 0.2077
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 31ms/step - accuracy: 0.9742 - loss: 0.1778 - val_accuracy: 0.9635 - val_loss: 0.1938
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.9813 - loss: 0.1578 - val_accuracy: 0.9666 - val_loss: 0.1812
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.9865 - loss: 0.1427 - val_accuracy: 0.9643 - val_loss: 0.1814
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.9898 - loss: 0.1316 - val_accuracy: 0.9632 - val_loss: 0.1795
Epoch 7/10
[1m757/757[0m 



[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.8359 - loss: 0.7566 - val_accuracy: 0.9611 - val_loss: 0.2232
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.9642 - loss: 0.2113 - val_accuracy: 0.9649 - val_loss: 0.2007
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.9750 - loss: 0.1776 - val_accuracy: 0.9661 - val_loss: 0.1885
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.9821 - loss: 0.1565 - val_accuracy: 0.9678 - val_loss: 0.1822
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.9857 - loss: 0.1422 - val_accuracy: 0.9677 - val_loss: 0.1777
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.9888 - loss: 0.1319 - val_accuracy: 0.9677 - val_loss: 0.1736
Epoch 7/10
[1m757/757[0m 



[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.8436 - loss: 0.7631 - val_accuracy: 0.9564 - val_loss: 0.2260
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 28ms/step - accuracy: 0.9621 - loss: 0.2155 - val_accuracy: 0.9629 - val_loss: 0.1993
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.9735 - loss: 0.1794 - val_accuracy: 0.9609 - val_loss: 0.1946
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.9803 - loss: 0.1601 - val_accuracy: 0.9621 - val_loss: 0.1841
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.9847 - loss: 0.1460 - val_accuracy: 0.9630 - val_loss: 0.1788
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.9878 - loss: 0.1352 - val_accuracy: 0.9658 - val_loss: 0.1711
Epoch 7/10
[1m757/757[0m 

### Convolutional Neural Network + TF-IDF Vectorizer

Using TF-IDF vectorizer along with CNN led to a drastic fall in performance. Below are some reasons why we should not use TF-IDF vectorizer along with a CNN or other neural networks.

#### Lack of Spatial Structure:

TF-IDF vectors are sparse and non-sequential representations where each position in the vector represents a word, not a spatial pattern.
CNNs are designed to detect patterns in sequential or spatially structured data (e.g., images or sentences), so they might struggle to find meaningful patterns in TF-IDF vectors.

#### High-Dimensional Sparse Data:

TF-IDF vectors, especially with a high max_features value (like 10,000), result in a high-dimensional but sparse input.
CNNs are generally not well-suited for such high-dimensional sparse data; they perform better with dense embeddings where words have contextually meaningful dimensions.

#### Mismatch Between Input Type and CNN Architecture:

CNNs are typically effective when applied to word embeddings (like GloVe or Word2Vec) because embeddings maintain semantic relationships and neighborhood structures.
TF-IDF, however, does not capture word order or semantic relationships, which means the convolution operation might not yield meaningful feature maps.


In [9]:
import tensorflow as tf
import numpy as np
import random

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Reshape, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Apply TF-IDF Vectorization
max_features = 10000  # Limit TF-IDF to top 10,000 features
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
X_tfidf = tfidf_vectorizer.fit_transform(data['processed_full_content']).toarray()

# Convert the labels
y = data['label'].values  # Target labels

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Step 3: Define the CNN Model for TF-IDF Input
def create_cnn_with_tfidf():
    inputs = Input(shape=(max_features,))
    x = Reshape((max_features, 1))(inputs)  # Reshape TF-IDF output to be compatible with Conv1D

    # Convolutional layer
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    
    # Fully connected layer
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)  # Dropout for regularization
    outputs = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Train the Model
model = create_cnn_with_tfidf()
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Step 5: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Epoch 1/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 97ms/step - accuracy: 0.5619 - loss: 0.6843 - val_accuracy: 0.5679 - val_loss: 0.6789
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 97ms/step - accuracy: 0.5699 - loss: 0.6806 - val_accuracy: 0.5834 - val_loss: 0.6755
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 94ms/step - accuracy: 0.5749 - loss: 0.6774 - val_accuracy: 0.5825 - val_loss: 0.6727
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 92ms/step - accuracy: 0.5747 - loss: 0.6764 - val_accuracy: 0.5837 - val_loss: 0.6717
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 91ms/step - accuracy: 0.5759 - loss: 0.6756 - val_accuracy: 0.5817 - val_loss: 0.6713
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 92ms/step - accuracy: 0.5750 - loss: 0.6753 - val_accuracy: 0.5810 - val_loss: 0.6712
Epoch 7/10
[1m7

### Convolutional Neural Networks + Count Vectorization (conversion to sequences) + Stratified 5-Fold CV + L2 Regularization

In [10]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.regularizers import l2

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Step 1: Text Vectorization using CountVectorizer
max_features = 10000  # Max vocabulary size for CountVectorizer
vectorizer = CountVectorizer(max_features=max_features)
X_counts = vectorizer.fit_transform(data['processed_full_content'])
word_index = vectorizer.vocabulary_

# Convert CountVectorizer output to sequences
index_to_word = {i: word for word, i in word_index.items()}

def counts_to_sequences(X_counts):
    sequences = []
    for i in range(X_counts.shape[0]):
        indices = X_counts[i].nonzero()[1]
        words = [index_to_word[idx] for idx in indices]
        seq = [word_index[word] + 1 for word in words]  # +1 because 0 is reserved for padding
        sequences.append(seq)
    return sequences

sequences = counts_to_sequences(X_counts)
max_sequence_length = 300  # Adjust to your needs
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Define the Basic CNN Model with L2 Regularization
def create_basic_cnn_with_l2():
    model = Sequential()
    
    # Embedding layer with random initialization
    model.add(Embedding(input_dim=max_features + 1, output_dim=128))
    
    # Convolutional layer with L2 regularization
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(GlobalMaxPooling1D())
    
    # Fully connected layer with L2 regularization
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))  # Add dropout for regularization
    
    # Output layer with L2 regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # Binary classification
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Stratified 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = create_basic_cnn_with_l2()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)
    
    # Evaluate the model
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Step 5: Print Cross-Validation Results
print("\nCross-Validation Metrics:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")


Epoch 1/10
[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 53ms/step - accuracy: 0.7806 - loss: 0.8735 - val_accuracy: 0.9351 - val_loss: 0.2590
Epoch 2/10
[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 29ms/step - accuracy: 0.9343 - loss: 0.2636 - val_accuracy: 0.9462 - val_loss: 0.2264
Epoch 3/10
[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 29ms/step - accuracy: 0.9468 - loss: 0.2325 - val_accuracy: 0.9520 - val_loss: 0.2136
Epoch 4/10
[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 28ms/step - accuracy: 0.9561 - loss: 0.2111 - val_accuracy: 0.9552 - val_loss: 0.2078
Epoch 5/10
[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 30ms/step - accuracy: 0.9635 - loss: 0.1976 - val_accuracy: 0.9542 - val_loss: 0.2055
Epoch 6/10
[1m605/605[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 30ms/step - accuracy: 0.9704 - loss: 0.1836 - val_accuracy: 0.9558 - val_loss: 0.2053
Epoch 7/10
[1m6

### Convolutional Neural Network + Custom-trained Word2Vec Embeddings + 5-Fold Cross Validation + L2 Regularization

#### Why do we use word embedding over other preprocessing techniques (eg. tf-idf, count vectorizer), for our task of fake news classification?


##### 1. Word embeddings capture the semantic relationships between words in a dense, low-dimensional space.
Fake news often uses subtle language, and word embeddings like GloVe can capture the semantic context of words, allowing the model to understand relationships between words that simple vectorizers would miss. This helps in detecting nuanced differences in language use between real and fake news.

##### 2. Word embeddings produce dense, low-dimensional vectors (e.g., 100-300 dimensions) that capture rich word information.
Pre-trained embeddings are built on large corpora like Wikipedia and news articles, giving our model external knowledge that’s useful for distinguishing between real news and fake news. This boosts the model's ability to generalize on unseen test data from our web scraping.

##### 3. Efficient Representation of Semantics
Words in fake news can appear in different contexts, but with similar underlying meanings (e.g., "hoax" and "lie"). GloVe embeddings represent these similar words in close proximity in the vector space, helping the model recognize fake news patterns more effectively than TF-IDF or Count Vectorizer.

##### 4. Handling Synonyms and Rare Words:
Fake news often uses alternative phrases or rare terminology. Pre-trained embeddings like GloVe can handle these rare words because they’ve seen a broad variety of language during training, making our model more robust against unusual vocabulary choices in fake news.

## Cross Validation 
We use Stratified K-Fold Cross-Validation with n_splits=5 to evaluate the model on different splits of the data. 
For each fold, we store the metrics (accuracy, precision, recall, and F1 score) and then calculate the average metrics across all folds for a robust evaluation.

Cross-validation helps us understand the model’s performance more robustly by testing it on multiple splits of the data. This approach gives a more reliable estimate of model performance and helps reduce the risk of overfitting to any single train-test split.

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2
import random

# Set seeds
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Parameters
max_words = 5000
max_sequence_length = 300
embedding_dim = 100

def create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i < vocab_size:
            if word in word2vec_model.wv:
                embedding_matrix[i] = word2vec_model.wv[word]
            else:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
    return embedding_matrix

def create_cnn_with_l2(vocab_size, embedding_dim, embedding_matrix):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True
    )(input_layer)
    
    x = Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01))(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Implement cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(data['processed_full_content'], data['label']), 1):
    print(f"\nFold {fold}")
    
    # Split data
    train_texts = data['processed_full_content'].iloc[train_idx]
    val_texts = data['processed_full_content'].iloc[val_idx]
    train_labels = data['label'].iloc[train_idx]
    val_labels = data['label'].iloc[val_idx]
    
    # Tokenization
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)
    
    # Create sequences
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length)
    X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length)
    
    # Train Word2Vec on training data only
    train_sentences = [text.split() for text in train_texts]
    word2vec_model = Word2Vec(train_sentences, vector_size=embedding_dim, window=5, min_count=2, workers=4)
    
    # Create embedding matrix
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)
    embedding_matrix = create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim)
    
    # Create and train model
    model = create_cnn_with_l2(vocab_size, embedding_dim, embedding_matrix)
    
    history = model.fit(
        X_train, train_labels,
        epochs=10,
        batch_size=256,
        validation_data=(X_val, val_labels),
        verbose=1
    )
    
    # Evaluate
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(val_labels, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, y_pred, average='binary')
    
    fold_metrics.append({
        'fold': fold,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })
    
    print(f"\nFold {fold} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Calculate and display average metrics
avg_metrics = {
    'accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
    'precision': np.mean([m['precision'] for m in fold_metrics]),
    'recall': np.mean([m['recall'] for m in fold_metrics]),
    'f1': np.mean([m['f1'] for m in fold_metrics])
}

print("\nAverage Metrics Across All Folds:")
print(f"Average Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Average Precision: {avg_metrics['precision']:.4f}")
print(f"Average Recall: {avg_metrics['recall']:.4f}")
print(f"Average F1 Score: {avg_metrics['f1']:.4f}")


Fold 1
Epoch 1/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 66ms/step - accuracy: 0.7846 - loss: 2.1489 - val_accuracy: 0.9476 - val_loss: 0.7870
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m980s[0m 5s/step - accuracy: 0.9481 - loss: 0.6881 - val_accuracy: 0.9602 - val_loss: 0.4520
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 65ms/step - accuracy: 0.9618 - loss: 0.4245 - val_accuracy: 0.9567 - val_loss: 0.3471
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 66ms/step - accuracy: 0.9679 - loss: 0.3096 - val_accuracy: 0.9634 - val_loss: 0.2663
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1009s[0m 5s/step - accuracy: 0.9717 - loss: 0.2421 - val_accuracy: 0.9578 - val_loss: 0.2367
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 66ms/step - accuracy: 0.9729 - loss: 0.2061 - val_accuracy: 0.9624 - val_loss: 0.2044
Epoch 7/1

### Convolutional Neural network + GloVe word embeddings (100D) + 5-Fold Cross Validation + L2 regularization

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Set seeds
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Load GloVe embeddings once (this doesn't cause data leakage)
def load_glove_embeddings(path, embedding_dim=100):
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index

def create_embedding_matrix(word_index, embeddings_index, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i >= vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def create_model(vocab_size, embedding_matrix, max_sequence_length):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,  # Set to False for pre-trained embeddings
        input_length=max_sequence_length
    )(input_layer)

    convs = []
    for kernel_size in [3, 4, 5]:
        conv = Conv1D(
            filters=64,
            kernel_size=kernel_size,
            activation='relu',
            kernel_regularizer=l2(0.01)
        )(embedding_layer)
        pool = GlobalMaxPooling1D()(conv)
        convs.append(pool)

    merged = Concatenate()(convs)
    dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(merged)
    drop = Dropout(0.2)(dense)
    output = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(drop)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def process_fold_data(train_texts, val_texts, tokenizer, max_sequence_length):
    """Process text data for a single fold"""
    # Fit tokenizer on training data only
    tokenizer.fit_on_texts(train_texts)
    
    # Convert texts to sequences
    X_train = tokenizer.texts_to_sequences(train_texts)
    X_val = tokenizer.texts_to_sequences(val_texts)
    
    # Pad sequences
    X_train = pad_sequences(X_train, maxlen=max_sequence_length)
    X_val = pad_sequences(X_val, maxlen=max_sequence_length)
    
    return X_train, X_val, tokenizer

def main():
    # Parameters
    max_sequence_length = 300
    vocab_size = 5000
    embedding_dim = 100
    
    # Load GloVe embeddings
    glove_path = './glove.6B.100d.txt'
    embeddings_index = load_glove_embeddings(glove_path, embedding_dim)
    
    # Prepare for cross-validation
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_metrics = []
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(data['processed_full_content'], data['label']), 1):
        print(f"\nFold {fold}")
        
        # Split data
        train_texts = data['processed_full_content'].iloc[train_idx]
        val_texts = data['processed_full_content'].iloc[val_idx]
        y_train = data['label'].iloc[train_idx].values
        y_val = data['label'].iloc[val_idx].values
        
        # Initialize new tokenizer for each fold
        tokenizer = Tokenizer(num_words=vocab_size)
        
        # Process data for this fold
        X_train, X_val, tokenizer = process_fold_data(
            train_texts, val_texts, tokenizer, max_sequence_length
        )
        
        # Create embedding matrix for this fold's vocabulary
        embedding_matrix = create_embedding_matrix(
            tokenizer.word_index, embeddings_index, vocab_size, embedding_dim
        )
        
        # Compute class weights
        class_weights = compute_class_weight('balanced', 
                                          classes=np.unique(y_train), 
                                          y=y_train)
        class_weights_dict = dict(enumerate(class_weights))
        
        # Create and train model
        model = create_model(vocab_size, embedding_matrix, max_sequence_length)
        
        # Train the model
        history = model.fit(
            X_train, y_train,
            epochs=10,
            batch_size=256,
            validation_data=(X_val, y_val),
            class_weight=class_weights_dict,
            verbose=1
        )
        
        # Evaluate
        y_pred = (model.predict(X_val) > 0.5).astype(int)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
        
        fold_metrics.append({
            'fold': fold,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })
        
        print(f"\nFold {fold} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")
    
    # Calculate and print average metrics
    avg_metrics = {
        'accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
        'precision': np.mean([m['precision'] for m in fold_metrics]),
        'recall': np.mean([m['recall'] for m in fold_metrics]),
        'f1': np.mean([m['f1'] for m in fold_metrics])
    }
    
    print("\nAverage Metrics Across All Folds:")
    for metric, value in avg_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

main()

Loading GloVe embeddings...
Loaded 400000 word vectors.

Fold 1
Epoch 1/10




[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 69ms/step - accuracy: 0.7589 - loss: 2.5455 - val_accuracy: 0.9299 - val_loss: 0.6792
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.9388 - loss: 0.5804 - val_accuracy: 0.9527 - val_loss: 0.3940
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.9520 - loss: 0.3740 - val_accuracy: 0.9565 - val_loss: 0.3081
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.9555 - loss: 0.3049 - val_accuracy: 0.9569 - val_loss: 0.2742
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.9585 - loss: 0.2749 - val_accuracy: 0.9582 - val_loss: 0.2608
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.9584 - loss: 0.2639 - val_accuracy: 0.9589 - val_loss: 0.2562
Epoch 7/10
[1m190/190[0m 



[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 71ms/step - accuracy: 0.7521 - loss: 2.4965 - val_accuracy: 0.9402 - val_loss: 0.6234
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 69ms/step - accuracy: 0.9415 - loss: 0.5493 - val_accuracy: 0.9503 - val_loss: 0.3870
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 69ms/step - accuracy: 0.9545 - loss: 0.3636 - val_accuracy: 0.9620 - val_loss: 0.3019
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 69ms/step - accuracy: 0.9572 - loss: 0.2993 - val_accuracy: 0.9623 - val_loss: 0.2699
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 69ms/step - accuracy: 0.9601 - loss: 0.2696 - val_accuracy: 0.9623 - val_loss: 0.2582
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 68ms/step - accuracy: 0.9626 - loss: 0.2595 - val_accuracy: 0.9639 - val_loss: 0.2520
Epoch 7/10
[1m190/190[0m 



[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.7444 - loss: 2.4655 - val_accuracy: 0.9435 - val_loss: 0.5717
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 71ms/step - accuracy: 0.9403 - loss: 0.5127 - val_accuracy: 0.9549 - val_loss: 0.3501
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 73ms/step - accuracy: 0.9530 - loss: 0.3445 - val_accuracy: 0.9500 - val_loss: 0.2999
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.9553 - loss: 0.2912 - val_accuracy: 0.9584 - val_loss: 0.2651
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.9588 - loss: 0.2718 - val_accuracy: 0.9554 - val_loss: 0.2617
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.9583 - loss: 0.2617 - val_accuracy: 0.9590 - val_loss: 0.2532
Epoch 7/10
[1m190/190[0m 



[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 73ms/step - accuracy: 0.7573 - loss: 2.4107 - val_accuracy: 0.9435 - val_loss: 0.5787
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.9382 - loss: 0.5142 - val_accuracy: 0.9577 - val_loss: 0.3531
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.9507 - loss: 0.3443 - val_accuracy: 0.9606 - val_loss: 0.2867
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.9577 - loss: 0.2864 - val_accuracy: 0.9605 - val_loss: 0.2644
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.9587 - loss: 0.2680 - val_accuracy: 0.9610 - val_loss: 0.2544
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.9594 - loss: 0.2582 - val_accuracy: 0.9610 - val_loss: 0.2508
Epoch 7/10
[1m190/190[0m 



[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 72ms/step - accuracy: 0.7690 - loss: 2.3907 - val_accuracy: 0.9413 - val_loss: 0.5437
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 73ms/step - accuracy: 0.9401 - loss: 0.4869 - val_accuracy: 0.9531 - val_loss: 0.3433
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.9502 - loss: 0.3382 - val_accuracy: 0.9592 - val_loss: 0.2848
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step - accuracy: 0.9541 - loss: 0.2904 - val_accuracy: 0.9555 - val_loss: 0.2709
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 73ms/step - accuracy: 0.9563 - loss: 0.2730 - val_accuracy: 0.9604 - val_loss: 0.2559
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.9574 - loss: 0.2620 - val_accuracy: 0.9609 - val_loss: 0.2531
Epoch 7/10
[1m190/190[0m 

### Convolutional Neural network + Custom-trained word2vec word embeddings + 5-Fold Cross Validation + L2 Regularization + GridSearchCV


In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2

# Set seeds for reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

def train_word2vec_and_create_embeddings(train_texts, word_index, max_words, embedding_dim=100):
    """Train Word2Vec on training data only and create embedding matrix"""
    # Train Word2Vec on training data only
    train_sentences = [text.split() for text in train_texts]
    word2vec_model = Word2Vec(sentences=train_sentences, 
                            vector_size=embedding_dim, 
                            window=5, 
                            min_count=2, 
                            workers=4)
    
    # Create embedding matrix with correct dimensions
    vocab_size = min(max_words, len(word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word, i in word_index.items():
        if i < vocab_size:  # Only include words within max_words limit
            if word in word2vec_model.wv:
                embedding_matrix[i] = word2vec_model.wv[word]
            else:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
            
    return embedding_matrix

def create_model(max_sequence_length, vocab_size, embedding_dim, embedding_matrix, 
                filters, dropout_rate):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True
    )(input_layer)

    x = Conv1D(
        filters=filters,
        kernel_size=5,
        activation='relu',
        kernel_regularizer=l2(0.01)
    )(embedding_layer)
    
    x = GlobalMaxPooling1D()(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(dropout_rate)(x)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def main():
    # Define parameter grid
    param_grid = {
        'filters': [64, 128],
        'dropout_rate': [0.2, 0.3, 0.4, 0.5]
    }

    # Initialize variables to track results
    results = []
    best_score = 0
    best_params = None

    # Constants
    max_words = 10000
    max_sequence_length = 300
    embedding_dim = 100

    # Perform grid search with cross-validation
    for filters in param_grid['filters']:
        for dropout_rate in param_grid['dropout_rate']:
            print(f"\nTesting filters={filters}, dropout_rate={dropout_rate}")
            
            # Initialize cross-validation
            kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            fold_scores = []
            
            # Perform k-fold cross-validation
            for fold, (train_idx, val_idx) in enumerate(kfold.split(data['processed_full_content'], data['label']), 1):
                print(f"\nFold {fold}")
                
                # Split data
                train_texts = data['processed_full_content'].iloc[train_idx]
                val_texts = data['processed_full_content'].iloc[val_idx]
                y_train = data['label'].iloc[train_idx]
                y_val = data['label'].iloc[val_idx]
                
                # Fit tokenizer on training data only
                tokenizer = Tokenizer(num_words=max_words)
                tokenizer.fit_on_texts(train_texts)
                
                # Convert texts to sequences
                X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), 
                                      maxlen=max_sequence_length)
                X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), 
                                    maxlen=max_sequence_length)
                
                # Get vocab size for this fold
                vocab_size = min(max_words, len(tokenizer.word_index) + 1)
                
                # Create embedding matrix using training data only
                embedding_matrix = train_word2vec_and_create_embeddings(
                    train_texts, 
                    tokenizer.word_index,
                    max_words,
                    embedding_dim
                )
                
                # Create and train model
                model = create_model(
                    max_sequence_length=max_sequence_length,
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    embedding_matrix=embedding_matrix,
                    filters=filters,
                    dropout_rate=dropout_rate
                )
                
                # Train model
                history = model.fit(
                    X_train, y_train,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_val, y_val),
                    verbose=1
                )
                
                # Evaluate using F1-score
                y_pred = (model.predict(X_val) > 0.5).astype(int)
                fold_score = f1_score(y_val, y_pred)
                fold_scores.append(fold_score)
                
                print(f"Fold {fold} F1-score: {fold_score:.4f}")
            
            # Calculate average score for this parameter combination
            avg_score = np.mean(fold_scores)
            print(f"Average F1-score: {avg_score:.4f}")
            
            # Store results
            results.append({
                'filters': filters,
                'dropout_rate': dropout_rate,
                'avg_f1_score': avg_score,
                'fold_scores': fold_scores
            })
            
            # Update best parameters if necessary
            if avg_score > best_score:
                best_score = avg_score
                best_params = {'filters': filters, 'dropout_rate': dropout_rate}

    # Print final results
    print("\nGrid Search Results:")
    for result in results:
        print(f"Filters: {result['filters']}, Dropout: {result['dropout_rate']}, "
              f"F1-score: {result['avg_f1_score']:.4f}")

    print("\nBest Parameters:")
    print(f"Filters: {best_params['filters']}")
    print(f"Dropout Rate: {best_params['dropout_rate']}")
    print(f"Best F1-Score: {best_score:.4f}")

    # Save results to DataFrame for easy analysis
    import pandas as pd
    results_df = pd.DataFrame(results)
    print("\nResults Summary:")
    print(results_df.sort_values('avg_f1_score', ascending=False))

if __name__ == "__main__":
    main()


Testing filters=64, dropout_rate=0.2

Fold 1
Epoch 1/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.8743 - loss: 1.1206 - val_accuracy: 0.9584 - val_loss: 0.3045
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.9613 - loss: 0.2693 - val_accuracy: 0.9654 - val_loss: 0.1955
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.9684 - loss: 0.1877 - val_accuracy: 0.9662 - val_loss: 0.1728
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.9720 - loss: 0.1653 - val_accuracy: 0.9679 - val_loss: 0.1634
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.9742 - loss: 0.1523 - val_accuracy: 0.9687 - val_loss: 0.1567
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 19ms/step - accuracy: 0.9782 - loss: 0.1398 - val_accura