In [4]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [5]:
# !pip3 install -r ../requirements.txt

In [6]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


# Data Preparation (Loading CSV)

Load the processed_data `csv` file into pandas DataFrames
- `processed_data.csv` is loaded into `data` DataFrame (stemming has been performed to reduce processing time.)

In [7]:
data = pd.read_csv('./processed_data.csv')

In [8]:
data['label'].value_counts()

label
0    34770
1    28162
Name: count, dtype: int64

In [9]:
data.info()
print("Dataframe Shape:", data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62932 entries, 0 to 62931
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   62932 non-null  int64 
 1   full_content            62932 non-null  object
 2   processed_full_content  62932 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
Dataframe Shape: (62932, 3)


In [10]:
# # Ensure required NLTK data is downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('all')

### Basic Convolutional Neural Network (Tokenizer + Embedding Layer) + 5 Fold Cross-Validation + L2 Regularization

In [11]:
import tensorflow as tf
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Tokenization and Padding Parameters
max_words = 10000  # Max vocabulary size
max_sequence_length = 300  # Max length of sequences

# Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Define the CNN Model with L2 Regularization
def create_basic_cnn():
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.2))  # Add dropout for regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # Binary classification

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
all_fold_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

for train_index, val_index in kf.split(X, y):
    print(f"\nTraining fold {fold}...")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    model = create_basic_cnn()
    history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=1)
    
    # Predict and evaluate
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    accuracy = accuracy_score(y_val, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
    
    # Store metrics for this fold
    all_fold_metrics['accuracy'].append(accuracy)
    all_fold_metrics['precision'].append(precision)
    all_fold_metrics['recall'].append(recall)
    all_fold_metrics['f1'].append(f1)
    
    print(f"Fold {fold} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    fold += 1

# Calculate and print average metrics across all folds
avg_accuracy = np.mean(all_fold_metrics['accuracy'])
avg_precision = np.mean(all_fold_metrics['precision'])
avg_recall = np.mean(all_fold_metrics['recall'])
avg_f1 = np.mean(all_fold_metrics['f1'])

print("\nAverage Evaluation Metrics across 5 folds:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")


Training fold 1...
Epoch 1/10




[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 19ms/step - accuracy: 0.8411 - loss: 0.7438 - val_accuracy: 0.9598 - val_loss: 0.2297
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.9624 - loss: 0.2133 - val_accuracy: 0.9651 - val_loss: 0.2023
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.9747 - loss: 0.1804 - val_accuracy: 0.9641 - val_loss: 0.1930
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 21ms/step - accuracy: 0.9808 - loss: 0.1604 - val_accuracy: 0.9627 - val_loss: 0.1895
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step - accuracy: 0.9859 - loss: 0.1461 - val_accuracy: 0.9591 - val_loss: 0.1898
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9879 - loss: 0.1360 - val_accuracy: 0.9583 - val_loss: 0.1890
Epoch 7/10
[1m787/787[0m 



[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.8355 - loss: 0.7390 - val_accuracy: 0.9523 - val_loss: 0.2346
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9589 - loss: 0.2173 - val_accuracy: 0.9595 - val_loss: 0.2092
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 25ms/step - accuracy: 0.9727 - loss: 0.1834 - val_accuracy: 0.9615 - val_loss: 0.1936
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.9789 - loss: 0.1633 - val_accuracy: 0.9626 - val_loss: 0.1884
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9838 - loss: 0.1488 - val_accuracy: 0.9615 - val_loss: 0.1837
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 25ms/step - accuracy: 0.9866 - loss: 0.1379 - val_accuracy: 0.9634 - val_loss: 0.1788
Epoch 7/10
[1m787/787[0m 



[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.8437 - loss: 0.7434 - val_accuracy: 0.9571 - val_loss: 0.2248
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.9630 - loss: 0.2126 - val_accuracy: 0.9612 - val_loss: 0.2031
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 25ms/step - accuracy: 0.9740 - loss: 0.1804 - val_accuracy: 0.9627 - val_loss: 0.1946
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 25ms/step - accuracy: 0.9805 - loss: 0.1613 - val_accuracy: 0.9641 - val_loss: 0.1873
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.9847 - loss: 0.1468 - val_accuracy: 0.9640 - val_loss: 0.1838
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.9875 - loss: 0.1369 - val_accuracy: 0.9647 - val_loss: 0.1792
Epoch 7/10
[1m787/787[0m 



[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.8385 - loss: 0.7469 - val_accuracy: 0.9604 - val_loss: 0.2228
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9614 - loss: 0.2154 - val_accuracy: 0.9647 - val_loss: 0.1988
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9721 - loss: 0.1817 - val_accuracy: 0.9649 - val_loss: 0.1910
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9795 - loss: 0.1612 - val_accuracy: 0.9638 - val_loss: 0.1849
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9844 - loss: 0.1472 - val_accuracy: 0.9646 - val_loss: 0.1809
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9877 - loss: 0.1360 - val_accuracy: 0.9650 - val_loss: 0.1763
Epoch 7/10
[1m787/787[0m 



[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.8481 - loss: 0.7581 - val_accuracy: 0.9600 - val_loss: 0.2264
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - accuracy: 0.9645 - loss: 0.2144 - val_accuracy: 0.9618 - val_loss: 0.2055
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - accuracy: 0.9750 - loss: 0.1806 - val_accuracy: 0.9620 - val_loss: 0.1975
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.9814 - loss: 0.1608 - val_accuracy: 0.9619 - val_loss: 0.1920
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9853 - loss: 0.1465 - val_accuracy: 0.9633 - val_loss: 0.1845
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9890 - loss: 0.1355 - val_accuracy: 0.9644 - val_loss: 0.1816
Epoch 7/10
[1m787/787[0m 

### Convolutional Neural Network + TF-IDF Vectorizer

Using TF-IDF vectorizer along with CNN led to a drastic fall in performance. Below are some reasons why we should not use TF-IDF vectorizer along with a CNN or other neural networks.

#### Lack of Spatial Structure:

TF-IDF vectors are sparse and non-sequential representations where each position in the vector represents a word, not a spatial pattern.
CNNs are designed to detect patterns in sequential or spatially structured data (e.g., images or sentences), so they might struggle to find meaningful patterns in TF-IDF vectors.

#### High-Dimensional Sparse Data:

TF-IDF vectors, especially with a high max_features value (like 10,000), result in a high-dimensional but sparse input.
CNNs are generally not well-suited for such high-dimensional sparse data; they perform better with dense embeddings where words have contextually meaningful dimensions.

#### Mismatch Between Input Type and CNN Architecture:

CNNs are typically effective when applied to word embeddings (like GloVe or Word2Vec) because embeddings maintain semantic relationships and neighborhood structures.
TF-IDF, however, does not capture word order or semantic relationships, which means the convolution operation might not yield meaningful feature maps.


In [12]:
import tensorflow as tf
import numpy as np
import random

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Reshape, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Apply TF-IDF Vectorization
max_features = 10000  # Limit TF-IDF to top 10,000 features
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
X_tfidf = tfidf_vectorizer.fit_transform(data['processed_full_content']).toarray()

# Convert the labels
y = data['label'].values  # Target labels

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Step 3: Define the CNN Model for TF-IDF Input
def create_cnn_with_tfidf():
    inputs = Input(shape=(max_features,))
    x = Reshape((max_features, 1))(inputs)  # Reshape TF-IDF output to be compatible with Conv1D

    # Convolutional layer
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    
    # Fully connected layer
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)  # Dropout for regularization
    outputs = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Train the Model
model = create_cnn_with_tfidf()
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Step 5: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Epoch 1/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 78ms/step - accuracy: 0.5434 - loss: 0.6882 - val_accuracy: 0.5696 - val_loss: 0.6805
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 78ms/step - accuracy: 0.5523 - loss: 0.6852 - val_accuracy: 0.5800 - val_loss: 0.6777
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 78ms/step - accuracy: 0.5543 - loss: 0.6822 - val_accuracy: 0.5769 - val_loss: 0.6762
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 77ms/step - accuracy: 0.5575 - loss: 0.6809 - val_accuracy: 0.5761 - val_loss: 0.6762
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 78ms/step - accuracy: 0.5582 - loss: 0.6803 - val_accuracy: 0.5769 - val_loss: 0.6746
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 77ms/step - accuracy: 0.5587 - loss: 0.6803 - val_accuracy: 0.5798 - val_loss: 0.6744
Epoch 7/10
[1m7

### Convolutional Neural Networks + Count Vectorization (conversion to sequences) + Stratified 5-Fold CV + L2 Regularization

In [13]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.regularizers import l2

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Step 1: Text Vectorization using CountVectorizer
max_features = 10000  # Max vocabulary size for CountVectorizer
vectorizer = CountVectorizer(max_features=max_features)
X_counts = vectorizer.fit_transform(data['processed_full_content'])
word_index = vectorizer.vocabulary_

# Convert CountVectorizer output to sequences
index_to_word = {i: word for word, i in word_index.items()}

def counts_to_sequences(X_counts):
    sequences = []
    for i in range(X_counts.shape[0]):
        indices = X_counts[i].nonzero()[1]
        words = [index_to_word[idx] for idx in indices]
        seq = [word_index[word] + 1 for word in words]  # +1 because 0 is reserved for padding
        sequences.append(seq)
    return sequences

sequences = counts_to_sequences(X_counts)
max_sequence_length = 300  # Adjust to your needs
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Define the Basic CNN Model with L2 Regularization
def create_basic_cnn_with_l2():
    model = Sequential()
    
    # Embedding layer with random initialization
    model.add(Embedding(input_dim=max_features + 1, output_dim=128))
    
    # Convolutional layer with L2 regularization
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(GlobalMaxPooling1D())
    
    # Fully connected layer with L2 regularization
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.5))  # Add dropout for regularization
    
    # Output layer with L2 regularization
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # Binary classification
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Stratified 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = create_basic_cnn_with_l2()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)
    
    # Evaluate the model
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Step 5: Print Cross-Validation Results
print("\nCross-Validation Metrics:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")


Epoch 1/10
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.7907 - loss: 0.8565 - val_accuracy: 0.9303 - val_loss: 0.2637
Epoch 2/10
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.9325 - loss: 0.2664 - val_accuracy: 0.9450 - val_loss: 0.2344
Epoch 3/10
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.9484 - loss: 0.2316 - val_accuracy: 0.9483 - val_loss: 0.2259
Epoch 4/10
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 26ms/step - accuracy: 0.9608 - loss: 0.2088 - val_accuracy: 0.9514 - val_loss: 0.2198
Epoch 5/10
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.9676 - loss: 0.1931 - val_accuracy: 0.9533 - val_loss: 0.2171
Epoch 6/10
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.9726 - loss: 0.1792 - val_accuracy: 0.9554 - val_loss: 0.2114
Epoch 7/10
[1m6

### Convolutional Neural Network + Custom-trained Word2Vec Embeddings + 5-Fold Cross Validation + L2 Regularization

#### Why do we use word embedding over other preprocessing techniques (eg. tf-idf, count vectorizer), for our task of fake news classification?


##### 1. Word embeddings capture the semantic relationships between words in a dense, low-dimensional space.
Fake news often uses subtle language, and word embeddings like GloVe can capture the semantic context of words, allowing the model to understand relationships between words that simple vectorizers would miss. This helps in detecting nuanced differences in language use between real and fake news.

##### 2. Word embeddings produce dense, low-dimensional vectors (e.g., 100-300 dimensions) that capture rich word information.
Pre-trained embeddings are built on large corpora like Wikipedia and news articles, giving our model external knowledge that’s useful for distinguishing between real news and fake news. This boosts the model's ability to generalize on unseen test data from our web scraping.

##### 3. Efficient Representation of Semantics
Words in fake news can appear in different contexts, but with similar underlying meanings (e.g., "hoax" and "lie"). GloVe embeddings represent these similar words in close proximity in the vector space, helping the model recognize fake news patterns more effectively than TF-IDF or Count Vectorizer.

##### 4. Handling Synonyms and Rare Words:
Fake news often uses alternative phrases or rare terminology. Pre-trained embeddings like GloVe can handle these rare words because they’ve seen a broad variety of language during training, making our model more robust against unusual vocabulary choices in fake news.

## Cross Validation 
We use Stratified K-Fold Cross-Validation with n_splits=5 to evaluate the model on different splits of the data. 
For each fold, we store the metrics (accuracy, precision, recall, and F1 score) and then calculate the average metrics across all folds for a robust evaluation.

Cross-validation helps us understand the model’s performance more robustly by testing it on multiple splits of the data. This approach gives a more reliable estimate of model performance and helps reduce the risk of overfitting to any single train-test split.

In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2
import random

# Set seeds
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Parameters
max_words = 5000
max_sequence_length = 300
embedding_dim = 100

def create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i < vocab_size:
            if word in word2vec_model.wv:
                embedding_matrix[i] = word2vec_model.wv[word]
            else:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
    return embedding_matrix

def create_cnn_with_l2(vocab_size, embedding_dim, embedding_matrix):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True
    )(input_layer)
    
    x = Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01))(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Implement cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(data['processed_full_content'], data['label']), 1):
    print(f"\nFold {fold}")
    
    # Split data
    train_texts = data['processed_full_content'].iloc[train_idx]
    val_texts = data['processed_full_content'].iloc[val_idx]
    train_labels = data['label'].iloc[train_idx]
    val_labels = data['label'].iloc[val_idx]
    
    # Tokenization
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)
    
    # Create sequences
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length)
    X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length)
    
    # Train Word2Vec on training data only
    train_sentences = [text.split() for text in train_texts]
    word2vec_model = Word2Vec(train_sentences, vector_size=embedding_dim, window=5, min_count=2, workers=4)
    
    # Create embedding matrix
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)
    embedding_matrix = create_embedding_matrix(word2vec_model, tokenizer, vocab_size, embedding_dim)
    
    # Create and train model
    model = create_cnn_with_l2(vocab_size, embedding_dim, embedding_matrix)
    
    history = model.fit(
        X_train, train_labels,
        epochs=10,
        batch_size=256,
        validation_data=(X_val, val_labels),
        verbose=1
    )
    
    # Evaluate
    y_pred = (model.predict(X_val) > 0.5).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(val_labels, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, y_pred, average='binary')
    
    fold_metrics.append({
        'fold': fold,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })
    
    print(f"\nFold {fold} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Calculate and display average metrics
avg_metrics = {
    'accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
    'precision': np.mean([m['precision'] for m in fold_metrics]),
    'recall': np.mean([m['recall'] for m in fold_metrics]),
    'f1': np.mean([m['f1'] for m in fold_metrics])
}

print("\nAverage Metrics Across All Folds:")
print(f"Average Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Average Precision: {avg_metrics['precision']:.4f}")
print(f"Average Recall: {avg_metrics['recall']:.4f}")
print(f"Average F1 Score: {avg_metrics['f1']:.4f}")


Fold 1
Epoch 1/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 76ms/step - accuracy: 0.8040 - loss: 2.0129 - val_accuracy: 0.9501 - val_loss: 0.6970
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 75ms/step - accuracy: 0.9505 - loss: 0.5997 - val_accuracy: 0.9655 - val_loss: 0.3715
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 76ms/step - accuracy: 0.9633 - loss: 0.3513 - val_accuracy: 0.9683 - val_loss: 0.2660
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 75ms/step - accuracy: 0.9674 - loss: 0.2568 - val_accuracy: 0.9705 - val_loss: 0.2148
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9721 - loss: 0.2070 - val_accuracy: 0.9722 - val_loss: 0.1870
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9755 - loss: 0.1797 - val_accuracy: 0.9715 - val_loss: 0.1752
Epoch 7/

### Convolutional Neural network + GloVe word embeddings (100D) + 5-Fold Cross Validation + L2 regularization

In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Set seeds
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Load GloVe embeddings once (this doesn't cause data leakage)
def load_glove_embeddings(path, embedding_dim=100):
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index

def create_embedding_matrix(word_index, embeddings_index, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i >= vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def create_model(vocab_size, embedding_matrix, max_sequence_length):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,  # Set to False for pre-trained embeddings
        input_length=max_sequence_length
    )(input_layer)

    convs = []
    for kernel_size in [3, 4, 5]:
        conv = Conv1D(
            filters=64,
            kernel_size=kernel_size,
            activation='relu',
            kernel_regularizer=l2(0.01)
        )(embedding_layer)
        pool = GlobalMaxPooling1D()(conv)
        convs.append(pool)

    merged = Concatenate()(convs)
    dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(merged)
    drop = Dropout(0.2)(dense)
    output = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(drop)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def process_fold_data(train_texts, val_texts, tokenizer, max_sequence_length):
    """Process text data for a single fold"""
    # Fit tokenizer on training data only
    tokenizer.fit_on_texts(train_texts)
    
    # Convert texts to sequences
    X_train = tokenizer.texts_to_sequences(train_texts)
    X_val = tokenizer.texts_to_sequences(val_texts)
    
    # Pad sequences
    X_train = pad_sequences(X_train, maxlen=max_sequence_length)
    X_val = pad_sequences(X_val, maxlen=max_sequence_length)
    
    return X_train, X_val, tokenizer

def main():
    # Parameters
    max_sequence_length = 300
    vocab_size = 5000
    embedding_dim = 100
    
    # Load GloVe embeddings
    glove_path = './glove.6B.100d.txt'
    embeddings_index = load_glove_embeddings(glove_path, embedding_dim)
    
    # Prepare for cross-validation
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_metrics = []
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(data['processed_full_content'], data['label']), 1):
        print(f"\nFold {fold}")
        
        # Split data
        train_texts = data['processed_full_content'].iloc[train_idx]
        val_texts = data['processed_full_content'].iloc[val_idx]
        y_train = data['label'].iloc[train_idx].values
        y_val = data['label'].iloc[val_idx].values
        
        # Initialize new tokenizer for each fold
        tokenizer = Tokenizer(num_words=vocab_size)
        
        # Process data for this fold
        X_train, X_val, tokenizer = process_fold_data(
            train_texts, val_texts, tokenizer, max_sequence_length
        )
        
        # Create embedding matrix for this fold's vocabulary
        embedding_matrix = create_embedding_matrix(
            tokenizer.word_index, embeddings_index, vocab_size, embedding_dim
        )
        
        # Compute class weights
        class_weights = compute_class_weight('balanced', 
                                          classes=np.unique(y_train), 
                                          y=y_train)
        class_weights_dict = dict(enumerate(class_weights))
        
        # Create and train model
        model = create_model(vocab_size, embedding_matrix, max_sequence_length)
        
        # Train the model
        history = model.fit(
            X_train, y_train,
            epochs=10,
            batch_size=256,
            validation_data=(X_val, y_val),
            class_weight=class_weights_dict,
            verbose=1
        )
        
        # Evaluate
        y_pred = (model.predict(X_val) > 0.5).astype(int)
        
        # Calculate metrics
        accuracy = accuracy_score(y_val, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='binary')
        
        fold_metrics.append({
            'fold': fold,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })
        
        print(f"\nFold {fold} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")
    
    # Calculate and print average metrics
    avg_metrics = {
        'accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
        'precision': np.mean([m['precision'] for m in fold_metrics]),
        'recall': np.mean([m['recall'] for m in fold_metrics]),
        'f1': np.mean([m['f1'] for m in fold_metrics])
    }
    
    print("\nAverage Metrics Across All Folds:")
    for metric, value in avg_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

main()

Loading GloVe embeddings...
Loaded 400000 word vectors.

Fold 1
Epoch 1/10




[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.7694 - loss: 2.4560 - val_accuracy: 0.9422 - val_loss: 0.6089
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.9420 - loss: 0.5295 - val_accuracy: 0.9546 - val_loss: 0.3673
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.9525 - loss: 0.3485 - val_accuracy: 0.9584 - val_loss: 0.2968
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9548 - loss: 0.2914 - val_accuracy: 0.9581 - val_loss: 0.2729
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.9572 - loss: 0.2721 - val_accuracy: 0.9594 - val_loss: 0.2647
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.9578 - loss: 0.2641 - val_accuracy: 0.9597 - val_loss: 0.2611
Epoch 7/10
[1m197/197[0m 



[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.7632 - loss: 2.4318 - val_accuracy: 0.9383 - val_loss: 0.5954
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - accuracy: 0.9393 - loss: 0.5210 - val_accuracy: 0.9492 - val_loss: 0.3738
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 54ms/step - accuracy: 0.9507 - loss: 0.3518 - val_accuracy: 0.9510 - val_loss: 0.3071
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.9555 - loss: 0.2956 - val_accuracy: 0.9541 - val_loss: 0.2822
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.9564 - loss: 0.2743 - val_accuracy: 0.9513 - val_loss: 0.2774
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.9590 - loss: 0.2639 - val_accuracy: 0.9510 - val_loss: 0.2747
Epoch 7/10
[1m197/197[0m 



[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 55ms/step - accuracy: 0.7281 - loss: 2.4923 - val_accuracy: 0.9386 - val_loss: 0.5944
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - accuracy: 0.9365 - loss: 0.5256 - val_accuracy: 0.9490 - val_loss: 0.3722
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.9500 - loss: 0.3552 - val_accuracy: 0.9518 - val_loss: 0.3046
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9549 - loss: 0.2980 - val_accuracy: 0.9527 - val_loss: 0.2796
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.9564 - loss: 0.2772 - val_accuracy: 0.9533 - val_loss: 0.2701
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.9570 - loss: 0.2672 - val_accuracy: 0.9558 - val_loss: 0.2617
Epoch 7/10
[1m197/197[0m 



[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.7584 - loss: 2.3604 - val_accuracy: 0.9432 - val_loss: 0.5397
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9415 - loss: 0.4803 - val_accuracy: 0.9536 - val_loss: 0.3378
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9533 - loss: 0.3266 - val_accuracy: 0.9564 - val_loss: 0.2826
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.9559 - loss: 0.2827 - val_accuracy: 0.9572 - val_loss: 0.2646
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.9559 - loss: 0.2684 - val_accuracy: 0.9590 - val_loss: 0.2580
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9575 - loss: 0.2624 - val_accuracy: 0.9587 - val_loss: 0.2551
Epoch 7/10
[1m197/197[0m 



[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.7681 - loss: 2.3442 - val_accuracy: 0.9337 - val_loss: 0.5304
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9394 - loss: 0.4653 - val_accuracy: 0.9522 - val_loss: 0.3369
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.9511 - loss: 0.3278 - val_accuracy: 0.9571 - val_loss: 0.2850
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9545 - loss: 0.2860 - val_accuracy: 0.9587 - val_loss: 0.2664
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9571 - loss: 0.2699 - val_accuracy: 0.9592 - val_loss: 0.2594
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.9572 - loss: 0.2618 - val_accuracy: 0.9600 - val_loss: 0.2567
Epoch 7/10
[1m197/197[0m 

### Convolutional Neural network + Custom-trained word2vec word embeddings + 5-Fold Cross Validation + L2 Regularization + GridSearchCV


In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2

# Set seeds for reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

def train_word2vec_and_create_embeddings(train_texts, word_index, max_words, embedding_dim=100):
    """Train Word2Vec on training data only and create embedding matrix"""
    # Train Word2Vec on training data only
    train_sentences = [text.split() for text in train_texts]
    word2vec_model = Word2Vec(sentences=train_sentences, 
                            vector_size=embedding_dim, 
                            window=5, 
                            min_count=2, 
                            workers=4)
    
    # Create embedding matrix with correct dimensions
    vocab_size = min(max_words, len(word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    
    for word, i in word_index.items():
        if i < vocab_size:  # Only include words within max_words limit
            if word in word2vec_model.wv:
                embedding_matrix[i] = word2vec_model.wv[word]
            else:
                embedding_matrix[i] = np.random.normal(size=(embedding_dim,))
            
    return embedding_matrix

def create_model(max_sequence_length, vocab_size, embedding_dim, embedding_matrix, 
                filters, dropout_rate):
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True
    )(input_layer)

    x = Conv1D(
        filters=filters,
        kernel_size=5,
        activation='relu',
        kernel_regularizer=l2(0.01)
    )(embedding_layer)
    
    x = GlobalMaxPooling1D()(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(dropout_rate)(x)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def main():
    # Define parameter grid
    param_grid = {
        'filters': [64, 128],
        'dropout_rate': [0.2, 0.3, 0.4, 0.5]
    }

    # Initialize variables to track results
    results = []
    best_score = 0
    best_params = None

    # Constants
    max_words = 10000
    max_sequence_length = 300
    embedding_dim = 100

    # Perform grid search with cross-validation
    for filters in param_grid['filters']:
        for dropout_rate in param_grid['dropout_rate']:
            print(f"\nTesting filters={filters}, dropout_rate={dropout_rate}")
            
            # Initialize cross-validation
            kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            fold_scores = []
            
            # Perform k-fold cross-validation
            for fold, (train_idx, val_idx) in enumerate(kfold.split(data['processed_full_content'], data['label']), 1):
                print(f"\nFold {fold}")
                
                # Split data
                train_texts = data['processed_full_content'].iloc[train_idx]
                val_texts = data['processed_full_content'].iloc[val_idx]
                y_train = data['label'].iloc[train_idx]
                y_val = data['label'].iloc[val_idx]
                
                # Fit tokenizer on training data only
                tokenizer = Tokenizer(num_words=max_words)
                tokenizer.fit_on_texts(train_texts)
                
                # Convert texts to sequences
                X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), 
                                      maxlen=max_sequence_length)
                X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), 
                                    maxlen=max_sequence_length)
                
                # Get vocab size for this fold
                vocab_size = min(max_words, len(tokenizer.word_index) + 1)
                
                # Create embedding matrix using training data only
                embedding_matrix = train_word2vec_and_create_embeddings(
                    train_texts, 
                    tokenizer.word_index,
                    max_words,
                    embedding_dim
                )
                
                # Create and train model
                model = create_model(
                    max_sequence_length=max_sequence_length,
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    embedding_matrix=embedding_matrix,
                    filters=filters,
                    dropout_rate=dropout_rate
                )
                
                # Train model
                history = model.fit(
                    X_train, y_train,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_val, y_val),
                    verbose=1
                )
                
                # Evaluate using F1-score
                y_pred = (model.predict(X_val) > 0.5).astype(int)
                fold_score = f1_score(y_val, y_pred)
                fold_scores.append(fold_score)
                
                print(f"Fold {fold} F1-score: {fold_score:.4f}")
            
            # Calculate average score for this parameter combination
            avg_score = np.mean(fold_scores)
            print(f"Average F1-score: {avg_score:.4f}")
            
            # Store results
            results.append({
                'filters': filters,
                'dropout_rate': dropout_rate,
                'avg_f1_score': avg_score,
                'fold_scores': fold_scores
            })
            
            # Update best parameters if necessary
            if avg_score > best_score:
                best_score = avg_score
                best_params = {'filters': filters, 'dropout_rate': dropout_rate}

    # Print final results
    print("\nGrid Search Results:")
    for result in results:
        print(f"Filters: {result['filters']}, Dropout: {result['dropout_rate']}, "
              f"F1-score: {result['avg_f1_score']:.4f}")

    print("\nBest Parameters:")
    print(f"Filters: {best_params['filters']}")
    print(f"Dropout Rate: {best_params['dropout_rate']}")
    print(f"Best F1-Score: {best_score:.4f}")

    # Save results to DataFrame for easy analysis
    import pandas as pd
    results_df = pd.DataFrame(results)
    print("\nResults Summary:")
    print(results_df.sort_values('avg_f1_score', ascending=False))

if __name__ == "__main__":
    main()


Testing filters=64, dropout_rate=0.2

Fold 1
Epoch 1/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8760 - loss: 1.0972 - val_accuracy: 0.9591 - val_loss: 0.2982
Epoch 2/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.9594 - loss: 0.2668 - val_accuracy: 0.9677 - val_loss: 0.1964
Epoch 3/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.9680 - loss: 0.1878 - val_accuracy: 0.9695 - val_loss: 0.1770
Epoch 4/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.9720 - loss: 0.1661 - val_accuracy: 0.9706 - val_loss: 0.1670
Epoch 5/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.9752 - loss: 0.1529 - val_accuracy: 0.9702 - val_loss: 0.1603
Epoch 6/10
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.9785 - loss: 0.1420 - val_accura