In [1]:
import os
import sys
from pathlib import Path

# Resolve the project root (the parent of the current directory) only once per
# kernel: the `locals()` guard keeps a re-run of this cell from climbing one
# more directory level up.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
# Guard the append so repeated runs of this cell do not stack duplicate
# entries on sys.path.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [2]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk

# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer

# Data Preparation (Loading CSV)

Load the preprocessed CSV file into a pandas DataFrame:
- `processed_data_filtered.csv` is loaded into the `data` DataFrame (stemming has already been applied to the text to reduce processing time).

In [3]:
# Read the preprocessed articles; the relative path resolves against the
# working directory selected with os.chdir in the first cell.
data = pd.read_csv("./processed_data_filtered.csv")

In [4]:
# Class balance check: number of rows per `label` value.
data["label"].value_counts()

label
0    34030
1    26461
Name: count, dtype: int64

In [5]:
# Column dtypes, non-null counts and memory usage, followed by the (rows, columns) shape.
data.info()
print("Dataframe Shape:", data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
 3   word_count              60491 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.8+ MB
Dataframe Shape: (60491, 4)


### Convolutional Neural Network + Custom-Trained Word2Vec Word Embeddings + 5-Fold Cross-Validation + L2 Regularization + GridSearchCV


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Embedding,
    Conv1D,
    GlobalMaxPooling1D,
    Dense,
    Dropout,
    Input,
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from gensim.models import Word2Vec
from tensorflow.keras.regularizers import l2

# Pin the global NumPy and TensorFlow random generators to one shared seed so
# that repeated runs start from the same random state.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)


def train_word2vec_and_create_embeddings(
    train_texts, word_index, max_words, embedding_dim=100
):
    """Fit Word2Vec on the training texts and build an embedding weight matrix.

    Args:
        train_texts: Iterable of strings; each one is split on whitespace to
            form one Word2Vec training sentence.
        word_index: Mapping from word to integer row index.
        max_words: Upper bound on the number of rows in the returned matrix.
        embedding_dim: Length of every word vector.

    Returns:
        Array of shape ``(min(max_words, len(word_index) + 1), embedding_dim)``.
        Rows of words known to Word2Vec hold the learned vector, rows of other
        in-range words hold a standard-normal draw, and rows that no word maps
        to stay at zero.
    """
    # NOTE(review): word_index presumably comes from a Keras Tokenizer, which
    # by default lower-cases and strips punctuation, while Word2Vec here sees
    # plain whitespace tokens - confirm the texts are already normalized.
    tokenized_corpus = [document.split() for document in train_texts]
    keyed_vectors = Word2Vec(
        sentences=tokenized_corpus,
        vector_size=embedding_dim,
        window=5,
        min_count=2,
        workers=4,
    ).wv

    n_rows = min(max_words, len(word_index) + 1)
    weights = np.zeros((n_rows, embedding_dim))

    for token, row in word_index.items():
        if row >= n_rows:
            # Index falls outside the max_words budget: no row to fill
            continue
        if token in keyed_vectors:
            weights[row] = keyed_vectors[token]
        else:
            # Word unknown to Word2Vec (e.g. below min_count): random vector
            weights[row] = np.random.normal(size=(embedding_dim,))

    return weights


def create_model(
    max_sequence_length,
    vocab_size,
    embedding_dim,
    embedding_matrix,
    filters,
    dropout_rate,
    kernel_size=5,
    dense_units=64,
    l2_strength=0.01,
):
    """Build and compile the 1-D CNN binary text classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``, trainable)
    -> Conv1D -> GlobalMaxPooling1D -> Dense(relu) -> Dropout -> Dense(sigmoid).
    The convolution and both dense layers carry an L2 kernel regularizer.

    Args:
        max_sequence_length: Length of the padded integer input sequences.
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        embedding_matrix: Initial embedding weights, shape
            ``(vocab_size, embedding_dim)``.
        filters: Number of convolution filters.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        kernel_size: Convolution window size (default 5, the former constant).
        dense_units: Width of the hidden dense layer (default 64).
        l2_strength: L2 regularization factor for all kernels (default 0.01).

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, binary cross-entropy loss,
        accuracy metric).

    Raises:
        ValueError: If ``embedding_matrix`` does not have the shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a readable message instead of a weight-assignment error
    # raised from inside the Embedding layer.
    matrix_shape = tuple(np.shape(embedding_matrix))
    if matrix_shape != (vocab_size, embedding_dim):
        raise ValueError(
            f"embedding_matrix has shape {matrix_shape}, "
            f"expected {(vocab_size, embedding_dim)}"
        )

    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )(input_layer)

    x = Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation="relu",
        kernel_regularizer=l2(l2_strength),
    )(embedding_layer)

    # Keep the strongest activation of every filter across the sequence
    x = GlobalMaxPooling1D()(x)
    x = Dense(dense_units, activation="relu", kernel_regularizer=l2(l2_strength))(x)
    x = Dropout(dropout_rate)(x)
    output_layer = Dense(
        1, activation="sigmoid", kernel_regularizer=l2(l2_strength)
    )(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


def main():
    """Cross-validate the CNN with 5 stratified folds and print averaged metrics.

    Reads the module-level ``data`` DataFrame (columns
    ``processed_full_content`` and ``label``). For every fold the tokenizer and
    the Word2Vec embeddings are fitted on the training partition only, a fresh
    model is trained for 10 epochs, and accuracy, precision, recall and F1 are
    computed on the validation partition. Nothing is returned; per-fold F1 and
    the averaged metrics are printed.
    """
    filters = 64
    dropout_rate = 0.3

    # Summary rows; a single hyper-parameter setting is evaluated, so one entry
    results = []

    # Constants
    max_words = 10000
    max_sequence_length = 300
    embedding_dim = 100

    print(f"\nTesting filters={filters}, dropout_rate={dropout_rate}")

    # Initialize cross-validation
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(
        kfold.split(data["processed_full_content"], data["label"]), 1
    ):
        print(f"\nFold {fold}")

        # Split data
        train_texts = data["processed_full_content"].iloc[train_idx]
        val_texts = data["processed_full_content"].iloc[val_idx]
        y_train = data["label"].iloc[train_idx]
        y_val = data["label"].iloc[val_idx]

        # Fit tokenizer on training data only
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(train_texts)

        # Convert texts to sequences
        X_train = pad_sequences(
            tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length
        )
        X_val = pad_sequences(
            tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
        )

        # Get vocab size for this fold
        vocab_size = min(max_words, len(tokenizer.word_index) + 1)

        # Create embedding matrix using training data only
        embedding_matrix = train_word2vec_and_create_embeddings(
            train_texts, tokenizer.word_index, max_words, embedding_dim
        )

        # Create and train model
        model = create_model(
            max_sequence_length=max_sequence_length,
            vocab_size=vocab_size,
            embedding_dim=embedding_dim,
            embedding_matrix=embedding_matrix,
            filters=filters,
            dropout_rate=dropout_rate,
        )

        # Train model (the returned History object is not needed here)
        model.fit(
            X_train,
            y_train,
            epochs=10,
            batch_size=64,
            validation_data=(X_val, y_val),
            verbose=1,
        )

        # Threshold the sigmoid output at 0.5 and flatten the (n, 1) column so
        # the metric functions receive a 1-D label vector
        y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

        print(f"Fold {fold} F1-score: {f1:.4f}")

    # Calculate average score for this parameter combination
    avg_score = np.mean(f1_scores)
    print(f"Average F1-score: {avg_score:.4f}")

    # Store results
    results.append(
        {
            "filters": filters,
            "dropout_rate": dropout_rate,
            "avg_accuracy": np.mean(accuracy_scores),
            "avg_precision": np.mean(precision_scores),
            "avg_recall": np.mean(recall_scores),
            "avg_f1_score": avg_score,
        }
    )

    print("\nResults:")
    for result in results:
        for key, value in result.items():
            print(f"{key}: {value}")


# Entry-point guard: the cross-validation starts only when this code runs as
# the main module (which is the case when the cell is executed in the notebook).
if __name__ == "__main__":
    main()


Testing filters=64, dropout_rate=0.3

Fold 1
Epoch 1/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.8731 - loss: 1.1074 - val_accuracy: 0.9588 - val_loss: 0.2836
Epoch 2/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 24ms/step - accuracy: 0.9606 - loss: 0.2565 - val_accuracy: 0.9589 - val_loss: 0.2063
Epoch 3/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 25ms/step - accuracy: 0.9669 - loss: 0.1882 - val_accuracy: 0.9672 - val_loss: 0.1722
Epoch 4/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.9727 - loss: 0.1656 - val_accuracy: 0.9658 - val_loss: 0.1681
Epoch 5/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step - accuracy: 0.9762 - loss: 0.1527 - val_accuracy: 0.9702 - val_loss: 0.1581
Epoch 6/10
[1m757/757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 40ms/step - accuracy: 0.9790 - loss: 0.1419 - val_accura

In [7]:
def train_model(data, epochs=10, batch_size=64):
    """Train the CNN on one fixed stratified split of ``data``.

    The split is the first fold of a shuffled 10-fold ``StratifiedKFold``
    (``random_state=42``), i.e. roughly 90% training and 10% validation rows;
    the remaining folds are never used. The tokenizer and the Word2Vec
    embeddings are fitted on the training partition only. The validation
    partition is passed to ``fit`` as ``validation_data`` and is also the set
    on which the printed metrics are computed.

    Args:
        data: DataFrame with ``processed_full_content`` and ``label`` columns.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used for training.

    Returns:
        Tuple ``(model, train_idx, val_idx)``: the fitted Keras model and the
        positional row indices of the training and validation partitions.
    """
    # Set seeds for reproducibility
    seed = 42
    tf.random.set_seed(seed)
    np.random.seed(seed)
    filters = 64
    dropout_rate = 0.3

    # Constants
    max_words = 10000
    max_sequence_length = 300
    embedding_dim = 100

    print(f"\nTesting filters={filters}, dropout_rate={dropout_rate}")

    # Take only the first fold of the 10-fold splitter as a single hold-out split
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    train_idx, val_idx = next(
        kfold.split(data["processed_full_content"], data["label"])
    )
    fold = 1
    print(f"\nFold {fold}")

    # Split data
    train_texts = data["processed_full_content"].iloc[train_idx]
    val_texts = data["processed_full_content"].iloc[val_idx]
    y_train = data["label"].iloc[train_idx]
    y_val = data["label"].iloc[val_idx]

    # Fit tokenizer on training data only
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    # Convert texts to sequences
    X_train = pad_sequences(
        tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length
    )
    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )

    # Get vocab size for this split
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)

    # Create embedding matrix using training data only
    embedding_matrix = train_word2vec_and_create_embeddings(
        train_texts, tokenizer.word_index, max_words, embedding_dim
    )

    # Create and train model
    model = create_model(
        max_sequence_length=max_sequence_length,
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix,
        filters=filters,
        dropout_rate=dropout_rate,
    )

    # Train model (the returned History object is not needed here)
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=1,
    )

    # Threshold the sigmoid output at 0.5 and flatten the (n, 1) column so the
    # metric functions receive a 1-D label vector
    y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Fold {fold} F1-score: {f1:.4f}")

    # With a single split the "average" equals that split's F1-score
    avg_score = np.mean([f1])
    print(f"Average F1-score: {avg_score:.4f}")

    # Store results
    result = {
        "filters": filters,
        "dropout_rate": dropout_rate,
        "accuracy": np.mean([accuracy]),
        "precision": np.mean([precision]),
        "recall": np.mean([recall]),
        "f1_score": avg_score,
    }

    print("\nResult:")
    for key, value in result.items():
        print(f"\t{key}: {value}")

    return model, train_idx, val_idx

In [8]:
# Train on the fixed split and keep the fitted model plus the row positions of
# both partitions for the cells below.
model, train_idx, val_idx = train_model(data, epochs=10, batch_size=64)
# Echo the returned objects as the cell output.
model, train_idx, val_idx


Testing filters=64, dropout_rate=0.3

Fold 1
Epoch 1/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.8776 - loss: 1.0792 - val_accuracy: 0.9567 - val_loss: 0.2590
Epoch 2/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.9640 - loss: 0.2318 - val_accuracy: 0.9671 - val_loss: 0.1789
Epoch 3/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 21ms/step - accuracy: 0.9696 - loss: 0.1727 - val_accuracy: 0.9707 - val_loss: 0.1590
Epoch 4/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 21ms/step - accuracy: 0.9731 - loss: 0.1581 - val_accuracy: 0.9722 - val_loss: 0.1526
Epoch 5/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 21ms/step - accuracy: 0.9771 - loss: 0.1457 - val_accuracy: 0.9752 - val_loss: 0.1457
Epoch 6/10
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9795 - loss: 0.1345 - val_accura

(<Functional name=functional_5, built=True>,
 array([    0,     1,     2, ..., 60488, 60489, 60490]),
 array([    9,    27,    51, ..., 60451, 60464, 60482]))

In [9]:
# Number of rows in the held-out validation partition.
len(val_idx)

6050

In [10]:
# Held-out rows reduced to the `label` and `full_content` columns. The copy is
# taken after the column selection so `test_df` is a fully independent frame;
# copying before the selection leaves the final subset flagged as a derived
# slice, which can trigger SettingWithCopyWarning on later column assignments.
test_df = data.iloc[val_idx][["label", "full_content"]].copy()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6050 entries, 9 to 60482
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         6050 non-null   int64 
 1   full_content  6050 non-null   object
dtypes: int64(1), object(1)
memory usage: 141.8+ KB


In [11]:
# Create the output folder first so saving also works on a fresh checkout
# where `results/` does not exist yet.
Path("results").mkdir(parents=True, exist_ok=True)
model.save("results/CNN_model.keras")

In [12]:
# load model
from tensorflow.keras.models import load_model

# Reload the saved archive into a second object; it is evaluated further down
# next to the in-memory model to check the save/load round trip.
model2 = load_model("results/CNN_model.keras")
model2.summary()

In [13]:
import json

# Persist the split so the exact train/validation rows can be recovered later.
# The index arrays are converted to plain lists because JSON cannot encode
# NumPy arrays.
json_obj = {
    "train_idx": train_idx.tolist(),
    "val_idx": val_idx.tolist(),
}
# Context manager: the handle is flushed and closed as soon as the dump is
# done instead of being left open until garbage collection.
with open("results/train_val_idx.json", "w") as idx_file:
    json.dump(json_obj, idx_file, indent=4)

In [14]:
def evaluate_model(model, data, val_idx, max_words=10000, max_sequence_length=300):
    """Score ``model`` on the validation rows of ``data`` and print the metrics.

    The tokenizer is not stored with the model, so it is rebuilt here by
    fitting on every row that is NOT in ``val_idx``. ``max_words`` and
    ``max_sequence_length`` must therefore match the values the model was
    trained with; the defaults are the values hard-coded in ``train_model``.

    Args:
        model: Fitted Keras model exposing ``predict``.
        data: DataFrame with ``processed_full_content`` and ``label`` columns.
        val_idx: Positional indices of the validation rows.
        max_words: ``num_words`` limit passed to the tokenizer.
        max_sequence_length: Length every sequence is padded/truncated to.

    Returns:
        None. Accuracy, precision, recall and F1 are printed.
    """
    # Training rows are the complement of the validation rows
    train_idx = np.setdiff1d(np.arange(len(data)), val_idx)
    train_texts = data["processed_full_content"].iloc[train_idx]
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    y_val = data["label"].iloc[val_idx]
    val_texts = data["processed_full_content"].iloc[val_idx]

    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )
    # Threshold the sigmoid output at 0.5 and flatten the (n, 1) column so the
    # metric functions receive a 1-D label vector
    y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [15]:
# Metrics of the in-memory model on the held-out rows.
evaluate_model(model, data, val_idx)

[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Accuracy: 0.9762
Precision: 0.9750
Recall: 0.9705
F1: 0.9727


In [16]:
# Same evaluation for the model reloaded from disk.
evaluate_model(model2, data, val_idx)

[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Accuracy: 0.9762
Precision: 0.9750
Recall: 0.9705
F1: 0.9727
