In [101]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# Applying our best model (CNN + Word2Vec) on the scraped data

In [102]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define function to process text
import string
from nltk.stem import *
from nltk.stem.porter import *

# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk

# Regular expressions for text pattern matching
import re


def process_full_review(text):
    # Convert to lowercase and tokenize
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in string.punctuation]
    stemmer = PorterStemmer()
    # List of stopwords
    stop_words = stopwords.words("english")
    allowed_words = [
        "no",
        "not",
        "don't",
        "dont",
        "don",
        "but",
        "however",
        "never",
        "wasn't",
        "wasnt",
        "shouldn't",
        "shouldnt",
        "mustn't",
        "musnt",
    ]

    stemmed = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words or word in allowed_words
    ]
    return " ".join(stemmed)

In [103]:
from datasets import load_dataset, concatenate_datasets, Dataset

datasets = load_dataset(
    "csv",
    data_files={
        "train": [
            "dataset/train_data_1.csv",
            "dataset/train_data_2.csv",
            "dataset/train_data_3.csv",
            "dataset/train_data_4.csv",
        ],
    },
)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
})

In [104]:
# load model
from tensorflow.keras.models import load_model

model = load_model("results/CNN_model.keras")
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [105]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from tqdm import tqdm

def evaluate_model(model, train_data, val_data, force_reprocess=False):
    # Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
    processed_columns = "processed_full_content"
    if force_reprocess or processed_columns not in train_data.columns:
        # Enable tqdm for pandas (progress bar)
        tqdm.pandas(desc="Processing Train Data")
        
        train_data[processed_columns] = train_data["full_content"].progress_apply(
            lambda x: pd.Series(process_full_review(x))
        )

    if force_reprocess or processed_columns not in val_data.columns:
        # Enable tqdm for pandas (progress bar)
        tqdm.pandas(desc="Processing Val Data")

        # Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
        val_data[processed_columns] = val_data["full_content"].progress_apply(
            lambda x: pd.Series(process_full_review(x))
        )

    print("Evaluating Model")
    
    max_words = 10000
    max_sequence_length = 300

    train_texts = train_data["processed_full_content"]
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    val_texts = val_data["processed_full_content"]

    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )
    y_pred = (model.predict(X_val) > 0.5).astype(int)

    if "label" not in val_data.columns:
        return y_pred

    y_val = val_data["label"]
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [106]:
df_train = datasets["train"].to_pandas()
df_test = pd.read_csv("dataset/scrapped_news.csv")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   source                  1313 non-null   object
 1   full_content            1313 non-null   object
 2   processed_full_content  1313 non-null   object
dtypes: object(3)
memory usage: 30.9+ KB


In [107]:
predictions = evaluate_model(model, df_train, df_test)

# Print results
print("\nPrediction Results:")
print(f"Total articles: {len(predictions)}")
print(f"Predicted Real: {sum(predictions == 1)}")
print(f"Predicted Fake: {sum(predictions == 0)}")

Evaluating Model
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Prediction Results:
Total articles: 1313
Predicted Real: [611]
Predicted Fake: [702]


In [109]:
df_test.to_csv("dataset/scrapped_news.csv", index=False)

In [111]:
data_copy = df_test.copy()

# Add predictions to scraped data
data_copy['predicted_label'] = predictions

# Print predictions by source
print("\nPredictions by Source:")
source_predictions = data_copy.groupby('source')['predicted_label'].value_counts().unstack()
source_predictions.columns = ['Predicted Real', 'Predicted Fake']
print(source_predictions)

# Calculate percentage of fake news by source
fake_percentages = data_copy.groupby('source')['predicted_label'].mean() * 100
print("\nPercentage of Fake News by Source:")
print(fake_percentages.sort_values(ascending=False))


Predictions by Source:
                  Predicted Real  Predicted Fake
source                                          
AP                           124              20
BBC                           51              20
Breitbart                    137              39
CNN                           68              40
Guardian                      49              50
NPR                           61              18
Natural News                  57              71
News Max                      84             105
The Daily Caller              14             147
Zerohedge                     57             101

Percentage of Fake News by Source:
source
The Daily Caller    91.304348
Zerohedge           63.924051
News Max            55.555556
Natural News        55.468750
Guardian            50.505051
CNN                 37.037037
BBC                 28.169014
NPR                 22.784810
Breitbart           22.159091
AP                  13.888889
Name: predicted_label, dtype: float64


In [118]:
print("source: ", data_copy[data_copy["predicted_label"] == 0].iloc[0]["source"])
print(data_copy[data_copy["predicted_label"] == 0].iloc[0]["full_content"])

source:  Breitbart
House Republicans say the Biden-Harris White House might have broken the law when they altered President Joe Biden’s remarks in the official transcript to imply he did not call Trump supporters “garbage.”
Biden on Tuesday during a video call with Voto Latino in support of Vice President Kamala Harris’s presidential campaign said, “The only garbage I see floating out there is his supporters.”
However, after facing immediate backlash from Republicans and even some Democrats, the White House claimed that Biden did not call Trump supporters “garbage,” but was instead referring to one Trump supporter — namely, comedian Tony Hinchliffe.
The White House released a transcript that reinforced that argument, adding an apostrophe to “supporters” to read “supporter’s,” and then adding an em-dash to make it seem like Biden had not completed his sentence.
The transcript said (emphasis added):
The only garbage I see floating out there is his supporter’s — his — his demonization of 

In [117]:
print("source: ", data_copy[data_copy["predicted_label"] == 1].iloc[0]["source"])
print(data_copy[data_copy["predicted_label"] == 1].iloc[0]["full_content"])

source:  Breitbart
CLAIM: Vice President Kamala Harris’s campaign implied former President Donald Trump called for Liz Cheney to be put before a firing squad.
FACT CHECK: False. Trump’s comments on “guns trained on her face” were about Cheney stoking foreign wars without being willing to fight in their front lines.
Vice President Kamala Harris’s campaign took former President Donald Trump’s words out of context, to suggest that he was calling for Liz Cheney to be put before a firing squad.
Her campaign posted a video of Trump speaking to Tucker Carlson on X/Twitter, with a caption that paraphrased him:
Let’s put Liz Cheney with a rifle standing there with nine barrels shooting at her. Let’s see how she feels about it when the guns are trained on her face
Rather than suggesting she be executed, Trump was speaking about how neoconservatives are quick to send young Americans to fight overseas in war — which Trump has long opposed, particularly the Iraq and Afghanistan Wars championed by C