In [5]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# Applying our best model (CNN + Word2Vec) on the scraped data

In [6]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define function to process text
import string
from nltk.stem import *
from nltk.stem.porter import *

# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk

# Regular expressions for text pattern matching
import re


def process_full_review(text):
    # Convert to lowercase and tokenize
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in string.punctuation]
    stemmer = PorterStemmer()
    # List of stopwords
    stop_words = stopwords.words("english")
    allowed_words = [
        "no",
        "not",
        "don't",
        "dont",
        "don",
        "but",
        "however",
        "never",
        "wasn't",
        "wasnt",
        "shouldn't",
        "shouldnt",
        "mustn't",
        "musnt",
    ]

    stemmed = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words or word in allowed_words
    ]
    return " ".join(stemmed)

In [7]:
from datasets import load_dataset, concatenate_datasets, Dataset

datasets = load_dataset(
    "csv",
    data_files={
        "train": [
            "dataset/train_data_1.csv",
            "dataset/train_data_2.csv",
            "dataset/train_data_3.csv",
            "dataset/train_data_4.csv",
        ],
    },
)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
})

In [8]:
# load model
from tensorflow.keras.models import load_model

model = load_model("results/CNN_model.keras")
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [9]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from tqdm import tqdm

def evaluate_model(model, train_data, val_data, force_reprocess=False):
    # Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
    processed_columns = "processed_full_content"
    if force_reprocess or processed_columns not in train_data.columns:
        # Enable tqdm for pandas (progress bar)
        tqdm.pandas(desc="Processing Train Data")
        
        train_data[processed_columns] = train_data["full_content"].progress_apply(
            lambda x: pd.Series(process_full_review(x))
        )

    if force_reprocess or processed_columns not in val_data.columns:
        # Enable tqdm for pandas (progress bar)
        tqdm.pandas(desc="Processing Val Data")

        # Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
        val_data[processed_columns] = val_data["full_content"].progress_apply(
            lambda x: pd.Series(process_full_review(x))
        )

    print("Evaluating Model")
    
    max_words = 10000
    max_sequence_length = 300

    train_texts = train_data["processed_full_content"]
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_texts)

    val_texts = val_data["processed_full_content"]

    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_texts), maxlen=max_sequence_length
    )
    y_pred = (model.predict(X_val) > 0.5).astype(int)

    if "label" not in val_data.columns:
        return y_pred

    y_val = val_data["label"]
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [10]:
df_train = datasets["train"].to_pandas()
df_test = pd.read_csv("dataset/scrapped_news.csv")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   source                  1275 non-null   object
 1   full_content            1275 non-null   object
 2   processed_full_content  1275 non-null   object
dtypes: object(3)
memory usage: 30.0+ KB


In [11]:
predictions = evaluate_model(model, df_train, df_test)

# Print results
print("\nPrediction Results:")
print(f"Total articles: {len(predictions)}")
print(f"Predicted Real: {sum(predictions == 1)}")
print(f"Predicted Fake: {sum(predictions == 0)}")

Evaluating Model
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Prediction Results:
Total articles: 1275
Predicted Real: [587]
Predicted Fake: [688]


In [12]:
df_test.to_csv("dataset/scrapped_news.csv", index=False)

In [13]:
data_copy = df_test.copy()

# Add predictions to scraped data
data_copy['predicted_label'] = predictions

# Print predictions by source
print("\nPredictions by Source:")
source_predictions = data_copy.groupby('source')['predicted_label'].value_counts().unstack()
source_predictions.columns = ['Predicted Real', 'Predicted Fake']
print(source_predictions)

# Calculate percentage of fake news by source
fake_percentages = data_copy.groupby('source')['predicted_label'].mean() * 100
print("\nPercentage of Fake News by Source:")
print(fake_percentages.sort_values(ascending=False))


Predictions by Source:
                  Predicted Real  Predicted Fake
source                                          
AP                           120              18
BBC                           51              20
Breitbart                    137              37
CNN                           66              32
Guardian                      43              45
NPR                           60              17
Natural News                  57              71
News Max                      84             105
The Daily Caller              13             145
Zerohedge                     57              97

Percentage of Fake News by Source:
source
The Daily Caller    91.772152
Zerohedge           62.987013
News Max            55.555556
Natural News        55.468750
Guardian            51.136364
CNN                 32.653061
BBC                 28.169014
NPR                 22.077922
Breitbart           21.264368
AP                  13.043478
Name: predicted_label, dtype: float64


In [14]:
print("source: ", data_copy[data_copy["predicted_label"] == 0].iloc[0]["source"])
print(data_copy[data_copy["predicted_label"] == 0].iloc[0]["full_content"])

source:  Breitbart
House Republicans say the Biden-Harris White House might have broken the law when they altered President Joe Biden’s remarks in the official transcript to imply he did not call Trump supporters “garbage.”
Biden on Tuesday during a video call with Voto Latino in support of Vice President Kamala Harris’s presidential campaign said, “The only garbage I see floating out there is his supporters.”
However, after facing immediate backlash from Republicans and even some Democrats, the White House claimed that Biden did not call Trump supporters “garbage,” but was instead referring to one Trump supporter — namely, comedian Tony Hinchliffe.
The White House released a transcript that reinforced that argument, adding an apostrophe to “supporters” to read “supporter’s,” and then adding an em-dash to make it seem like Biden had not completed his sentence.
The transcript said (emphasis added):
The only garbage I see floating out there is his supporter’s — his — his demonization of 

The information you’ve provided is accurate. In October 2024, during a video call with Voto Latino, President Joe Biden criticized remarks made by comedian Tony Hinchcliffe at a Trump rally, where Hinchcliffe referred to Puerto Rico as a “floating island of garbage.” Biden stated, “The only garbage I see floating out there is his supporters.” Following backlash, the White House released an official transcript altering “supporters” to “supporter’s,” suggesting Biden was referring specifically to Hinchcliffe. This modification led to objections from White House stenographers and prompted House Republicans to question the legality of altering official records, citing potential violations of the Presidential Records Act.  ￼ ￼

For more context, here’s a news segment covering the incident: [AP sources: White House altered record of Biden’s ‘garbage’ remarks despite stenographer concerns](https://apnews.com/article/biden-garbage-transcript-puerto-rico-trump-326e2f516a94a470a423011a946b6252?utm_source=chatgpt.com)

In [15]:
print("source: ", data_copy[data_copy["predicted_label"] == 1].iloc[-1]["source"])
print(data_copy[data_copy["predicted_label"] == 1].iloc[-1]["full_content"])

source:  Guardian
Michigan congresswoman Rashida Tlaib declined to endorse Kamala Harris at a union rally in Detroit, where the war in Gaza is the top issue for the largest block of Arab American voters in the country.
Tlaib, the first Palestinian American woman to serve in Congress, is the only one of the so-called leftist “Squad” that has not endorsed the Democrat candidate. The other three members – Ayanna Pressley of Massachusetts, Ilhan Omar of Minnesota and Alexandria Ocasio-Cortez of New York – endorsed Harris in July.
“Don’t underestimate the power you all have,” Tlaib told a get-out-the-vote United Auto Workers rallygoers. “More than those ads, those lawn signs, those billboards, you all have more power to turn out people that understand we’ve got to fight back against corporate greed in our country.”
Tlaib’s non-endorsement of Harris comes as a voter survey published on Friday suggested that 43% of Muslim American voters support the Green party candidate, Jill Stein.
After Hi

The information you’ve provided is accurate. In November 2024, Congresswoman Rashida Tlaib declined to endorse Vice President Kamala Harris during a United Auto Workers rally in Detroit. This decision was influenced by the ongoing war in Gaza, a significant concern for Arab American voters in Michigan. Tlaib, the first Palestinian American woman in Congress, has been vocal in her criticism of the Democratic Party’s stance on the conflict, expressing feelings of invisibility and frustration over the lack of Palestinian American representation at the party’s convention. Her non-endorsement coincided with a survey indicating that 43% of Muslim American voters supported Green Party candidate Jill Stein, raising concerns among Democrats about potential impacts on the election outcome.  ￼

For more context, here’s a news segment covering Tlaib’s decision: [Michigan congresswoman Rashida Tlaib declines to endorse Kamala Harris](https://www.theguardian.com/us-news/2024/nov/02/rashida-tlaib-decline-endorsement-kamala-harris)