In [10]:
import os
import sys
from pathlib import Path

# Resolve the project root (the parent of the notebook's directory) once per
# kernel session. The guard keeps the cell re-runnable: without it, every run
# would climb one more directory up because the cwd has already been changed.
# NOTE: the variable keeps its original spelling ("workding_dir") so that any
# other cell referring to it continues to work.
if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
# Register the project root on the import path only once; an unconditional
# append adds a duplicate entry each time the cell is re-executed.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


In [11]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Text-processing dependencies: punctuation table and NLTK stemmers
import string
from nltk.stem import *
from nltk.stem.porter import *

# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk

# Regular expressions for text pattern matching
import re


def process_full_review(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Pipeline: lowercase -> ``word_tokenize`` -> drop punctuation tokens ->
    drop English stopwords (except a small allow-list of negation/contrast
    words) -> Porter stemming.

    Args:
        text: The raw text to normalise. Must support ``.lower()``.

    Returns:
        A single string containing the surviving stemmed tokens separated
        by one space.
    """
    tokens = word_tokenize(text.lower())

    # `string.punctuation` is a str, so this is a *substring* test: it drops
    # single punctuation characters and also multi-character tokens that
    # happen to be a contiguous slice of it. Kept exactly as-is so the
    # produced text does not change.
    tokens = [word for word in tokens if word not in string.punctuation]

    stemmer = PorterStemmer()

    # Sets give constant-time membership tests; the original list forced a
    # linear scan of the whole stopword list for every token.
    stop_words = set(stopwords.words("english"))

    # Words that must survive stopword removal.
    allowed_words = {
        "no",
        "not",
        "don't",
        "dont",
        "don",
        "but",
        "however",
        "never",
        "wasn't",
        "wasnt",
        "shouldn't",
        "shouldnt",
        "mustn't",
        # NOTE(review): probably meant "mustnt"; left untouched so the
        # output stays identical to the original preprocessing.
        "musnt",
    }

    stemmed = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words or word in allowed_words
    ]
    return " ".join(stemmed)

In [12]:
from datasets import load_dataset, concatenate_datasets, Dataset

# Describe which CSV files back each split, then load them all in one call.
# The "train" split is stored as four shards that are combined on load.
train_shards = [
    "dataset/train_data_1.csv",
    "dataset/train_data_2.csv",
    "dataset/train_data_3.csv",
    "dataset/train_data_4.csv",
]
data_files = {
    "train": train_shards,
    "test": "dataset/test_data.csv",
    "rewritten_test": "dataset/rewritten_test_data.csv",
}

datasets = load_dataset("csv", data_files=data_files)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 54441
    })
    test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
    rewritten_test: Dataset({
        features: ['label', 'full_content', 'processed_full_content'],
        num_rows: 6050
    })
})

In [13]:
# Load the saved Keras model from disk and print its summary
from tensorflow.keras.models import load_model

cnn_model_path = "results/CNN_model.keras"
model = load_model(cnn_model_path)
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [14]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
from tqdm import tqdm

def _ensure_processed(data, column, desc, force):
    """Fill ``data[column]`` from ``data["full_content"]`` when needed.

    The column is (re)computed in place when ``force`` is true or when the
    column does not exist yet; otherwise ``data`` is left untouched.
    """
    if force or column not in data.columns:
        # Enable the tqdm progress bar for pandas `progress_apply`.
        tqdm.pandas(desc=desc)
        # Apply the function directly: wrapping every result in pd.Series
        # builds one Series per row and turns the result into a one-column
        # DataFrame, which is slower and yields the same strings.
        data[column] = data["full_content"].progress_apply(process_full_review)


def evaluate_model(
    model,
    train_data,
    val_data,
    force_reprocess=False,
    max_words=10000,
    max_sequence_length=300,
):
    """Score ``model`` on ``val_data`` and print accuracy/precision/recall/F1.

    A ``Tokenizer`` is fitted on the processed training text on every call
    and then used to turn the validation text into padded integer sequences
    that are fed to ``model.predict``. Predictions above 0.5 count as the
    positive class.

    NOTE(review): because the tokenizer is refitted here, its vocabulary
    presumably has to match the one used when the model was trained (same
    training text, ``max_words`` and ``max_sequence_length``) - confirm.

    Side effects:
        ``train_data`` and ``val_data`` are modified in place: their
        "processed_full_content" column is created, or overwritten when
        ``force_reprocess`` is true.

    Args:
        model: Object exposing ``predict``; called with the padded sequences.
        train_data: DataFrame with a "full_content" column (and optionally
            "processed_full_content"); used only to fit the tokenizer.
        val_data: DataFrame with "full_content" and "label" columns.
        force_reprocess: Recompute the processed text even if the column
            already exists.
        max_words: ``num_words`` passed to the ``Tokenizer``.
        max_sequence_length: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        None. The four metrics are printed.
    """
    processed_column = "processed_full_content"
    _ensure_processed(
        train_data, processed_column, "Processing Train Data", force_reprocess
    )
    _ensure_processed(
        val_data, processed_column, "Processing Val Data", force_reprocess
    )

    print("Evaluating Model")

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train_data[processed_column])

    y_val = val_data["label"]
    X_val = pad_sequences(
        tokenizer.texts_to_sequences(val_data[processed_column]),
        maxlen=max_sequence_length,
    )

    # Flatten so the metrics receive a 1-D label vector.
    # NOTE(review): assumes predict() returns one score per row - confirm.
    y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [15]:
# Convert the "train" and "test" splits to pandas frames and score the model
# on the "test" split. force_reprocess=True recomputes the processed text of
# both frames, overwriting their "processed_full_content" column in place.
df_train, df_test = (datasets[split].to_pandas() for split in ("train", "test"))
evaluate_model(
    model=model,
    train_data=df_train,
    val_data=df_test,
    force_reprocess=True,
)

Processing Train Data: 100%|██████████| 54441/54441 [02:49<00:00, 322.09it/s]
Processing Val Data: 100%|██████████| 6050/6050 [00:18<00:00, 326.50it/s]


Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9740
Precision: 0.9684
Recall: 0.9724
F1: 0.9704


In [16]:
# Score the same model on the "rewritten_test" split. The tokenizer is still
# fitted on df_train; force_reprocess=True recomputes the processed text of
# both frames again.
df_test_rewritten = datasets["rewritten_test"].to_pandas()
evaluate_model(
    model=model,
    train_data=df_train,
    val_data=df_test_rewritten,
    force_reprocess=True,
)

Processing Train Data:   0%|          | 0/54441 [00:00<?, ?it/s]

Processing Train Data: 100%|██████████| 54441/54441 [02:46<00:00, 326.10it/s] 
Processing Val Data: 100%|██████████| 6050/6050 [00:13<00:00, 461.34it/s]


Evaluating Model
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.8218
Precision: 0.9025
Recall: 0.6645
F1: 0.7654


In [17]:
# Column-by-column check: does the in-memory training frame still match a
# fresh copy of the loaded "train" split?
# NOTE(review): "processed_full_content" is expected to differ if
# evaluate_model(force_reprocess=True) has overwritten it - confirm the cause.
train_as_loaded = datasets["train"].to_pandas()
(df_train == train_as_loaded).all()

label                      True
full_content               True
processed_full_content    False
dtype: bool

In [28]:
# Number of roughly equal row chunks; each chunk is written to its own CSV.
n_splits = 4

# Split on row positions instead of passing the DataFrame itself to
# np.array_split: the latter goes through DataFrame.swapaxes, which pandas
# has deprecated and which emits a FutureWarning. Chunk sizes and row order
# are the same either way.
row_chunks = np.array_split(np.arange(len(df_train)), n_splits)
split_dfs = [df_train.iloc[rows] for rows in row_chunks]

# Each element in split_dfs is a DataFrame
for i, split_df in enumerate(split_dfs):
    split_df = split_df.reset_index(drop=True)
    print(f"DataFrame part {i}:\n{len(split_df)}\n")
    file_name = f"dataset/train_data_{i + 1}.csv"
    print(file_name)
    # Report, per column, whether the shard currently on disk matches the
    # in-memory chunk, then overwrite the shard with the in-memory chunk.
    df_split = pd.read_csv(file_name)
    print((df_split == split_df).all())
    split_df.to_csv(file_name, index=False)

  return bound(*args, **kwds)


DataFrame part 0:
13611

dataset/train_data_1.csv
label                     True
full_content              True
processed_full_content    True
dtype: bool
DataFrame part 1:
13610

dataset/train_data_2.csv
label                     True
full_content              True
processed_full_content    True
dtype: bool
DataFrame part 2:
13610

dataset/train_data_3.csv
label                     True
full_content              True
processed_full_content    True
dtype: bool
DataFrame part 3:
13610

dataset/train_data_4.csv
label                     True
full_content              True
processed_full_content    True
dtype: bool
