In [11]:
import os
import sys
from pathlib import Path

if "workding_dir" not in locals():
    workding_dir = str(Path.cwd().parent)
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("working dir:", workding_dir)

working dir: /Users/inflaton/code/engd/papers/DM-Fake-News-Detection


# Text Preprocessing for NLP

Here we will define a function `process_full_review` that takes a textual value as input and applies the following processing steps in sequence:

1. Convert the input text to lowercase using the `lower()` function.

2. Tokenize the lowercase text using the `word_tokenize` function from the NLTK library.

3. Create a list (`alphabetic_tokens`) containing only alphanetic tokens using a list comprehension with a regular expression match.

4. Remove stopwords
-   Obtain a set of English stopwords using the `stopwords.words('english')` method.
-   Define a list of `allowed_words` that should not be removed.
-   Remove the stopwords (excluding those that should not be removed).

5. Apply stemming to each token.

6. Join the stemmed tokens into a single processed text using the `join` method and return the processed text.

In [12]:
import nltk

# Ensure require NLTK data is downloaded
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("all")

[nltk_data] Downloading package punkt to /Users/inflaton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/inflaton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/inflaton/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/inflaton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/inflaton/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/inflaton/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading packa

True

In [13]:
# Define function to process text
import string
from nltk.stem import *
from nltk.stem.porter import *

# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize

# Part-of-speech tagging
from nltk import pos_tag

# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk

# Regular expressions for text pattern matching
import re


def process_full_review(text):
    # Convert to lowercase and tokenize
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in string.punctuation]
    stemmer = PorterStemmer()
    # List of stopwords
    stop_words = stopwords.words("english")
    allowed_words = [
        "no",
        "not",
        "don't",
        "dont",
        "don",
        "but",
        "however",
        "never",
        "wasn't",
        "wasnt",
        "shouldn't",
        "shouldnt",
        "mustn't",
        "musnt",
    ]

    stemmed = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words or word in allowed_words
    ]
    return " ".join(stemmed)

In [14]:
import pandas as pd

df = pd.read_csv("results/llm_rewrite_results.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6050 entries, 0 to 6049
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   label                          6050 non-null   int64 
 1   full_content                   6050 non-null   object
 2   qwen2.5:7b_8k/shots-00(3.319)  6050 non-null   object
dtypes: int64(1), object(2)
memory usage: 141.9+ KB


In [15]:
data = df[["label"]].copy()
data["full_content"] = df["qwen2.5:7b_8k/shots-00(3.319)"]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6050 entries, 0 to 6049
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         6050 non-null   int64 
 1   full_content  6050 non-null   object
dtypes: int64(1), object(1)
memory usage: 94.7+ KB


In [16]:
from tqdm import tqdm

# Enable tqdm for pandas (progress bar)
tqdm.pandas(desc="Processing News Articles")

# Apply process_full_review function with tqdm progress bar and expand the results into separate columns.
processed_columns = ["processed_full_content"]
data[processed_columns] = data["full_content"].progress_apply(
    lambda x: pd.Series(process_full_review(x))
)

data

Processing News Articles: 100%|██████████| 6050/6050 [00:13<00:00, 460.29it/s]


Unnamed: 0,label,full_content,processed_full_content
0,0,{'content': 'BRUSSELS (Reuters) – The European...,'content 'brussel reuter – european parliament...
1,0,"{'content': ""HARARE (Reuters) - Zimbabwe Presi...",'content `` harar reuter zimbabw presid robert...
2,1,{'content': 'Obamacare was pitched as a way to...,'content 'obamacar pitch way provid health ins...
3,0,{'content': 'WASHINGTON (Reuters) - The U.S. H...,'content 'washington reuter u.s. hous repres t...
4,1,{'content': '### Collapse of the \'Russia Hack...,'content collaps \'russia hack\ witch hunt\n\n...
...,...,...,...
6045,1,{'content': '**Republicans Hilariously Go Ball...,'content republican hilari go ballist \'americ...
6046,1,"{'content': ""### Official Count Shows Trump La...",'content `` offici count show trump landslid v...
6047,1,{'content': '**Gold and Oil: A Historical Pers...,'content gold oil histor perspect reveal fragi...
6048,0,{'content': 'NEW YORK (Reuters) - American fin...,'content 'new york reuter american financi adv...


Here we create a new `csv` file called `processed_data.csv` where we store the new processed dataframe.

In [17]:
import json

idx = json.load(open("results/train_val_idx.json"))
idx.keys()

dict_keys(['train_idx', 'val_idx'])

In [18]:
df = pd.read_csv("processed_data_filtered.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60491 entries, 0 to 60490
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   60491 non-null  int64 
 1   full_content            60491 non-null  object
 2   processed_full_content  60491 non-null  object
 3   word_count              60491 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.8+ MB


In [19]:
for i, index in enumerate(idx["val_idx"]):
    df.at[index, "full_content"] = data.iloc[i]["full_content"]
    df.at[index, "processed_full_content"] = data.iloc[i]["processed_full_content"]
    df.at[index, "label"] = data.iloc[i]["label"]
    df.at[index, "word_count"] = len(str(data.iloc[i]["full_content"]).split())

In [20]:
df.to_csv("processed_data_test_rewritten.csv", index=False)