In [2]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\D.Ivanovas\.cache\kagglehub\datasets\clmentbisaillon\fake-and-real-news-dataset\versions\1


### Polars

In [3]:
import polars as pl
import os

# Read the data
fake_df = pl.read_csv(os.path.join(path, 'fake.csv'))
true_df = pl.read_csv(os.path.join(path, 'true.csv'))

# Add labels
fake_df = fake_df.with_columns(pl.lit(0).alias("label"))
true_df = true_df.with_columns(pl.lit(1).alias("label"))

# Concatenate and shuffle the DataFrame
joined_df = pl.concat([fake_df, true_df]).sample(fraction=1).with_row_index().select(pl.exclude("index"))


In [4]:
len(joined_df)

44898

### Pandas

In [None]:
import pandas as pd
import os 

fake_df = pd.read_csv(os.path.join(path, 'fake.csv'))
true_df = pd.read_csv(os.path.join(path, 'true.csv'))

fake_df["label"]=0
true_df["label"]=1

joined_df = pd.concat([fake_df, true_df], axis=0, ignore_index=True) # sujungiam sulablintus datasetus
joined_df = joined_df.sample(frac=1).reset_index(drop=True) # sushufflinam dataseta


# Features

In [3]:
from spellchecker import SpellChecker
import spacy
import swifter
import re

spell = SpellChecker()
nlp = spacy.load('en_core_web_sm')

def extract_features(text):
    doc = nlp(text)
    word_count = len(text.split())
    stopword_count = sum(1 for token in doc if token.is_stop)
    punct_count = sum(1 for token in doc if token.is_punct)
    ent_count = len(doc.ents)
    # misspelled = spell.unknown(text)  # Using `spell.unknown` to get misspelled words
    # wierd_symbols_count = len(misspelled)
    
    return word_count, stopword_count, punct_count, ent_count


### Polars

title,text,subject,date,label
str,str,str,str,i32
""" Donald Trump Sends Out Embarr…","""Donald Trump just couldn t wis…","""News""","""December 31, 2017""",0
""" Drunk Bragging Trump Staffer …","""House Intelligence Committee C…","""News""","""December 31, 2017""",0
""" Sheriff David Clarke Becomes …","""On Friday, it was revealed tha…","""News""","""December 30, 2017""",0
""" Trump Is So Obsessed He Even …","""On Christmas day, Donald Trump…","""News""","""December 29, 2017""",0
""" Pope Francis Just Called Out …","""Pope Francis used his annual C…","""News""","""December 25, 2017""",0
…,…,…,…,…
""" ‘Responsible’ Gun Owner Tries…","""Okay, we all probably get it. …","""News""","""July 4, 2016""",0
""" Watch A Trump Supporter GO BO…","""Anyone trying to defend Donald…","""News""","""July 4, 2016""",0
""" Donald Trump Hilariously Gets…","""Donald Trump could have just s…","""News""","""July 4, 2016""",0
""" WOW: Duck Dynasty Star Phil R…","""No one can accuse Duck Dynasty…","""News""","""July 4, 2016""",0


In [None]:
import multiprocessing as mp

def process_chunks(chunk):
    chunk = chunk.with_columns(
        pl.struct("text").map_elements(lambda s: extract_features(s['text']), return_dtype=list).alias("Features")
    )
    

num_partitions = 8
chunk_size = len(joined_df) // num_partitions
print(f"Selected chunk size: {chunk_size}")
chunks = []
for i in range(num_partitions):
    chunks.append(joined_df.slice(i*chunk_size, (i+1)*chunk_size))
    print("Chunk i added")
    
print("Starting processing")
with mp.Pool(num_partitions) as pool:
    result_chunks = pool.map(process_chunks, chunks)
    
polars_df = pl.concat(result_chunks)

Selected chunk size: 5612
Chunk i added
Chunk i added
Chunk i added
Chunk i added
Chunk i added
Chunk i added
Chunk i added
Chunk i added
Starting processing


In [15]:
joined_df = joined_df.with_columns(
    pl.struct("text").map_elements(lambda s: extract_features(s['text']), return_dtype=list).alias("Features")
)

polars.dataframe.frame.DataFrame

### Pandas


In [None]:
joined_df[['word_count', 'stopword_count', 'punct_count', 'wierd_symbols_count', 'ent_count']] = \
    joined_df['text'].swifter.apply(lambda text: extract_features(text)).apply(pd.Series)

joined_df['ws_ratio'] = joined_df['word_count'] / joined_df['stopword_count']

Pandas Apply:   0%|          | 0/44898 [00:00<?, ?it/s]

# Text processing

In [None]:
# '\xad' soft hyphen | \u200e  left to right symbol | \u200a Unicode Character “ ” | \u200b Zero-width space
# \u200f Right-to-Left Mark
unwanted_symbols = ['–', '—', '‘', '“', '”', '•', '…', '☑', '➡', 'ツ',  '¯','°', '´', '¿', '\xad', '\u200e', '\u200a', '\u200b', '\u200f']

def process_text(text):
    # Turkey’s -> Turkey's
    text = re.sub(r'(?<=\w)’(?=s)', "'", text)
    # replace all whitespaces, including unicode spaces and tabs with regular space
    pattern = '[' + re.escape(''.join(unwanted_symbols)) + ']'  # Escapes special regex characters
    text = re.sub(pattern, '', text)    
    text = re.sub(r'\s', ' ', text)
    doc = nlp(text)
    sent = []
    # check for stop words and punctuation
    for token in doc:
        if not token.is_punct and not token.is_stop:
            sent.append(token.text)
    
    return ' '.join(sent)

# use swifter for faster .apply()
joined_df['processed_text'] = joined_df['text'].apply(process_text)

joined_df.to_csv(os.path.join(path, 'processed_text.csv'))

Pandas Apply:   0%|          | 0/44898 [00:00<?, ?it/s]

In [14]:
import sys
sys.path.append(r'C:\Users\D.Ivanovas\Desktop\uni\Fake-News')

In [None]:
from paths import DATA_DIR
import shutil

shutil.move(str(os.path.join(path, 'processed_text.csv')), str(DATA_DIR))

'C:\\Users\\D.Ivanovas\\Desktop\\uni\\Fake-News\\data'