In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

In [2]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('politifact_full_dataset.csv')

In [3]:
# Work on a copy so the frame loaded into `df` stays as read from disk.
dff = df.copy()

In [4]:
# Normalise the ruling values: cast to str, lower-case, trim surrounding whitespace.
# NOTE(review): astype(str) turns a missing ruling into the literal string 'nan'
# rather than keeping it as NaN - confirm the column has no missing values.
dff['ruling'] = dff['ruling'].astype(str).str.lower().str.strip()

In [5]:
# Frequency of each normalised ruling value.
dff['ruling'].value_counts()

ruling
false          1039
pants-fire      238
barely-true      96
half-true        56
mostly-true      40
true             24
full-flop         4
half-flip         2
Name: count, dtype: int64

#### mapping ruling to 3 labels 'supports', 'refutes', and 'uncertain'

In [15]:
#mapping ruling to 3 labels supports, refutes, and uncertain
def map_ruling_to_label(r):
    """Collapse a ruling value into one of three coarse labels.

    Parameters
    ----------
    r : str
        Ruling value, expected to be lower-cased and stripped already
        (e.g. 'mostly-true').

    Returns
    -------
    str
        'supports'  for 'true', 'mostly-true', 'half-true';
        'refutes'   for 'false', 'pants-fire', 'full-flop', 'half-flip';
        'uncertain' for 'barely-true' and for every other value.
    """
    if r in ('true', 'mostly-true', 'half-true'):
        return 'supports'

    if r in ('false', 'pants-fire', 'full-flop', 'half-flip'):
        return 'refutes'

    # The ruling is spelled with a hyphen, like every other value in the
    # `ruling` column; an underscore spelling would never match.
    if r == 'barely-true':
        return 'uncertain'

    # Fallback: any unrecognised value (including the string 'nan') is
    # treated as uncertain rather than raising.
    return 'uncertain'

# dff['ruling'] = dff['ruling'].apply(lambda r: map_ruling_to_label(r))

In [8]:
# Derive the 3-way label from the normalised ruling, one value at a time.
dff['label_multi'] = dff['ruling'].apply(map_ruling_to_label)

In [10]:
# Class balance of the 3-way label.
dff['label_multi'].value_counts()

label_multi
refutes      1283
supports      120
uncertain      96
Name: count, dtype: int64

In [12]:
# Integer encoding of label_multi: supports -> 1, refutes -> 0, uncertain -> -1.
# Despite the `_bin` suffix the column can hold three values; a label_multi value
# missing from the mapping would become NaN.
dff['label_bin'] = dff['label_multi'].map({'supports': 1, 'refutes': 0, 'uncertain': -1})

#### Cleaning text fields present

In [16]:
import html

def clean_text(s):
    """Return `s` as a tidy single-line string.

    Missing values (anything `pd.isna` reports as NA) become the empty
    string. Everything else is converted to `str`, HTML entities are
    decoded, each run of whitespace is collapsed to one space, and the
    ends are trimmed.
    """
    if pd.isna(s):
        return ""
    decoded = html.unescape(str(s))
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()

In [19]:
# Clean every free-text column in place: missing values become '', values are
# cast to str, then passed through clean_text.
text_columns = ['claim', 'desc', 'tags', 'summary', 'article', 'speaker']
for col in text_columns:
    as_text = dff[col].fillna('').astype(str)
    dff[col] = as_text.apply(clean_text)

In [21]:
# Remove URLs from the article text: every occurrence of 'http' together with
# the run of non-whitespace characters that follows it is deleted (this covers
# both http:// and https:// links), then leading/trailing whitespace is trimmed.
# The result goes into a new column; `article` itself is left unchanged.
dff['article_nourl'] = dff['article'].str.replace(r'http\S+', '', regex=True).str.strip()

#### Changing dates to a datetime format and handling exceptions

In [32]:
from dateutil import parser

def parse_date(s):
    """Parse `s` with dateutil's `parser.parse`, best-effort.

    Returns whatever `parser.parse` returns on success. If the call raises
    any `Exception` (unparseable text, non-string input, ...), `pd.NaT` is
    returned instead, so the function can be applied to a whole column
    without aborting.
    """
    try:
        parsed = parser.parse(s)
    except Exception:
        parsed = pd.NaT
    return parsed

In [47]:
# Trial run on the raw date strings to find out which ones fail to parse
# (failures come back as NaT).
test_series = dff['date'].apply(parse_date)

In [48]:
# Index labels of the rows whose date could not be parsed.
unparsed_mask = test_series.isna()
null_date_indexes = test_series[unparsed_mask].index

In [63]:
# Look at the first 20 raw date strings that failed to parse.
dff.loc[null_date_indexes, 'date'].head(20)

5      Septiembre 5, 2025
24        Agosto 15, 2025
29         Agosto 7, 2025
55         Julio 16, 2025
67          Julio 7, 2025
68          Julio 3, 2025
69          Julio 2, 2025
115         Mayo 28, 2025
123         Mayo 21, 2025
124         Mayo 21, 2025
133         Mayo 16, 2025
141          Mayo 9, 2025
170        Abril 11, 2025
184         Abril 4, 2025
194         Abril 3, 2025
216        Marzo 27, 2025
217        Marzo 27, 2025
221        Marzo 25, 2025
222        Marzo 25, 2025
226        Marzo 21, 2025
Name: date, dtype: object

In [64]:
# From the output above we observe that the dates which failed to parse have their month names written in Spanish.

In [70]:
# Regex pattern -> English month name.
# Each Spanish month is listed twice (capitalised and all lower-case) and is
# wrapped in \b word boundaries so only whole words match. Other casings
# (e.g. all upper-case) are not covered by this table.
spanish_to_english = {
    r'\bEnero\b': 'January', r'\benero\b': 'January',
    r'\bFebrero\b': 'February', r'\bfebrero\b': 'February',
    r'\bMarzo\b': 'March', r'\bmarzo\b': 'March',
    r'\bAbril\b': 'April', r'\babril\b': 'April',
    r'\bMayo\b': 'May', r'\bmayo\b': 'May',
    r'\bJunio\b': 'June', r'\bjunio\b': 'June',
    r'\bJulio\b': 'July', r'\bjulio\b': 'July',
    r'\bAgosto\b': 'August', r'\bagosto\b': 'August',
    r'\bSeptiembre\b': 'September', r'\bseptiembre\b': 'September',
    r'\bOctubre\b': 'October', r'\boctubre\b': 'October',
    r'\bNoviembre\b': 'November', r'\bnoviembre\b': 'November',
    r'\bDiciembre\b': 'December', r'\bdiciembre\b': 'December'
}

In [71]:
def replace_months(series):
    """Translate Spanish month names in a Series of date strings to English.

    Missing values are treated as empty strings, every value is cast to str
    and stripped, and each pattern in `spanish_to_english` is applied as a
    regex replacement in turn. Values that end up as the empty string are
    returned as `pd.NA`.
    """
    normalized = series.fillna("").astype(str).str.strip()
    for es_pattern, en_month in spanish_to_english.items():
        normalized = normalized.str.replace(es_pattern, en_month, regex = True)
    return normalized.replace('', pd.NA)

In [72]:
# Store the month-normalised date strings in a new column; the raw `date`
# column is left as is.
dff['date_norm'] = replace_months(dff['date'])

In [77]:
# Parse the normalised date strings; anything unparseable becomes NaT.
dff['date_parsed'] = dff['date_norm'].apply(parse_date)

In [78]:
# Count the rows whose date still failed to parse after normalisation.
print(dff['date_parsed'].isna().sum(), 'missing dates')

0 missing dates


In [79]:
# So, by converting Spanish month names to English with regex replacement, every date string now parses successfully (0 missing dates).

#### Creating claim and article lengths

In [84]:
# Word counts (whitespace-separated tokens) for the claim and the article.
for text_col, len_col in (('claim', 'claim_len'), ('article', 'article_len')):
    dff[len_col] = dff[text_col].str.split().apply(len)

#### Making short evidence snippets (a few words each)

In [85]:
def make_snippet(row, n_words = 80):
    """Build a short evidence snippet for one row.

    If the row has a non-empty 'summary', its first 2000 characters are
    returned. Otherwise the snippet is the first `n_words` whitespace-separated
    words of 'article_nourl' (or of 'article' when 'article_nourl' is not
    present in the row), joined by single spaces.
    """
    summary = row['summary']
    if summary:
        return summary[:2000]
    source_col = 'article_nourl' if 'article_nourl' in row else 'article'
    leading_words = row[source_col].split()[:n_words]
    return " ".join(leading_words)

# One snippet per row: axis = 1 hands each row to make_snippet, which runs
# with its default n_words.
dff['evidence_snippet'] = dff.apply(make_snippet, axis = 1)

In [93]:
social_keywords = ['facebook','instagram','x posts','twitter','tik tok','tiktok','threads','social media','viral']

def _mentions_social(speaker):
    """Return True when any social-media keyword is a substring of `speaker`."""
    for keyword in social_keywords:
        if keyword in speaker:
            return True
    return False

# Flag rows whose (lower-cased) speaker field mentions a social-media source.
dff['is_social'] = dff['speaker'].str.lower().apply(_mentions_social)

#### dropping duplicates for safety

In [103]:
# Two de-duplication passes, each keeping the first occurrence:
# first by URL, then by identical (claim, article) text.
dff = (
    dff
    .drop_duplicates(subset = ['url'])
    .drop_duplicates(subset = ['claim', 'article'])
)

In [104]:
# Persist the preprocessed frame; index = True also writes the row index
# as the first CSV column.
dff.to_csv('politifact_preprocessed_step1.csv', index = True)

#### making a claims dataset

In [107]:
# Keep only rows with a decided label (supports / refutes) and the columns
# needed for claim-level modelling, then write them out without the index.
claim_cols = ['claim', 'label_multi', 'label_bin', 'speaker', 'date_parsed']
is_decided = dff['label_multi'].isin(['supports', 'refutes'])
claims_df = dff.loc[is_decided, claim_cols].copy()
claims_df.to_csv('claims_dataset.csv', index = False)
print('Claims dataset:', claims_df.shape)

Claims dataset: (1403, 5)


#### chunking large articles into pieces

In [1]:
def chunk_text(text, chunk_size = 250):
    """Split `text` into consecutive chunks of at most `chunk_size` words.

    Words are whitespace-separated tokens; each chunk is returned as a
    single-space-joined string. The last chunk may be shorter, and text
    without any words yields an empty list.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

# Build one record per 250-word chunk of every non-blank article.
# Requires `dff` (with 'date_parsed') and chunk_text from the earlier cells.
evidence_rows = []
chunk_id = 0
for _, record in dff.iterrows():
    body_col = 'article_nourl' if 'article_nourl' in record else 'article'
    body = record[body_col]
    if body.strip():
        # Metadata repeated on every chunk of this article.
        shared_fields = {
            "url": record['url'],
            "claim": record['claim'],
            "ruling": record['ruling'],
            "label_multi": record['label_multi'],
            "speaker": record['speaker'],
            "date": record['date_parsed'],
        }
        for piece in chunk_text(body, chunk_size = 250):
            # chunk_id is a running counter across all articles.
            evidence_rows.append({"chunk_id": chunk_id, **shared_fields, "chunk_text": piece})
            chunk_id += 1

evidence_df = pd.DataFrame(evidence_rows)

NameError: name 'dff' is not defined

In [118]:
# Sanity check: (number of chunk rows, number of columns).
print(evidence_df.shape)

(1180048, 8)


In [126]:
# Write the chunk table to Parquet without the index. The fastparquet engine
# is an optional dependency and must be installed in the environment.
evidence_df.to_parquet('evidence_chunks.parquet', index = False, engine = 'fastparquet')

#### making a claim-evidence pairs dataset

In [128]:
# One (claim, evidence snippet, label) record per row of dff, written to CSV
# without the index.
pairs = [
    {'claim': claim, 'evidence': snippet, 'label': label}
    for claim, snippet, label in zip(dff['claim'], dff['evidence_snippet'], dff['label_multi'])
]

pairs_df = pd.DataFrame(pairs)
pairs_df.to_csv('claim_evidence_pairs.csv', index = False)