In [1]:
import pandas as pd

# Reload the articles and attempt a plain pandas parse of the raw date strings
df = pd.read_parquet("data/filtered_articles.parquet")
raw_dates = df["date_published"]
df["parsed_date"] = pd.to_datetime(raw_dates, errors="coerce")

# Keep rows that carry a raw value but ended up without a parsed timestamp
bad = df[raw_dates.notna() & df["parsed_date"].isna()]

# Per source: failure count followed by at most 5 distinct raw strings
for src, grp in bad.groupby("source_domain"):
    samples = grp["date_published"].unique()[:5]
    print(f"\n=== {src} ({len(grp)} failures) ===")
    for s in samples:
        print("  ", s)



=== abcnews.go.com (747 failures) ===
   2025-05-31T20:00:04Z
   2025-05-30T17:26:27Z
   2025-05-30T10:29:59Z
   2025-05-30T12:59:35Z
   2025-05-29T01:33:41Z

=== apnews.com (3584 failures) ===
   2024-10-03T21:39:58
   2025-05-19T09:07:26
   2025-05-07T10:07:03
   2025-05-02T22:35:51
   2025-05-17T10:09:43

=== cnbc.com (582 failures) ===
   2023-10-07T16:23:27+0000
   2023-10-08T07:48:54+0000
   2023-10-07T06:57:36+0000
   2023-10-08T18:39:35+0000
   2023-10-08T16:09:31+0000

=== cnn.com (2085 failures) ===
   2025-05-21T06:52:00
   2025-05-28T06:38:00
   2025-05-06T16:14:00
   2025-05-06T18:48:00
   2025-05-09T11:00:00

=== dailymail.co.uk (2990 failures) ===
   00:53, 8 October 2023
   00:51, 8 October 2023
   00:00, 8 October 2023
   22:53, 7 October 2023
   22:22, 7 October 2023

=== express.co.uk (1919 failures) ===
   2023-10-31T22:47:00Z
   2023-10-31T22:01:00Z
   2023-10-31T21:48:00Z
   2023-10-31T19:31:00Z
   2023-10-31T19:04:00Z

=== foxnews.com (2749 failures) ===
    Mar

In [None]:
import pandas as pd
import dateparser

# 1. Load the filtered articles from Parquet. This rebinds `df`, so columns
#    that were only added to an earlier in-memory `df` are not carried over.
df = pd.read_parquet("data/filtered_articles.parquet")

# 2. Define a parser that strips known prefixes and lets dateparser handle the rest
def parse_date(s):
    """Parse one raw ``date_published`` value into a pandas timestamp.

    Parameters
    ----------
    s : object
        Raw cell value. Anything that is not a non-blank string yields ``pd.NaT``.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        ``pd.to_datetime`` of whatever ``dateparser.parse`` returns, or
        ``pd.NaT`` when dateparser returns ``None``.
    """
    if not isinstance(s, str) or not s.strip():
        return pd.NaT
    # Remove every occurrence of the "Updated:" label before parsing
    s_clean = s.replace("Updated:", "").strip()
    dt = dateparser.parse(
        s_clean,
        settings={
            "RETURN_AS_TIMEZONE_AWARE": False,
            "PREFER_DAY_OF_MONTH": "first",
            # "absolute-time" is dateparser's parser for ordinary calendar
            # dates ("8 October 2023", ISO strings, ...). Without it only
            # epoch timestamps and relative phrases can match, so nearly
            # every article date would come back as None.
            "PARSERS": ["timestamp", "relative-time", "custom-formats", "absolute-time"],
        }
    )
    return pd.to_datetime(dt) if dt is not None else pd.NaT

# 3. Run the parser over every raw value and record which rows produced a date
df["parsed_date"] = df["date_published"].apply(parse_date)
df["date_readable"] = df["parsed_date"].notna()

# 4. Summarise per source: row count, readable count, then the derived columns
stats = (
    df.groupby("source_domain")
    .agg(
        total_articles=pd.NamedAgg(column="date_readable", aggfunc="size"),
        readable_dates=pd.NamedAgg(column="date_readable", aggfunc="sum"),
    )
    .reset_index()
    .assign(
        unreadable_dates=lambda t: t["total_articles"] - t["readable_dates"],
        pct_readable=lambda t: 100 * t["readable_dates"] / t["total_articles"],
    )
)

print(stats.sort_values("pct_readable"))


In [None]:
import pandas as pd
import dateparser

# 1. Load the filtered articles from Parquet again (rebinds `df`; this cell
#    starts from the file contents, not from the previous cell's frame).
df = pd.read_parquet("data/filtered_articles.parquet")

# 2. Parser: drop the "Updated:" label, then defer to dateparser's defaults
def parse_date(s):
    """Convert one raw date string to a pandas timestamp.

    Non-string or blank input gives ``pd.NaT``. Otherwise every "Updated:"
    occurrence is removed, the remainder is stripped and handed to
    ``dateparser.parse`` with ``RETURN_AS_TIMEZONE_AWARE`` set to False; a
    falsy result gives ``pd.NaT``, anything else goes through
    ``pd.to_datetime``.
    """
    is_blank = not isinstance(s, str) or not s.strip()
    if is_blank:
        return pd.NaT

    text = s.replace("Updated:", "").strip()
    parsed = dateparser.parse(text, settings={"RETURN_AS_TIMEZONE_AWARE": False})
    if not parsed:
        return pd.NaT
    return pd.to_datetime(parsed)

# 3. Parse the whole raw column into `publish_date_cleaned`
df["publish_date_cleaned"] = df["date_published"].apply(parse_date)

# 4. A row is "readable" when parsing produced a non-null timestamp
df["date_readable"] = df["publish_date_cleaned"].notna()

# 5. Per-source totals, readable counts and the derived failure columns
stats = (
    df.groupby("source_domain")
    .agg(
        total_articles=pd.NamedAgg(column="date_readable", aggfunc="size"),
        readable_dates=pd.NamedAgg(column="date_readable", aggfunc="sum"),
    )
    .reset_index()
    .assign(
        unreadable_dates=lambda t: t["total_articles"] - t["readable_dates"],
        pct_readable=lambda t: 100 * t["readable_dates"] / t["total_articles"],
    )
)

# 6. Worst-parsed sources first
print(stats.sort_values("pct_readable"))

# 7. Persist the frame, including the new columns, without the index
df.to_parquet("data/filtered_articles_cleaned.parquet", index=False)
print("Saved cleaned data with publish_date_cleaned to data/filtered_articles_cleaned.parquet")


In [None]:
import pandas as pd
import re

# 1. Load the cleaned articles DataFrame. The same path is written back at the
#    end of this cell, so a re-run reads this cell's own previous output.
df = pd.read_parquet("data/filtered_articles_cleaned.parquet")

# 2. Function to extract YYYY/MM/DD from NYPost URLs
def extract_nypost_date(url):
    """Return the date encoded as ``/YYYY/MM/DD/`` directly after the host.

    Parameters
    ----------
    url : object
        Article URL. Non-string values (None, NaN) are treated as "no date".

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        Midnight of the date found in the URL, or ``pd.NaT`` when the URL has
        no such path prefix or the digits do not form a valid calendar date.
    """
    if not isinstance(url, str):
        return pd.NaT
    m = re.search(r"https?://[^/]+/(\d{4})/(\d{2})/(\d{2})/", url)
    if not m:
        return pd.NaT
    y, mth, d = m.groups()
    try:
        return pd.Timestamp(f"{y}-{mth}-{d}")
    except ValueError:
        # Digits matched the pattern but are not a real date (e.g. month 13).
        return pd.NaT

# 3. Apply to NYPost rows
mask = df["source_domain"] == "nypost.com"
# Write back only the URLs that actually yielded a date. Assigning the raw
# result to every masked row would also store NaT for non-matching URLs and
# wipe out whatever date those rows already had.
url_dates = df.loc[mask, "url"].apply(extract_nypost_date).dropna()
if not url_dates.empty:
    df.loc[url_dates.index, "publish_date_cleaned"] = url_dates

# 4. Update readability
df["date_readable"] = df["publish_date_cleaned"].notna()

# 5. Recompute stats
stats = (
    df.groupby("source_domain")
      .agg(
         total_articles   = ("date_readable", "size"),
         readable_dates   = ("date_readable", "sum")
      )
      .reset_index()
)
stats["unreadable_dates"] = stats["total_articles"] - stats["readable_dates"]
stats["pct_readable"]     = 100 * stats["readable_dates"] / stats["total_articles"]

# 6. Print the updated stats
print(stats.sort_values("pct_readable").to_string(index=False))

# 7. Overwrite the cleaned Parquet
df.to_parquet("data/filtered_articles_cleaned.parquet", index=False)
print("\n✅ Saved updated DataFrame (with NYPost dates) to 'data/filtered_articles_cleaned.parquet'.")


In [None]:
import pandas as pd
import re

# 1. Load the cleaned articles DataFrame. The same path is written back at the
#    end of this cell, so a re-run reads this cell's own previous output.
df = pd.read_parquet("data/filtered_articles_cleaned.parquet")

# 2. Lowercase full English month name -> zero-padded month number (string).
#    Only full names are keys; three-letter forms such as "oct" are not listed.
month_map = {
    "january":   "01", "february": "02", "march":    "03",
    "april":     "04", "may":      "05", "june":     "06",
    "july":      "07", "august":   "08", "september":"09",
    "october":   "10", "november": "11", "december": "12"
}

# 3. Function to extract date from Guardian URL
def extract_guardian_date(url):
    """
    Extract the publication date from a Guardian-style URL:
      https://www.theguardian.com/<section>/<year>/<month-name>/<day>/...

    The month segment is matched case-insensitively against ``month_map``.
    Besides an exact full name, any prefix of at least three letters that
    identifies exactly one month is accepted (e.g. "oct", "sept").

    Parameters
    ----------
    url : object
        Article URL. Non-string values (None, NaN) are treated as "no date".

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        Midnight of the extracted date, or ``pd.NaT`` when the URL does not
        match, the month segment is not recognised, or the parts do not form
        a valid calendar date.
    """
    if not isinstance(url, str):
        return pd.NaT
    m = re.search(r"https?://[^/]+/[^/]+/(\d{4})/([^/]+)/(\d{1,2})/", url)
    if not m:
        return pd.NaT
    year, mon_name, day = m.groups()
    key = mon_name.lower()
    mon = month_map.get(key)
    if not mon and len(key) >= 3:
        # Abbreviated month: accept it only if it is an unambiguous prefix.
        hits = [num for name, num in month_map.items() if name.startswith(key)]
        mon = hits[0] if len(hits) == 1 else None
    if not mon:
        return pd.NaT
    try:
        return pd.Timestamp(f"{year}-{mon}-{int(day):02d}")
    except ValueError:
        # Pattern matched but the date is impossible (e.g. 31 February).
        return pd.NaT

# 4. Apply only to Guardian rows
mask = df["source_domain"] == "theguardian.com"
# Write back only the URLs that actually yielded a date. Assigning the raw
# result to every masked row would also store NaT wherever extraction failed
# and wipe out whatever date those rows already had.
url_dates = df.loc[mask, "url"].apply(extract_guardian_date).dropna()
if not url_dates.empty:
    df.loc[url_dates.index, "publish_date_cleaned"] = url_dates

# 5. Update readability flag
df["date_readable"] = df["publish_date_cleaned"].notna()

# 6. Recompute per-source stats
stats = (
    df.groupby("source_domain")
      .agg(
         total_articles   = ("date_readable", "size"),
         readable_dates   = ("date_readable", "sum")
      )
      .reset_index()
)
stats["unreadable_dates"] = stats["total_articles"] - stats["readable_dates"]
stats["pct_readable"]     = 100 * stats["readable_dates"] / stats["total_articles"]

# 7. Print the updated stats
print(stats.sort_values("pct_readable").to_string(index=False))

# 8. Save back to Parquet
df.to_parquet("data/filtered_articles_cleaned.parquet", index=False)
print("\n✅ Updated 'publish_date_cleaned' for theguardian.com and saved to data/filtered_articles_cleaned.parquet.")


In [None]:
import pandas as pd
import re

# 1. Load the already-cleaned DataFrame. The same path is written back at the
#    end of this cell, so a re-run reads this cell's own previous output.
df = pd.read_parquet("data/filtered_articles_cleaned.parquet")

# 2. Regex for "<4 digits>/<segment>/<1-2 digits>/" at any path depth.
#    The lazy `(?:[^/]+/)*?` skips leading path segments, so the earliest
#    position that fits wins. Groups: (year, month segment, day).
guardian_re = re.compile(
    r"https?://[^/]+/(?:[^/]+/)*?(\d{4})/([^/]+)/(\d{1,2})/"
)

def extract_guardian_from_url(url):
    """Extract a date from a URL using the module-level ``guardian_re``.

    The month segment is identified by its first three letters
    (case-insensitive), so both "oct" and "october" resolve to month 10.

    Parameters
    ----------
    url : object
        Article URL. Non-string values (None, NaN) are treated as "no date".

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        Midnight of the extracted date, or ``pd.NaT`` when the URL does not
        match, the month is not recognised, or the date is impossible.
    """
    if not isinstance(url, str):
        return pd.NaT
    m = guardian_re.search(url)
    if not m:
        return pd.NaT
    year, mon_name, day = m.groups()
    # Explicit English abbreviations: no dependence on the process locale.
    abbreviations = (
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec",
    )
    abbr = mon_name[:3].lower()
    if abbr not in abbreviations:
        return pd.NaT
    month = f"{abbreviations.index(abbr) + 1:02d}"
    try:
        return pd.Timestamp(f"{year}-{month}-{int(day):02d}")
    except ValueError:
        # Pattern matched but the date is impossible (e.g. 31 February).
        return pd.NaT

# 3. Target Guardian rows that still have no cleaned date and fill them from the URL
is_guardian = df["source_domain"] == "theguardian.com"
mask = is_guardian & df["publish_date_cleaned"].isna()
df.loc[mask, "publish_date_cleaned"] = df.loc[mask, "url"].apply(extract_guardian_from_url)

# 4. Refresh the readability flag from the updated column
df["date_readable"] = df["publish_date_cleaned"].notna()

# 5. Per-source totals, readable counts and the derived failure columns
stats = (
    df.groupby("source_domain")
    .agg(
        total_articles=pd.NamedAgg(column="date_readable", aggfunc="size"),
        readable_dates=pd.NamedAgg(column="date_readable", aggfunc="sum"),
    )
    .reset_index()
    .assign(
        unreadable_dates=lambda t: t["total_articles"] - t["readable_dates"],
        pct_readable=lambda t: 100 * t["readable_dates"] / t["total_articles"],
    )
)

print(stats.sort_values("pct_readable").to_string(index=False))

# 6. Write the frame back to the same Parquet file, without the index
df.to_parquet("data/filtered_articles_cleaned.parquet", index=False)
print("\n✔️ Updated guardian dates and saved to data/filtered_articles_cleaned.parquet")


In [3]:
import pandas as pd
import re

# 1. Load the cleaned articles DataFrame. The same path is written back at the
#    end of this cell, so a re-run reads this cell's own previous output.
df = pd.read_parquet("data/filtered_articles_cleaned.parquet")

# 2. Function to extract YYYY/MM/DD from CNN URLs
def extract_cnn_date(url):
    """
    Extract the date from a CNN URL of the form:
       https://www.cnn.com/YYYY/MM/DD/...
    Only the hosts ``cnn.com`` and ``www.cnn.com`` are matched.

    Parameters
    ----------
    url : object
        Article URL. Non-string values (None, NaN) are treated as "no date".

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        Midnight of the date found in the URL, or ``pd.NaT`` when the URL does
        not match or the digits do not form a valid calendar date.
    """
    if not isinstance(url, str):
        return pd.NaT
    m = re.search(r"https?://(?:www\.)?cnn\.com/(\d{4})/(\d{2})/(\d{2})/", url)
    if not m:
        return pd.NaT
    year, month, day = m.groups()
    try:
        return pd.Timestamp(f"{year}-{month}-{day}")
    except ValueError:
        # Digits matched the pattern but are not a real date (e.g. month 13).
        return pd.NaT

# 3. Target CNN rows that still have no cleaned date and fill them from the URL
is_cnn = df["source_domain"] == "cnn.com"
mask = is_cnn & df["publish_date_cleaned"].isna()
df.loc[mask, "publish_date_cleaned"] = df.loc[mask, "url"].apply(extract_cnn_date)

# 4. Refresh the readability flag from the updated column
df["date_readable"] = df["publish_date_cleaned"].notna()

# 5. Per-source totals, readable counts and the derived failure columns
stats = (
    df.groupby("source_domain")
    .agg(
        total_articles=pd.NamedAgg(column="date_readable", aggfunc="size"),
        readable_dates=pd.NamedAgg(column="date_readable", aggfunc="sum"),
    )
    .reset_index()
    .assign(
        unreadable_dates=lambda t: t["total_articles"] - t["readable_dates"],
        pct_readable=lambda t: 100 * t["readable_dates"] / t["total_articles"],
    )
)

# 6. Worst-parsed sources first, printed without the index column
print(stats.sort_values("pct_readable").to_string(index=False))

# 7. Write the frame back to the same Parquet file, without the index
df.to_parquet("data/filtered_articles_cleaned.parquet", index=False)
print("\n✅ CNN dates extracted from URLs and saved to data/filtered_articles_cleaned.parquet")


     source_domain  total_articles  readable_dates  unreadable_dates  pct_readable
 indianexpress.com            2818            2622               196     93.044713
           bbc.com            2376            2253               123     94.823232
      newsweek.com            2163            2153                10     99.537679
    abcnews.go.com             747             747                 0    100.000000
           cnn.com            2085            2085                 0    100.000000
   dailymail.co.uk            2990            2990                 0    100.000000
        apnews.com            3584            3584                 0    100.000000
          cnbc.com             582             582                 0    100.000000
       foxnews.com            2749            2749                 0    100.000000
     express.co.uk            1919            1919                 0    100.000000
 independent.co.uk            6511            6511                 0    100.000000
hind