In [8]:
import pandas as pd
import json
from pathlib import Path

path = Path("prothomalo_articles.jsonl")
rows = []
skipped_lines = 0  # lines that did not yield a JSON object

# Decode as UTF-8. errors="replace" substitutes U+FFFD for undecodable bytes
# so a few corrupt bytes cannot abort the whole load (there is no separate
# latin-1 fallback pass).
with open(path, "r", encoding="utf-8", errors="replace") as f:
    for raw_line in f:
        text = raw_line.strip()
        if not text:
            continue  # blank separator line
        try:
            record = json.loads(text)
        except json.JSONDecodeError:
            # Best-effort load: skip malformed lines, but keep a count so
            # data loss is visible instead of silent.
            skipped_lines += 1
            continue
        if not isinstance(record, dict):
            # Valid JSON that is not an object (e.g. a bare number) cannot be
            # turned into a named-column row alongside the dict records.
            skipped_lines += 1
            continue
        rows.append(record)

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) while reading {path}")

# One row per article; columns are the union of keys seen across records.
df = pd.DataFrame(rows)

# Make every "tags" cell a list so later code can iterate it uniformly.
def _as_tag_list(value):
    """Return *value* as a list: lists pass through, missing -> [], else [value]."""
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    return [value]


if "tags" in df.columns:
    df["tags"] = df["tags"].apply(_as_tag_list)

# Exploratory summary of the loaded articles. Each section is guarded by a
# column check so the report still runs when a field is absent from the data.

print("\n--- BASIC INFO ---")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()
print("\n--- SAMPLE ROWS ---")
# Cap the sample size at the number of rows so small datasets do not raise.
print(df.sample(min(5, len(df))))

print("\n--- MISSING VALUES PER COLUMN ---")
print(df.isna().sum())

if "url" in df.columns:
    print("\n--- DUPLICATE URL COUNT ---", df["url"].duplicated().sum())

if "published_iso" in df.columns:
    # Unparseable timestamps become NaT instead of raising.
    df["published_dt"] = pd.to_datetime(df["published_iso"], errors="coerce")
    print("\n--- DATE RANGE ---")
    print(df["published_dt"].min(), "to", df["published_dt"].max())
    print("\n--- ARTICLES PER YEAR ---")
    print(df["published_dt"].dt.year.value_counts().sort_index())

if "section" in df.columns:
    print("\n--- TOP SECTIONS ---")
    print(df["section"].value_counts().head(20))

if "author" in df.columns:
    print("\n--- TOP AUTHORS ---")
    print(df["author"].value_counts().head(20))

if "word_count" in df.columns:
    print("\n--- WORD COUNT STATS ---")
    print(df["word_count"].describe())
    print("\n--- TOP LONGEST ARTICLES ---")
    # Show only the identifying columns that exist, so a dataset without
    # "url" or "title" does not fail with a KeyError here.
    longest_cols = [c for c in ("url", "title", "word_count") if c in df.columns]
    print(df.nlargest(5, "word_count")[longest_cols])

if "tags" in df.columns:
    # Flatten the per-article tag lists into one Series and count occurrences.
    tag_counts = pd.Series([t for sub in df["tags"] for t in sub]).value_counts()
    print("\n--- TOP TAGS ---")
    print(tag_counts.head(20))
    print(f"\nTotal unique tags: {tag_counts.shape[0]}")

if "published_dt" in df.columns:
    print("\n--- ARTICLES PER MONTH ---")
    published = df["published_dt"]
    if published.dt.tz is not None:
        # Periods cannot carry a timezone. Drop it explicitly (keeping the
        # local wall-clock time) rather than letting to_period() do so with a
        # "will drop timezone information" UserWarning.
        published = published.dt.tz_localize(None)
    print(df.groupby(published.dt.to_period("M")).size().sort_index())



--- BASIC INFO ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206762 entries, 0 to 206761
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   url             206762 non-null  object 
 1   title           206758 non-null  object 
 2   section         193277 non-null  object 
 3   author          184981 non-null  object 
 4   published_iso   206484 non-null  object 
 5   published_text  206737 non-null  object 
 6   tags            206762 non-null  object 
 7   body            189076 non-null  object 
 8   word_count      10146 non-null   float64
dtypes: float64(1), object(8)
memory usage: 14.2+ MB
None

--- SAMPLE ROWS ---
                                                              url  \
141737       https://www.prothomalo.com/sports/cricket/2hug13aa9g   
144657  https://www.prothomalo.com/bangladesh/district/fy74i787z8   
49512    https://www.prothomalo.com/entertainment/song/3hu6slbgnr   
185452   

  print(df.groupby(df["published_dt"].dt.to_period("M")).size().sort_index())
