In [39]:
import os
import re
import json
from datetime import datetime
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

import tqdm
import requests
from pydantic import BaseModel, HttpUrl, field_serializer
import pandas as pd
from dotenv import load_dotenv
import tiktoken
from openai import OpenAI
from sqlite_utils import Database

load_dotenv()
db = Database("data.db")
articles_db = db["article"]

In [20]:
class Article(BaseModel):
    pocket_item_id: int
    given_url: HttpUrl
    resolved_url: HttpUrl
    title: str
    time_added: datetime
    word_count: int
    domain: str
    text: str | None = None
    summary: str | None = None
    summary_attempted_message: str | None = None

    @field_serializer("given_url", "resolved_url")
    def serialize_url(self, url: HttpUrl) -> str:
        return str(url)

## 1. List of likes articles


Load the items already saved in the sqlite db:

In [None]:
q = """
select *
from article
"""
results = db.query(q) # generator
articles = [Article(**item) for item in results]

### From Pocket


In [21]:
POCKET_HEADERS = {
    "Content-Type": "application/json; charset=UTF8",
    "X-Accept": "application/json",
}
POCKET_CONSUMER_KEY = os.getenv("POCKET_CONSUMER_KEY")

#### Authorize


Skip to `Read from .env file` if you have authorized previously. Otherwise:


In [22]:
data = {"consumer_key": POCKET_CONSUMER_KEY, "redirect_uri": "https://127.0.0.1"}
url = "https://getpocket.com/v3/oauth/request"
r = requests.request("Post", url=url, headers=POCKET_HEADERS, json=data)
# {"code": "abcd-1234-abcd-1234", 'state': None}

POCKET_CODE = r.json()["code"]

Replace `your-code` in the following url with the code value you get, and open the url in your browser, grant access, close the tab, then get back here and run the next cell to get access token:

https://getpocket.com/auth/authorize?request_token=your-code&redirect_uri=https://127.0.0.1


In [None]:
data = {"consumer_key": POCKET_CONSUMER_KEY, "code": POCKET_CODE}
url = "https://getpocket.com/v3/oauth/authorize"
r = requests.request("Post", url=url, headers=POCKET_HEADERS, json=data)
# {"access_token": "efgh-1234-efgh-1234", "username": "your-username"}

POCKET_ACCESS_TOKEN = r.json()["access_token"]

Add the `access_token` to `.env` file, so you can skip the authorization steps above next times and just use the following:


#### Read from `.env` file


In [25]:
POCKET_ACCESS_TOKEN = os.getenv("POCKET_ACCESS_TOKEN")

In [36]:
def get_pocket_items(state: str = "archive") -> list[Article]:
    """
    Get all items archive in pocket
    """
    pocket_items = []
    count = 500
    offset = 0
    while True:
        payload = {
            "consumer_key": POCKET_CONSUMER_KEY,
            "access_token": POCKET_ACCESS_TOKEN,
            "state": state,
            "sort": "newest",
            "count": count,
            "offset": offset,
            "detailType": "complete",
            "total": "1",  ## total number of archived articles
        }

        url = "https://getpocket.com/v3/get"
        r = requests.request("Post", url=url, headers=POCKET_HEADERS, json=payload)

        for item in r.json()["list"]:
            pocket_item = r.json()["list"][item]
            try:
                pocket_items.append(
                    Article(
                        pocket_item_id=int(pocket_item["item_id"]),
                        given_url=pocket_item["given_url"],
                        resolved_url=pocket_item["resolved_url"],
                        title=pocket_item["resolved_title"],
                        time_added=datetime.fromtimestamp(
                            int(pocket_item["time_added"])
                        ),
                        word_count=int(pocket_item["word_count"]),
                        domain=pocket_item.get("domain_metadata", {}).get("name", ""),
                    )
                )
            except Exception as e:
                print(f"Error processing item {pocket_item['item_id']}: {e}")
                continue

        if int(r.json()["total"]) > count + offset:
            print(f"Fetching more articles... {count + offset} / {r.json()['total']}")
            offset += count
        else:
            print(f"Fetched {offset + count} articles")
            break

    return pocket_items


pocket_items = get_pocket_items()
print(len(pocket_items))
pocket_items[0]

Fetching more articles... 500 / 3784
Fetching more articles... 1000 / 3784
Fetching more articles... 1500 / 3784
Fetching more articles... 2000 / 3784
Error processing item 3945908435: 1 validation error for Article
resolved_url
  Input should be a valid URL, input is empty [type=url_parsing, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/url_parsing
Fetching more articles... 2500 / 3784
Fetching more articles... 3000 / 3784
Fetching more articles... 3500 / 3784
Fetched 4000 articles
3783


Article(pocket_item_id=5598506, given_url=HttpUrl('https://nat.org/'), resolved_url=HttpUrl('http://nat.org/'), title='Nat Friedman', time_added=datetime.datetime(2025, 1, 15, 11, 20, 58), word_count=451, domain='nat.org', text=None, summary=None, summary_attempted_message=None)

Storing the articles data in the sqlite db:

In [None]:
articles_db.insert_all(
    [item.model_dump() for item in pocket_items], pk="pocket_item_id", ignore=True
)

<Table article (pocket_item_id, given_url, resolved_url, title, time_added, word_count, domain, text, summary, summary_attempted_message)>

Also storing the articles data in a pandas dataframe, just to be able to explore the data if needed:

In [27]:
df = pd.DataFrame([article.model_dump() for article in pocket_items])
df.head(2)

Unnamed: 0,pocket_item_id,given_url,resolved_url,title,time_added,word_count,domain,text,summary,summary_attempted_message
0,5598506,https://nat.org/,http://nat.org/,Nat Friedman,2025-01-15 11:20:58,451,nat.org,,,
1,17661119,https://www.jofreeman.com/joreen/tyranny.htm,https://www.jofreeman.com/joreen/tyranny.htm,THE TYRANNY of STRUCTURELESSNESS,2025-01-24 00:25:45,6486,www.jofreeman.com,,,


## Extract texts using `r.jina.ai`


I will use [Jina Reader](https://jina.ai/reader/) to extract the text of the saved articles. Its free API is limited to 20 requests per minute, but you can pay and use an API key to get 200 requests per minute limit. Visit [their website](https://jina.ai/) to get an API key.


In [53]:
JINA_API_KEY = os.getenv("JINA_API_KEY")

## not used currently
def count_tokens(text: str) -> int:
    ## using gpt-4o tokenizer just to get an estimate
    return len(tiktoken.encoding_for_model("gpt-4o").encode(text))

In [46]:
def get_url_text(url: str, with_api_key: bool = True) -> str:
    request_url = f"https://r.jina.ai/{url}"

    if with_api_key:
        headers = {"Authorization": f"Bearer {JINA_API_KEY}", "X-No-Cache": "true"}
    else:
        headers = {"X-No-Cache": "true"}

    r = requests.get(request_url, headers=headers)

    if r.status_code == 402:
        raise Exception("Token limit reached.")
    if r.status_code == 422:
        print(f"422 error, probably an invalid url: {url}")
        return "Error"
    if r.status_code != 200:
        print(f"Error fetching {url}: {r.status_code}, sleeping for 10 seconds ...")
        print(r.text)
        time.sleep(10)
        return get_url_text(url, with_api_key=with_api_key)
    return r.text

In [None]:
## slow, but less likely to get rate limited

for i, item in enumerate(articles):
    if item.text:
        continue

    print(f"{i}, processing {item.resolved_url} ...")

    item.text = get_url_text(item.resolved_url, with_api_key=False)
    articles_db.update(item.pocket_item_id, {"text": item.text})
    if i % 50 == 0:
        print(f"Processed {i} articles, {len(articles) - i} articles left ...")


3527, processing https://danielwirtz/blog/bottom-up-note-taking-in-capacities ...
422 error, probably an invalid url: https://danielwirtz/blog/bottom-up-note-taking-in-capacities
3558, processing https://www.reddit.com/r/Garmin/comments/1hkjty5/i_walked_100k_steps_in_24hrs/?share_id=1wgtlO8zlZBkSB0lLPRqY&utm_content=1&utm_medium=android_app&utm_name=androidcss&utm_source=share&utm_term=2 ...
3564, processing http://blog.rongarret.info/2024/12/i-have-failed-now-what.html ...
3585, processing https://raymondtukpe.com/exploring-alternatives-to-uuidv4-enter-ulids.html ...
3615, processing https://www.david-dahan.com/blog/comparing-fastapi-and-django ...
3645, processing https://nmn.gl/blog/blog/ai-senior-developer ...
3676, processing https://nikkin.dev/blog/llm-entropy.html ...
3680, processing https://blog.anj.ai/2025/01/llm-token-generation-probabilities.html ...
3682, processing https://danielwirtz/blog/utrecht-ultra-2024-race-report ...
422 error, probably an invalid url: https://dani

In [24]:
## fast, but you'll get rate limited and should try again

# def process_batch(items: list[tuple[int, object]], max_workers=10):
#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         future_to_item = {
#             executor.submit(get_url_text, item.resolved_url): (i, item)
#             for i, item in items
#         }

#         for future in tqdm.tqdm(as_completed(future_to_item), total=len(items)):
#             i, item = future_to_item[future]
#             try:
#                 item.text = future.result()
#             except Exception as e:
#                 print(f"Error processing item {i}: {e}")


# items_with_index = [(i, item) for i, item in enumerate(pocket_items) if not item.text]
# process_batch(items_with_index)


0it [00:00, ?it/s]


## Summarize texts


In [None]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

We will use Google Gemini model with OpenAI Python SDK to summarize text into a few paragraphs:


In [None]:
llm_client = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=os.getenv("GEMINI_API_KEY"),
)


def summarize_text(text: str) -> str:
    try:
        completion = llm_client.chat.completions.create(
            model="gemini-2.0-flash",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize the provided text in 2-5 paragraphs, maintaining the original author's first-person perspective and voice. The summary should read as if the original author wrote it themselves as a condensed version of their full text. Return the summary between <summary> and </summary> tags.",
                },
                {"role": "user", "content": f"<text>{text}</text>"},
            ],
        )
        summary = re.search(
            r"<summary>(.*?)</summary>",
            completion.choices[0].message.content,
            re.DOTALL,
        )
        if summary:
            return True, summary.group(1)
        else:
            return False, completion.choices[0].message.content
    except Exception as e:
        print(f"summary error: {e}, sleeping for 30 seconds ...")
        time.sleep(30)
        return None, None

In [52]:
for i, item in enumerate(articles):
    if item.summary or item.summary_attempted_message:
        continue

    print(f"{i}, processing {item.resolved_url} ...")
    success, message = summarize_text(item.text)
    if success:
        item.summary = message
        articles_db.update(item.pocket_item_id, {"summary": item.summary})
    else:
        item.summary_attempted_message = message
        articles_db.update(item.pocket_item_id, {"summary_attempted_message": item.summary_attempted_message})
        
    if i % 50 == 0:
        print(f"Processed {i} urls, {len(articles) - i} left ...")

3119, processing https://longform.asmartbear.com/stealth-mode/ ...
3695, processing https://data-people-group.github.io/blogs/2025/01/13/docwrangler/ ...
3741, processing https://kellysutton.com/2025/01/18/moving-on-from-react-a-year-later.html ...
3772, processing https://shekhargulati.com/2025/01/18/can-claude-single-call-and-zero-shot-do-what-devin-cant-do/ ...
