In [14]:
import os
from datetime import datetime

import requests
from pydantic import BaseModel, HttpUrl
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

True

## 1. List of liked articles


In [3]:
class Article(BaseModel):
    """
    One saved Pocket article, validated by pydantic.

    Instances are built in ``get_pocket_items`` from the raw items of the
    Pocket response and later dumped into a DataFrame / CSV, so the field
    order here is also the column order of that table.
    """

    # Pocket's "item_id", converted to int.
    pocket_item_id: int
    # URL taken from the item's "given_url"; must be a valid http(s) URL.
    given_url: HttpUrl
    # URL taken from the item's "resolved_url"; an empty value fails validation.
    resolved_url: HttpUrl
    # Filled from the item's "resolved_title".
    title: str
    # Built with datetime.fromtimestamp() from the item's "time_added" epoch value.
    time_added: datetime
    # The item's "word_count", converted to int.
    word_count: int
    # "name" from the item's "domain_metadata", or "" when that is absent.
    domain: str

### a. From Pocket

In [15]:
# Consumer key for the Pocket API, read from the environment
# (None when the variable is not set).
POCKET_CONSUMER_KEY = os.environ.get("POCKET_CONSUMER_KEY")

# Headers sent with every Pocket request in this notebook: the request body
# is JSON and a JSON response is asked for via "X-Accept".
POCKET_HEADERS = dict(
    [
        ("Content-Type", "application/json; charset=UTF8"),
        ("X-Accept", "application/json"),
    ]
)

#### Authorize

Skip to `Read from .env file` if you have authorized previously. Otherwise:

In [None]:
# Ask Pocket's OAuth "request" endpoint for a request token ("code") tied to
# this consumer key; the code is needed for the browser authorization step.
data = {"consumer_key": POCKET_CONSUMER_KEY, "redirect_uri": "https://127.0.0.1"}
url = "https://getpocket.com/v3/oauth/request"
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.post(url, headers=POCKET_HEADERS, json=data, timeout=30)
# Fail here on a 4xx/5xx answer (e.g. a wrong consumer key) instead of
# hitting a confusing JSON/KeyError on the line below.
r.raise_for_status()
# Expected response body: {"code": "abcd-1234-abcd-1234", 'state': None}

POCKET_CODE = r.json()["code"]

Replace `your-code` in the following url with the code value you get, and open the url in your browser, grant access, close the tab, then get back here and run the next cell to get access token:

https://getpocket.com/auth/authorize?request_token=your-code&redirect_uri=https://127.0.0.1


In [None]:
# Exchange the request token (POCKET_CODE), once it has been approved in the
# browser, for a long-lived access token.
data = {"consumer_key": POCKET_CONSUMER_KEY, "code": POCKET_CODE}
url = "https://getpocket.com/v3/oauth/authorize"
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.post(url, headers=POCKET_HEADERS, json=data, timeout=30)
# Fail here on a 4xx/5xx answer (e.g. the code was never approved) instead of
# hitting a confusing JSON/KeyError on the line below.
r.raise_for_status()
# Expected response body: {"access_token": "efgh-1234-efgh-1234", "username": "your-username"}

POCKET_ACCESS_TOKEN = r.json()["access_token"]

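Add the `access_token` to your `.env` file so that next time you can skip the authorization steps above and just use the following. After editing `.env`, re-run `load_dotenv()` (or restart the kernel) so the new value is picked up: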

#### Read from `.env` file

In [16]:
# Access token saved from an earlier authorization, read from the environment
# (None when the variable is not set).
POCKET_ACCESS_TOKEN = os.environ.get("POCKET_ACCESS_TOKEN")

In [6]:
def get_pocket_items(state: str = "archive") -> list[Article]:
    """
    Fetch all Pocket items in the given state, paging through the API.

    Pages of 500 items are requested from ``https://getpocket.com/v3/get``
    (with ``sort`` set to ``"newest"``) until the ``total`` reported by the
    API has been covered. Items that cannot be turned into an ``Article``
    (for example because ``resolved_url`` is empty) are reported with
    ``print`` and skipped, so the result can be shorter than ``total``.

    Args:
        state: Value sent as the ``state`` filter of the request; defaults
            to ``"archive"``.

    Returns:
        The successfully parsed items as ``Article`` objects, in the order
        they appear in the responses.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    pocket_items = []
    count = 500  # page size requested per call
    offset = 0
    url = "https://getpocket.com/v3/get"
    while True:
        payload = {
            "consumer_key": POCKET_CONSUMER_KEY,
            "access_token": POCKET_ACCESS_TOKEN,
            "state": state,
            "sort": "newest",
            "count": count,
            "offset": offset,
            "detailType": "complete",
            "total": "1",  # so the response carries "total", used for paging below
        }

        # The timeout keeps a stalled connection from hanging the notebook.
        r = requests.post(url, headers=POCKET_HEADERS, json=payload, timeout=60)
        # Surface auth / rate-limit failures directly instead of as a
        # JSON decoding error or KeyError further down.
        r.raise_for_status()
        # Parse the body once per page; it was previously re-parsed per item.
        body = r.json()
        raw_items = body["list"]
        total = int(body["total"])

        # Iterate by key and index back in, so an empty container of any
        # kind simply yields no items.
        for key in raw_items:
            pocket_item = raw_items[key]
            try:
                pocket_items.append(
                    Article(
                        pocket_item_id=int(pocket_item["item_id"]),
                        given_url=pocket_item["given_url"],
                        resolved_url=pocket_item["resolved_url"],
                        title=pocket_item["resolved_title"],
                        time_added=datetime.fromtimestamp(
                            int(pocket_item["time_added"])
                        ),
                        word_count=int(pocket_item["word_count"]),
                        domain=pocket_item.get("domain_metadata", {}).get("name", ""),
                    )
                )
            except Exception as e:
                # Best effort: report and skip the bad item. Fall back to the
                # mapping key so a missing "item_id" cannot raise in here.
                print(f"Error processing item {pocket_item.get('item_id', key)}: {e}")
                continue

        if total > count + offset:
            print(f"Fetching more articles... {count + offset} / {total}")
            offset += count
        else:
            # Report what was actually covered; the last page is usually
            # partial, so offset + count would overshoot the real total.
            print(f"Fetched {min(offset + count, total)} articles")
            break

    return pocket_items


# Download the items using the default state ("archive") and keep them for
# the cells below.
pocket_items = get_pocket_items()
# Number of items that were parsed into Article objects.
print(len(pocket_items))
# Show the first parsed article as the cell output (quick sanity check).
pocket_items[0]

Fetching more articles... 500 / 3692
Fetching more articles... 1000 / 3692
Fetching more articles... 1500 / 3692
Fetching more articles... 2000 / 3692
Error processing item 3945908435: 1 validation error for Article
resolved_url
  Input should be a valid URL, input is empty [type=url_parsing, input_value='', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/url_parsing
Fetching more articles... 2500 / 3692
Fetching more articles... 3000 / 3692
Fetching more articles... 3500 / 3692
Fetched 4000 articles
3691


Article(pocket_item_id=5598506, given_url=HttpUrl('https://nat.org/'), resolved_url=HttpUrl('http://nat.org/'), title='Nat Friedman', time_added=datetime.datetime(2025, 1, 15, 11, 20, 58), word_count=451, domain='nat.org')

In [11]:
# Turn every Article into a plain dict (one table row each).
rows = []
for article in pocket_items:
    rows.append(article.model_dump())
df = pd.DataFrame(rows)
# Save the table without the index so the download step can be skipped later.
df.to_csv("pocket_articles.csv", index=False)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,pocket_item_id,given_url,resolved_url,title,time_added,word_count,domain
0,5598506,https://nat.org/,http://nat.org/,Nat Friedman,2025-01-15 11:20:58,451,nat.org
1,42378269,https://matt.might.net/articles/shell-scripts-...,http://matt.might.net/articles/shell-scripts-f...,"3 shell scripts to improve your writing, or ""M...",2024-12-29 12:19:45,1881,matt.might.net
2,132470062,https://www.kalzumeus.com/2012/01/23/salary-ne...,https://www.kalzumeus.com/2012/01/23/salary-ne...,"Salary Negotiation: Make More Money, Be More V...",2024-11-15 12:07:03,6794,www.kalzumeus.com
3,172658679,https://stevehanov.ca/blog/index.php?id=132,http://stevehanov.ca/blog/index.php?id=132,20 lines of code that will beat A/B testing ev...,2025-01-15 00:24:49,970,stevehanov.ca
4,210539245,https://digital-photography-school.com/6-reaso...,https://digital-photography-school.com/6-reaso...,9 Reasons Why Photography Matters,2024-12-02 23:13:34,2312,Digital Photography School


### b. From csv file

In [17]:
# Reload the articles table written by the export cell instead of calling
# the Pocket API again.
df = pd.read_csv(filepath_or_buffer="pocket_articles.csv")
# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,pocket_item_id,given_url,resolved_url,title,time_added,word_count,domain
0,5598506,https://nat.org/,http://nat.org/,Nat Friedman,2025-01-15 11:20:58,451,nat.org
1,42378269,https://matt.might.net/articles/shell-scripts-...,http://matt.might.net/articles/shell-scripts-f...,"3 shell scripts to improve your writing, or ""M...",2024-12-29 12:19:45,1881,matt.might.net
2,132470062,https://www.kalzumeus.com/2012/01/23/salary-ne...,https://www.kalzumeus.com/2012/01/23/salary-ne...,"Salary Negotiation: Make More Money, Be More V...",2024-11-15 12:07:03,6794,www.kalzumeus.com
3,172658679,https://stevehanov.ca/blog/index.php?id=132,http://stevehanov.ca/blog/index.php?id=132,20 lines of code that will beat A/B testing ev...,2025-01-15 00:24:49,970,stevehanov.ca
4,210539245,https://digital-photography-school.com/6-reaso...,https://digital-photography-school.com/6-reaso...,9 Reasons Why Photography Matters,2024-12-02 23:13:34,2312,Digital Photography School


## Extract texts using `r.jina.ai`

In [18]:
# API key for the Jina reader service, read from the environment
# (None when the variable is not set).
JINA_API_KEY = os.environ.get("JINA_API_KEY")