In [None]:
! pip install wikipedia-api ollama pqdm psycopg2

# Select our targets

This cell selects `page_id` and `page_title` from the `page` table in the `wikipedia` database and joins it with the `target_entities` table to keep only pages that are marked as targets. Only pages in the main namespace (`page_namespace = 0`) are included. The result is the list of page IDs and titles used for inference.

In [2]:
%%sql
-- Return (page_id, page_title) for every page that has a matching row in
-- target_entities; the inner join drops pages without one, and the WHERE
-- clause keeps only rows whose page_namespace is 0.
SELECT p.page_id, p.page_title FROM page AS p
INNER JOIN target_entities AS te
    ON te.page_id = p.page_id
WHERE p.page_namespace = 0

Unnamed: 0,page_id,page_title
0,64618347,Zohran_Mamdani
1,80285182,Rama_Duwaji
2,80669,Mira_Nair
3,31251727,Mahmood_Mamdani
4,51137169,Millie_Bobby_Brown
...,...,...
9995,337907,Drew_Pinsky
9996,1657125,Mary_McCormack
9997,68912211,Josh_Downs
9998,16568,John_Stauber


# Generic Housekeeping

This sets up ambient config and various data structures for inference and result handling.

In [3]:
import ollama
import pydantic
import math
import dataclasses
import uuid
import random
from tqdm.notebook import tqdm
import pandas as pd
import wikipediaapi
from sqlalchemy.orm import Mapped, mapped_column, declarative_base
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy import String, Integer, Float, Boolean, UUID, create_engine
import time

# Register tqdm's pandas integration (provides DataFrame.progress_apply, used during enrichment).
tqdm.pandas()

# Source database (MySQL, schema `wikipedia`); the enrichment checkpoint is also written here.
DATABASE_URL = 'mysql://root@localhost:3306/wikipedia'
DATABASE_ENGINE = create_engine(DATABASE_URL)

# Result database (PostgreSQL) that receives the inference results.
# AUTOCOMMIT isolation: statements executed on this engine are committed without an explicit commit.
RESULT_URL = 'postgresql://rickmark@localhost:5432/cncld'
RESULT_ENGINE = create_engine(RESULT_URL, isolation_level="AUTOCOMMIT")

In [5]:
@dataclasses.dataclass
class InferenceContext:
    """Everything the model is shown about a single subject.

    Attributes:
        page_id: Id of the subject's row in the `page` table.
        page_title: Human-readable title of the subject.
        page_type: Kind of subject (the mapping step currently always passes 'person').
        recent_news: News snippets, one string per item (may be empty).
        wikipedia_body: Wikipedia text describing the subject.
    """
    page_id: int
    page_title: str
    page_type: str
    recent_news: list[str]
    wikipedia_body: str

    def __str__(self):
        """Render the context as the text block appended to the task prompt."""
        # Join outside the f-string: a backslash inside an f-string replacement
        # field is a SyntaxError on Python versions before 3.12.
        news_block = '\n'.join(self.recent_news)
        return f"{self.page_title}\n\n{self.wikipedia_body}\n\n# Recent News:\n{news_block}"

# Structured verdict the model must return: this model's JSON schema is passed to
# ollama as the response `format`, and the raw reply is validated against it.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably end up in the generated JSON schema and change what the model is sent.
class CancelInferenceResult(pydantic.BaseModel):
    is_toxic: bool  # the verdict; stored as `canceled` on the result row
    revocable: bool  # whether the verdict could be reversed
    rationale: str  # the model's explanation for the verdict
    penance: str | None  # steps that would reverse the verdict; may be null

@dataclasses.dataclass()
class CancelResult:
    """Pairs one inference input with the model's parsed verdict and a confidence score."""
    context: InferenceContext  # the input the verdict was produced for
    result: CancelInferenceResult  # the validated model response
    confidence: float  # exp() of a single token's log-probability from the response

Base = declarative_base()
@dataclasses.dataclass()
class Result(Base):
    """Row of the `the_list` table: the stored verdict for one subject."""
    __tablename__ = 'the_list'

    # Surrogate key, generated client-side when no id is supplied.
    id: Mapped[uuid.UUID] = mapped_column(UUID, primary_key=True, default=uuid.uuid4, index=True)
    # `title` is the ON CONFLICT target of the upsert, which PostgreSQL only accepts
    # when a unique index/constraint covers the column — declare it so that
    # metadata-driven table creation produces one.
    title: Mapped[str] = mapped_column(String, unique=True)
    canceled: Mapped[bool] = mapped_column(Boolean)
    revocable: Mapped[bool] = mapped_column(Boolean)
    confidence: Mapped[float] = mapped_column(Float)
    rationale: Mapped[str] = mapped_column(String)
    penance: Mapped[str | None] = mapped_column(String, nullable=True)


    def __str__(self):
        """Human-readable summary: verdict, confidence, rationale and (if any) penance."""
        verdict = 'Canceled' if self.canceled else 'Not Canceled'
        # `penance` is nullable, so only append it when it is actually present;
        # concatenating None to a str would raise TypeError.
        if self.canceled and self.penance is not None:
            penance_line = "\nPenance: " + self.penance
        else:
            penance_line = ""
        return f"{self.title}: {verdict} (confidence: {self.confidence:.2f})\nRationale: {self.rationale}{penance_line}\n\n"

# System message sent with every request: fixes the persona and evaluation
# stance the model is asked to adopt.
SYSTEM_PROMPT = """
You are a competent, reasonable, investigative journalist.  You are a supporter of LGBT rights and evaluate the subjects of your investigation for their toxicity to the LGBT community.  You do not exclude trans individuals from this definition.  You are fair and understand that people can change over time.  You provide thoughtful rationale to your opinions and also provide an explanation of what if anything the subject of the investigation could do to no longer be considered toxic to the LGBT community.
"""

# Prefix of the user message: states the task and ends with a "# Context"
# heading, after which the stringified InferenceContext is appended.
TASK_PROMPT = """
# Task

Given the following context, determine whether the subject of the investigation is toxic to the LGBT community.  Explain why you made this decision.  Determine whether the subject of the investigation can have this decision revoked.  Explain what steps would need to be taken to revoke this decision.

# Context

"""


# Enrichment

This block enriches each page with its Wikipedia summary text (`page.summary`). (In the future it may also add news articles, etc.)

In [None]:
# Initialize the Wikipedia API wrapper (one shared client for all lookups).
# You should set a meaningful User-Agent, as per MediaWiki API guidelines.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='CnclD/1.0 (rickmark@outlook.com)',
    language='en'  # query the English-language Wikipedia
)

def get_body(body_row):
    """Fetch the Wikipedia summary for one row of the target table.

    Args:
        body_row: Row-like mapping (e.g. a pandas Series) with a 'page_title' entry.

    Returns:
        The page's summary text, or None when the page does not exist or the
        lookup failed for any reason.
    """
    title = None
    try:
        title = body_row['page_title']
        page = wiki_wiki.page(title)

        if page.exists():
            return page.summary
    except Exception as e:
        # Best-effort enrichment: a failed lookup must not abort the whole run,
        # but it is reported so that dropped rows are not silently lost.
        print(f"Error fetching Wikipedia summary for {title!r}: {e}")
    return None

# Work on a copy so the query result held in `target_entities` stays untouched.
df = target_entities.copy()
# One lookup per row; progress_apply is the tqdm-instrumented DataFrame.apply.
df['page_body'] = df.progress_apply(get_body, axis=1)
# Keep only the rows for which a body was actually retrieved.
has_body = df['page_body'].notnull()
df = df[has_body]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [61]:
# Checkpoint to the database
# Writes the enriched frame to table `wiki_body` without the index column, in
# batches of 5000 rows. if_exists='replace' means an existing `wiki_body`
# table is replaced rather than appended to.
df.to_sql('wiki_body', DATABASE_ENGINE, if_exists='replace', index=False, chunksize=5000)

1000

In [27]:
# Map to the input data shape for inference

def map_row(inference_row):
    """Build the InferenceContext for one enriched row.

    Underscores in the stored page title are turned into spaces; the page type
    is fixed to 'person' and no news items are attached.
    """
    context_fields = {
        'page_id': inference_row['page_id'],
        'page_title': inference_row['page_title'].replace("_", " "),
        'page_type': 'person',
        'recent_news': [],
        'wikipedia_body': inference_row['page_body'],
    }
    return InferenceContext(**context_fields)

# One InferenceContext per enriched row, in DataFrame order.
INPUT_CONTEXTS = [
    map_row(row)
    for _, row in tqdm(df.iterrows(), total=len(df))
]
# Preview only: the full list carries every article body and would flood the cell output.
INPUT_CONTEXTS[:3]

  0%|          | 0/1000 [00:00<?, ?it/s]

[InferenceContext(page_id=192481, page_title='50 Cent', page_type='person', recent_news=[], wikipedia_body='Curtis James Jackson III (born July 6, 1975), known professionally as 50 Cent, is an American rapper, actor, and television producer. Born in Queens, a borough of New York City, Jackson began pursuing a musical career in 1996. In 1999–2000, he recorded his debut album, Power of the Dollar, for Columbia Records. During a shooting in May 2000, he was struck by nine bullets, causing its release to be canceled and Jackson to be dropped from the label. His 2002 mixtape Guess Who\'s Back? was discovered by Detroit rapper Eminem, who signed Jackson to his label Shady Records (an imprint of Interscope Records) that year.\nJackson\'s debut studio album, Get Rich or Die Tryin\' (2003), was released to critical acclaim and commercial success. Peaking atop the Billboard 200, it spawned the Billboard Hot 100-number one singles "In da Club" and "21 Questions" (featuring Nate Dogg), and receive

# Perform Inference

This block processes each target with the prompt and creates the resulting judgement.

In [28]:
def process_input(single_input: InferenceContext) -> CancelResult:
    """Ask the model for a verdict on one subject.

    Sends the system prompt plus the task prompt followed by the stringified
    context, constrains the reply to the CancelInferenceResult JSON schema, and
    derives a confidence value from one token's log-probability.

    Args:
        single_input: The context describing the subject to evaluate.

    Returns:
        A CancelResult holding the given context, the parsed verdict and the confidence.

    Raises:
        ValueError: If the response carries too few token log-probabilities.
        Exception: Anything raised by the ollama client or by response validation.
    """
    client = ollama.Client(timeout=600.0)
    response = client.chat(
        model="llama4",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": TASK_PROMPT + str(single_input)}
        ],
        logprobs=True,
        options={
            'num_predict': 2048
        },
        format=CancelInferenceResult.model_json_schema()
    )

    parsed_response = CancelInferenceResult.model_validate_json(response.message.content)

    # NOTE(review): index 6 is presumably the token that carries the `is_toxic`
    # value in the JSON reply — this depends on the model's tokenizer; confirm.
    verdict_token_index = 6
    token_logprobs = response.logprobs
    if not token_logprobs or len(token_logprobs) <= verdict_token_index:
        raise ValueError(
            f"Response has no log-probability at token index {verdict_token_index}"
        )

    return CancelResult(
        # Use the function's own argument; the original read a global loop
        # variable, which only worked when called from that specific loop.
        context=single_input,
        result=parsed_response,
        confidence=math.exp(token_logprobs[verdict_token_index].logprob)
    )

def process_result(result_input_context: InferenceContext) -> Result:
    """Run inference for one context and flatten the outcome into a Result row.

    The title has underscores replaced by spaces; the verdict's `is_toxic` flag
    is stored as `canceled`.
    """
    outcome = process_input(result_input_context)
    verdict = outcome.result
    display_title = outcome.context.page_title.replace('_', ' ')

    row_values = dict(
        title=display_title,
        canceled=verdict.is_toxic,
        revocable=verdict.revocable,
        rationale=verdict.rationale,
        penance=verdict.penance,
        confidence=outcome.confidence,
    )
    return Result(**row_values)


In [1]:
# Requires the imports/config cell to have been run in this kernel
# (uses random, time, pd, tqdm and the PostgreSQL `insert`).
RESULTS = []
# Process subjects in random order (shuffles INPUT_CONTEXTS in place).
random.shuffle(INPUT_CONTEXTS)

with RESULT_ENGINE.connect() as conn:
    for input_context in tqdm(INPUT_CONTEXTS):
        # Nothing to evaluate without article text.
        if input_context.wikipedia_body is None:
            continue

        try:
            print(f"Processing {input_context.page_title} ({len(input_context.wikipedia_body)} bytes)")
            inference_result = process_result(input_context)
            print(str(inference_result))
            RESULTS.append(inference_result)

            # Column values set on the instance, minus SQLAlchemy's private state.
            data = {k: v for k, v in inference_result.__dict__.items() if not k.startswith('_')}

            # Upsert keyed on title: on conflict, overwrite each column with the
            # matching value from the row that was proposed for insertion.
            stmt = insert(inference_result.__table__).values(data)
            stmt = stmt.on_conflict_do_update(
                index_elements=[inference_result.__table__.c.title],
                set_=dict(
                    canceled=stmt.excluded.canceled,
                    revocable=stmt.excluded.revocable,
                    rationale=stmt.excluded.rationale,
                    penance=stmt.excluded.penance,
                    confidence=stmt.excluded.confidence,
                )
            )
            conn.execute(stmt)
            # Brief pause between subjects.
            time.sleep(0.5)
        except Exception as e:
            # Best effort: report the failure and continue with the next subject.
            print(f"Error processing {input_context.page_title}: {e}")

RESULT_DF = pd.DataFrame(RESULTS)
RESULT_DF

NameError: name 'random' is not defined