In [1]:
! pip install presto-python-client pandas numpy sqlalchemy mariadb ollama pydantic tqdm ipywidgets wikipyedia-md mysql-connector-python

Collecting presto-python-client
  Downloading presto_python_client-0.8.4-py3-none-any.whl.metadata (2.0 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.4.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting sqlalchemy
  Using cached sqlalchemy-2.0.45-py3-none-any.whl.metadata (9.5 kB)
Collecting mariadb
  Downloading mariadb-1.1.14.tar.gz (111 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[38 lines of output][0m
  [31m   [0m /bin/sh: mariadb_config: command not found
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File [35m"/opt/homebrew/anaconda3/envs/cncld/lib/python3.13/site-pack

In [63]:
import os
import glob
import pandas as pd
from sqlalchemy import create_engine
import ollama
import pydantic
import dataclasses
import tqdm
import math

DATABASE_URL = 'mariadb+mariadbconnector://localhost:3033/wikipedia?unix_socket=/tmp/mysql.sock'

DATA_PATH = os.path.join(os.getcwd(), './ext/')

INDEX_FILE = [ name for name in glob.glob(os.path.join(DATA_PATH, '*.txt')) if 'multistream-index' in name ][0]

tqdm.tqdm.pandas()

In [64]:
%%sql
CREATE TABLE IF NOT EXISTS multistream_index (
    page_id bigint NOT NULL,
    stream_offset bigint NOT NULL,
    title text NOT NULL,
    PRIMARY KEY (page_id),
    INDEX (title),
    FOREIGN KEY (page_id) REFERENCES page(page_id)
);

Unnamed: 0,Count


In [None]:
%%sql
CREATE TABLE IF NOT EXISTS multistream_data (
    page_id bigint NOT NULL,
    body longtext NOT NULL,
    markdown_text longtext NOT NULL,
    PRIMARY KEY (page_id),
    FOREIGN KEY (page_id) REFERENCES page(page_id)
);

In [None]:
%%sql
CREATE TABLE IF NOT EXISTS target_entities (
    page_id bigint NOT NULL,
    entity_type int NOT NULL,
    PRIMARY KEY (page_id),
    FOREIGN KEY (page_id) REFERENCES page(page_id)
);

In [None]:
%%sql
CREATE TABLE IF NOT EXISTS target_news_entries (
    id bigint NOT NULL AUTO_INCREMENT,
    page_id bigint NOT NULL,
    url text NOT NULL,
    title text NOT NULL,
    full_text longtext NOT NULL,
    markdown_text longtext NOT NULL,
    PRIMARY KEY (id),
    INDEX (page_id),
    FOREIGN KEY (page_id) REFERENCES page(page_id)
)

In [65]:
with open(INDEX_FILE, 'r') as index:
    lines = [ row for row in [ line.split(':', maxsplit=3) for line in index.readlines() ] if len(row) == 3 ]
    INDEX_ROWS = pd.DataFrame(lines, columns=['stream_offset', 'page_id', 'title'])

In [66]:
engine = create_engine(DATABASE_URL)
INDEX_ROWS.to_sql('multistream_index', engine, if_exists='replace', index=False, chunksize=5000)

18636547

In [67]:
@dataclasses.dataclass
class InferenceContext:
    page_id: int
    page_title: str
    page_type: str
    recent_news: list[str]
    wikipedia_body: str

    def __str__(self):
        return f"{self.page_title}\n\n{self.wikipedia_body}\n\n# Recent News:\n{'\n'.join(self.recent_news)}"

class CancelInferenceResult(pydantic.BaseModel):
    is_toxic: bool
    revocable: bool
    rationale: str
    penance: str | None

@dataclasses.dataclass()
class CancelResult:
    context: InferenceContext
    result: CancelInferenceResult
    confidence: float

@dataclasses.dataclass()
class Result:
    page_id: int
    page_title: str
    page_type: str
    result: bool
    revocable: bool
    confidence: float
    rationale: str
    penance: str | None

SYSTEM_PROMPT = """
You are a competent, reasonable, investigative journalist.  You are a supporter of LGBT rights and evaluate the subjects of your investigation for their toxicity to the LGBT community.  You do not exclude trans individuals from this definition.  You are fair and understand that people can change over time.  You provide thoughtful rationale to your opinions and also provide an explanation of what if anything the subject of the investigation could do to no longer be considered toxic to the LGBT community.
"""

TASK_PROMPT = """
# Task

Given the following context, determine whether the subject of the investigation is toxic to the LGBT community.  Explain why you made this decision.  Determine whether the subject of the investigation can have this decision revoked.  Explain what steps would need to be taken to revoke this decision.

# Context

"""

def process_input(input_context: InferenceContext) -> CancelResult:
    response = ollama.chat(
        model="llama4",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": TASK_PROMPT + str(input_context)}
        ],
        logprobs=True,
        format=CancelInferenceResult.model_json_schema()
    )

    parsed_response = CancelInferenceResult.model_validate_json(response.message.content)

    return CancelResult(
        context=input_context,
        result=parsed_response,
        confidence=math.exp(response.logprobs[6].logprob)
    )

def process_result(input_context: InferenceContext) -> Result:
    inference_result = process_input(input_context)
    return Result(
        page_id=inference_result.context.page_id,
        page_title=inference_result.context.page_title,
        page_type=inference_result.context.page_type,
        result=inference_result.result.is_toxic,
        revocable=inference_result.result.revocable,
        rationale=inference_result.result.rationale,
        penance=inference_result.result.penance,
        confidence=inference_result.confidence
    )


In [68]:
INPUT_CONTEXTS = [
    InferenceContext(
        page_id=100,
        page_title="Nicki Manaj",
        page_type="person",
        recent_news=["Nicki Manaj hates gays"],
        wikipedia_body="""
                       "Onika Tanya Maraj-Petty (born December 8, 1982), known professionally as Nicki Minaj (/ˈnɪki mɪˈnɑːʒ/ ⓘ NIK-ee min-AHZH), is a Trinidadian rapper, singer, and songwriter based in the United States. Dubbed the "Queen of Rap" and one of the most influential rappers of all time, she is noted for her dynamic rap flow, witty lyrics, musical versatility, and alter egos, and is credited as a driving force in the mainstream resurgence of female rap since the 2010s. Raised in New York City, Minaj began rapping professionally in the early 2000s and gained recognition with her three mixtapes between 2007 and 2009."""
    )
]

In [69]:
RESULTS = pd.DataFrame(map(process_result, tqdm.notebook.tqdm(INPUT_CONTEXTS)))

  0%|          | 0/1 [00:00<?, ?it/s]

In [71]:
RESULTS

Unnamed: 0,page_id,page_title,page_type,result,revocable,confidence,rationale,penance
0,100,Nicki Manaj,person,True,True,0.297844,Nicki Minaj has expressed anti-LGBTQ+ views in...,To no longer be considered toxic to the LGBT c...
