In [None]:
import importlib
from pathlib import Path

import pandas as pd
from IPython.display import Markdown

from cuery import pprint
from cuery.cli import set_env_vars
from cuery.tools import topics

GDRIVE = Path("~/thomas@graphext.com - Google Drive/Shared drives/Solutions").expanduser()

In [None]:
set_env_vars(apify_secrets=False)

# Electric vehicles

In [None]:
DATA_DIR = GDRIVE / "Research/tractor_v2/SEO"

veh = pd.read_csv(DATA_DIR / "electric_vehicles_twitter.csv")
veh

# Manual taxonomy

In [None]:
taxonomy_dict = {
    "Vehicle types and technology": [
        "Electric vehicles",
        "Hybrid vehicles",
        "Hydrogen vehicles",
        "Battery technology",
        "Charging infrastructure",
    ],
    "Manufacturing and Industry": [
        "Automobile manufacturing",
        "Battery production",
        "Manufacturing locations",
        "Industry challenges",
        "Corporate strategies",
    ],
    "Market and Economics": [
        "Sales trends",
        "Market competition",
        "Pricing and subsidies",
        "Trade and tariffs",
        "Investment and finance",
    ],
    "Environmental impact and sustainability": [
        "Carbon emissions",
        "Resource extraction",
        "Battery recycling",
        "Renewable energy",
        "Pollution and health effects",
    ],
    "Government and policy": [
        "Regulations and incentives",
        "Subsidies and credits",
        "Trade policy and disputes",
        "Environmental policy",
        "Transportation infrastructure",
    ],
    "Consumer experience and usage": [
        "Vehicle performance",
        "Range and charging",
        "Maintenance and reliability",
        "Safety and incidents",
        "User opinions",
    ],
    "Logistics and fleet management": [
        "Cargo and delivery vehicles",
        "Fleet electrification",
        "Public transportation",
        "Shared mobility",
        "Vehicle telematics",
    ],
}

taxonomy = topics.Topics(topics=taxonomy_dict)
taxonomy

In [None]:
assigner = topics.TopicAssigner(topics=taxonomy, texts=veh.text[:20])
tt = await assigner(n_concurrent=100)
tt.to_pandas()

# Simple Oneshot

In [None]:
extractor = topics.TopicExtractor(
    texts=veh.text,
    n_topics=10,
    n_subtopics=5,
    instructions="The texts contain tweets about electric vehicles",
    model="openai/gpt-4.1-mini",
    max_dollars=0.5,
    max_tokens=500_000,
    max_texts=10_000,
)

In [None]:
pprint(extractor.task)

In [None]:
taxonomy = await extractor()
taxonomy

In [None]:
assigner = topics.TopicAssigner(topics=taxonomy, texts=veh.text[:10])
pprint(assigner.task)

In [None]:
tt = await assigner(n_concurrent=100)
tt.to_pandas()

# Multitopic

In [None]:
assigner = topics.MultiTopicAssigner(topics=taxonomy, texts=veh.text[:10])
tt = await assigner(n_concurrent=100)
tt_df = tt.to_pandas(explode=False)
tt_df

# Flex

## Topic Extraction

In [None]:
# NOTE: this rebinds `topics`. The first cell imported it from `cuery.tools`; from here
# on the name refers to the `cuery.tools.flex` variant, so cells above this one that
# use `topics` will pick up the flex module if they are re-run afterwards.
from cuery.tools.flex import topics

# Reload so that edits made to the module's source are picked up without a restart.
importlib.reload(topics)

# The flex extractor is given whole records (a slice of the `veh` frame) plus the
# list of attributes to use, instead of the bare `texts` passed to the extractor in
# the "Simple Oneshot" section.
extractor = topics.TopicExtractor(
    records=veh[:200],
    n_topics=10,
    n_subtopics=5,
    instructions="The records contain tweets about electric vehicles",
    min_ldist=2,
    model="openai/gpt-4.1-mini",
    max_samples=50,
    attrs=["text", "viewCount", "author.userName"],
    record_format="attr_wise",
)

pprint(extractor.task)

In [None]:
result = await extractor()
result

In [None]:
result.to_dict()

In [None]:
# Markdown(extractor.task.queries[0]["messages"][0]["content"])

## Assignment

In [None]:
taxonomy_dict = {
    "Vehicle types and technology": [
        "Electric vehicles",
        "Hybrid vehicles",
        "Hydrogen vehicles",
    ],
    "Manufacturing and Industry": [
        "Automobile manufacturing",
        "Battery production",
    ],
    "Market and Economics": [
        "Sales trends",
        "Market competition",
    ],
}

taxonomy = topics.Topics(topics=taxonomy_dict)
taxonomy

In [None]:
assigner = topics.TopicAssigner(
    records=veh[:10],
    topics=taxonomy,
    attrs=["text", "viewCount", "author.userName"],
    record_format="text",
    model="openai/gpt-4.1-mini",
)

pprint(assigner.task)

In [None]:
labels = await assigner(n_concurrent=100)
labels

## Scorer

In [None]:
from cuery.tools.flex import score

# Reload so that edits made to the module's source are picked up without a restart.
importlib.reload(score)

# One record per search keyword; each record is a dict with a single "keyword" field.
kwds = ["what is an SUV", "Peugeot concesionarios", "SUV second hand", "electric car reviews"]
context = [{"keyword": kwd} for kwd in kwds]

# Integer score in the range 0..10, described by `description`.
# NOTE(review): `name` ends with a trailing space ("Purchase Probability "). If the
# name is used as an output field or column label this is probably unintended —
# confirm against Scorer before changing it.
scorer = score.Scorer(
    name="Purchase Probability ",
    type="integer",
    min=0,
    max=10,
    description="The likelihood of a user completing a purchase or taking a commercial action based on the keyword within the next 30 days.",
    records=context,
    record_format="text",
)

In [None]:
pprint(scorer.task)

In [None]:
result = await scorer(n_concurrent=100)
result

## Entities

In [None]:
# NOTE: importlib, pprint and set_env_vars are already imported (and set_env_vars
# already called) in the first cells of the notebook; they are repeated here,
# presumably so this section can be run on its own after a kernel restart.
import importlib

from cuery import pprint
from cuery.cli import set_env_vars
from cuery.tools.flex import entities

# Reload so that edits made to the module's source are picked up without a restart.
importlib.reload(entities)

set_env_vars(apify_secrets=False)

# Entity types to extract: type name -> short description of what belongs to it.
ents = {
    "brand_company": "A brand, label or company name",
    "product_service": "A product, item or service name",
    "technology": "A technology, software or hardware name",
    "other": "Any other relevant entity that does not fit into the above categories",
}

# Three small hand-written records, each with a single "content" field.
ai_overviews = [
    {"content": "Tesla is a leading electric vehicle manufacturer."},
    {"content": "The iPhone is a popular smartphone by Apple."},
    {
        "content": "Google's AI research focuses on machine learning and natural language processing."
    },
]

# NOTE: rebinds `extractor`, which earlier sections use for the topic extractors.
extractor = entities.EntityExtractor(
    entities=ents,
    records=ai_overviews,
    model="openai/gpt-4.1",
)

pprint(extractor.task)

In [None]:
result = await extractor()
result

In [None]:
from IPython.display import Markdown

Markdown(extractor.task.queries[0]["messages"][0]["content"])

## Classifier

Simple one-shot classifier with configurable categories.

In [None]:
# NOTE: importlib, pprint and set_env_vars are already imported (and set_env_vars
# already called) in the first cells of the notebook; they are repeated here,
# presumably so this section can be run on its own after a kernel restart.
import importlib

from cuery import pprint
from cuery.cli import set_env_vars
from cuery.tools.flex import classify

# Reload so that edits made to the module's source are picked up without a restart.
importlib.reload(classify)

set_env_vars(apify_secrets=False)


# One record per search keyword; each record is a dict with a single "keyword" field.
kwds = ["what is an SUV", "Peugeot concesionarios", "SUV second hand", "electric car reviews"]
records = [{"keyword": kwd} for kwd in kwds]

# Category name -> description of the category.
categories = {
    "informational": "Keywords search with *informational* intent",
    "transactional": "Keywords search with *transactional* intent",
}


classifier = classify.Classifier(
    categories=categories,
    records=records,
    instructions="Classify the Google search keyword into informational or transactional intent.",
    model="openai/gpt-4.1-mini",
)

pprint(classifier.task)

In [None]:
df = await classifier(n_concurrent=10)
df

In [None]:
Markdown(classifier.task.queries[3]["messages"][0]["content"])

## Generic Tool

In [None]:
import importlib

import pandas as pd

from cuery.cli import set_env_vars
from cuery.tools import schema
from cuery.tools.flex import generic

set_env_vars(apify_secrets=False)

importlib.reload(schema)
importlib.reload(generic)

In [None]:
# Ten hand-written sentences, each mentioning one e-mail address and one URL;
# used as input records for the generic extraction tools below.
texts_with_emails_urls = [
    "Please contact our support team at support@techcorp.com or visit our help center at https://help.techcorp.com for assistance.",
    "For business inquiries, reach out to sales@example.org. You can also check our pricing at www.example.org/pricing",
    "Send your resume to jobs@startup.io and learn more about our company culture at https://startup.io/careers",
    "Our customer service is available at help@mystore.net. Track your order status at https://mystore.net/orders/tracking",
    "Subscribe to our newsletter by emailing newsletter@blog.co or visit our latest posts at http://blog.co/latest",
    "Report any issues to admin@platform.dev. Check system status at https://status.platform.dev",
    "For media inquiries, contact press@newscorp.com. Read our press releases at https://newscorp.com/press",
    "Join our community discussion at forum@community.org or browse topics at www.community.org/forums",
    "Technical documentation is available at docs@api.service.com and online at https://docs.api.service.com",
    "Partnership opportunities: partners@bigtech.net. View our partner portal at https://partners.bigtech.net/login",
]

# Single-column frame ("text"), one row per sentence, default integer index.
df = pd.DataFrame({"text": texts_with_emails_urls})
df

In [None]:
from cuery.tools.flex import generic

autogen = generic.Auto(
    records=df,
    instructions="Extract any emails and URLs from the text column.",
    model="openai/gpt-4.1-mini",
)  # type: ignore

result = await autogen(n_concurrent=10)
result

In [None]:
# Generate a schema
schema_gen = schema.SchemaGenerator(
    instructions="I need a reponse schema for extracting emails and URLs from text.",
    model="openai/gpt-4.1",
)

response_schema = await schema_gen(max_retries=6)
pprint(response_schema.json_schema)

In [None]:
generator = generic.Generic(
    instructions="Extract emails and URLs from the text.",
    response_schema=response_schema.json_schema,
    records=df,
    model="openai/gpt-4.1-mini",
)
pprint(generator.task)

In [None]:
result = await generator(n_concurrent=10)
result

# Iterate

In [None]:
# System message: task description, three worked examples and step-by-step rules.
# Changes relative to the previous wording, all of them prompt-consistency fixes:
#   * Example 1's heading now names the topics its response actually returns
#     (it used to announce "Battery safety issues" with an unbalanced quote);
#   * the third example is numbered 3 (there were two "Example 2" headings);
#   * "en empty list" -> "an empty list"; trailing whitespace removed;
#   * the last operation no longer says "stop here" before "output".
system = """
You will receive a tweet text from a larger dataset of tweets about electric vehicles, and a list of top-level topics in markdown format.
Your task is to identify new generalizable topics within the document that can act as top-level topics in the hierarchy.
If any topic mentioned is similar enough to an existing topic (paraphrasing it), return the existing one instead.
I.e. avoid duplicating topics with similar meanings, but different phrasing.

# Examples

## Example 1 (new topics, returning "Safety issues" and "Pricing"):
### Existing topics
- Charging infrastructure
### Document
I don't want my car to explode on contact, the cars are expensive enough hahaha
### Your response
[Safety issues, Pricing]

## Example 2 (no identifiable topic, returning an empty list):
### Existing topics
- Charging infrastructure
- Safety issues
### Document
You don't say. D'uh!
### Your response
[]

## Example 3 (different dataset context, returning an existing topic):
### Existing topics
- For sale
### Document
A friend of mine would like to sell his 850 MB SCSI drive for $800 + S/H.It is a full-height drive,
and has been used for about one and a half years.If anyone is interested, please e-mail me.
### Your response
[For sale]


# Instructions
Step 1: Determine topics mentioned in the document.
- The topic labels must be as GENERALIZABLE as possible. They must NOT be document-specific.
- The topics must reflect a SINGLE topic instead of a combination of topics.
- The topics must be broad enough to accommodate future subtopics.

Step 2: Perform ONE of the following operations:
- If there are already very similar, duplicate or relevant topics in the hierarchy, output those topics and stop here.
- If the document contains no topic, return an empty list ([]).
- Otherwise, output the new topic(s) as a list.
"""


# User message template. `{{text}}` is the placeholder for the tweet. The
# "### Existing topics" and "### Document" headings must stay exactly as written:
# split_prompt_topics() splits the message on them when the topic list is updated.
user = """
Extract new topics from the text below if they are not already amongst existing topics.
### Existing topics
- Charging infrastructure
### Document
{{text}}
### Your response

"""

# NOTE(review): `Prompt` is not imported in any cell shown in this notebook —
# confirm where it comes from before a fresh "Run All".
prompt = Prompt(
    messages=[
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ],
    required=["text"],
)

pprint(prompt)

In [None]:
# Structured response for one tweet: a single field, `names`, holding the list of
# topic names (it may be empty).
# NOTE(review): `Response`, `Field` and `Task` are not imported in any cell shown in
# this notebook — confirm where they come from before a fresh "Run All".
class Topics(Response):
    names: list[str] = Field(
        ..., description="(Possibly empty) list of top-level, generalizable topics."
    )


# NOTE(review): this rebinds `topics`, which earlier cells use for the
# `cuery.tools` / `cuery.tools.flex` topics module; re-running those cells after
# this one will get the Task object instead.
# NOTE(review): other cells pass provider-prefixed model ids such as
# "openai/gpt-4.1-mini" — confirm that the bare "gpt-4.1-mini" is accepted here.
topics = Task(prompt=prompt, response=Topics, model="gpt-4.1-mini")
pprint(topics)

In [None]:
def split_prompt_topics(text: str) -> tuple[list[str], str, str]:
    """Split a prompt message into its existing-topics list and the surrounding text.

    The message must contain a ``### Existing topics`` heading followed later by a
    ``### Document`` heading. The lines between the two headings are read as a
    bulleted list with one ``- topic`` entry per line.

    Args:
        text: Full content of the prompt message.

    Returns:
        A tuple ``(topics, pre, post)``. ``topics`` holds the topic names without
        their bullet markers; ``pre`` is everything up to and including the
        ``### Existing topics`` heading line; ``post`` is everything from the
        ``### Document`` heading onwards. Joining ``pre``, the re-bulleted topics
        (one per line) and ``post`` rebuilds the message.

    Raises:
        ValueError: If either heading is missing from ``text``.
    """
    pre, post = text.split("### Existing topics", maxsplit=1)
    topics_block, post = post.split("### Document", maxsplit=1)

    topics = []
    for line in topics_block.split("\n"):
        line = line.strip()
        # Skip blank lines and empty bullets.
        if not line or line == "-":
            continue
        # Remove only the leading bullet marker. A global replace("- ", "") would
        # also delete "- " inside a topic name such as "Cost - benefit analysis".
        topic = line.removeprefix("- ").strip()
        if topic:
            topics.append(topic)

    pre = pre + "### Existing topics\n"
    post = "\n### Document" + post
    return topics, pre, post


def update_prompt(response: Response, prompt: Prompt, context: dict | None = None) -> None:
    """Merge the topics of a response into the prompt's "Existing topics" list.

    Reads ``response.names``. When it is non-empty, the bulleted topic list in the
    second prompt message (``prompt.messages[1]``) is replaced in place by the
    sorted, de-duplicated union of the topics already listed there and the new
    names. When it is empty, the prompt is left untouched.

    Args:
        response: Object exposing the extracted topic names as ``names``.
        prompt: Prompt whose second message carries the topic list; mutated in place.
        context: Accepted but not used by this function.
    """
    found = response.names
    if not found:
        return

    existing, head, tail = split_prompt_topics(prompt.messages[1].content)
    merged = sorted({*existing, *found})
    bullets = "\n".join(f"- {name}" for name in merged)
    prompt.messages[1].content = head + bullets + tail


In [None]:
res = await topics.iter(veh[:100], callback=update_prompt)
res.to_pandas()

In [None]:
sorted(res.to_pandas().names.unique())

# Newsgroups

In [None]:
# NOTE(review): absolute, machine-specific path. It also rebinds DATA_DIR, which the
# "Electric vehicles" cell set to a folder under GDRIVE, so re-running that cell's
# read_csv after this one would look in the wrong directory.
DATA_DIR = Path("/Users/thomas/data/text")
news = pd.read_csv(DATA_DIR / "newsgroups.csv")
display(news.label.value_counts())

# Draw at most 500 rows. The fixed seed makes the sample identical on every run,
# and clamping `n` avoids the ValueError pandas raises when asked for more rows
# than the frame has (sampling is without replacement).
sample = news.sample(n=min(500, len(news)), random_state=42)
sample

# Keywords

In [None]:
import pandas as pd

from cuery import pprint
from cuery.topics import keywords, oneshot

#!uv pip install openpyxl
# !uv add levenshtein

In [None]:
kws = pd.read_excel("/Users/thomas/Documents/Cerraduras.xlsx")
kws

In [None]:
kws.Keywords.iloc[:50].tolist()

In [None]:
kws.Categoría.value_counts()

## Two-level topics

In [None]:
import importlib

from cuery import set_api_keys, utils

# Reload so that edits made to the modules' source are picked up without a restart.
# NOTE(review): `oneshot` comes from `cuery.topics` (imported in the "Keywords" cell),
# a different module path from the `cuery.tools` imports used at the top of the
# notebook — confirm that this section still matches the installed package.
importlib.reload(utils)
importlib.reload(oneshot)
set_api_keys()

# Extra instructions, passed to the extractor through `extra=`.
instructions = utils.dedent("""
Always return topics in the original language (do NOT translate).
Do NOT invent new keywords not in the original list.
Make sure subtopics are always more specific than their parent topics and different from each other.
""")

extractor = oneshot.TopicExtractor(
    domain="Google search keywords",
    n_topics=10,
    n_subtopics=5,
    extra=instructions,
)

# NOTE: rebinds `topics` to the extraction result; earlier sections use the same
# name for a cuery module and for a Task.
topics = await extractor(
    kws.Keywords,
    # model="openai/gpt-4.1",
    model="google/gemini-2.5-flash-preview-05-20",
    max_dollars=0.5,
    max_tokens=500_000,
    max_texts=10_000,
    # temperature=0.1
)

topics.to_dict()

In [None]:
assigner = oneshot.TopicAssigner(topics)
pprint(assigner.task)

In [None]:
tt = await assigner(
    kws.rename(columns={"Keywords": "text"}), model="openai/gpt-4.1-mini", n_concurrent=20
)
tt.to_pandas()

In [None]:
tdf = tt.to_pandas()
tdf.groupby(["topic", "subtopic"], as_index=False).size()

## Cleaner

In [None]:
importlib.reload(keywords)

cleaner = keywords.KeywordCleaner(
    domain="Google search keywords",
    n_max=15,
    extra="Don't change the language of the keywords. Treat different languages separately.",
)

pprint(cleaner.task)

kwds = await cleaner(
    kws.Keywords,
    model="openai/gpt-4.1-mini",
    max_dollars=0.5,
    max_tokens=500_000,
    max_texts=10_000,
)

kwds.to_dict()

In [None]:
importlib.reload(keywords)

assigner = keywords.KeywordAssigner(kwds)
pprint(assigner.task)

In [None]:
assigns = await assigner(
    kws.rename(columns={"Keywords": "text"}),
    model="openai/gpt-4.1-mini",
    n_concurrent=100,
)

assigns.to_pandas()

# Server
Create example payloads to paste into the FastAPI docs examples.

In [None]:
import json

# Example payload for topic extraction: 20 keyword texts plus extraction settings.
q = dict(
    texts=[
        "cerradura de seguridad",
        "google smart lock",
        "smart lock google",
        "nuki smart lock 3.0 pro",
        "smartlock",
        "smart lock",
        "cerradura inteligente",
        "bombin de seguridad",
        "bombin de alta seguridad",
        "bombín de seguridad",
        "cerradura de seguridad para puerta exterior",
        "cerradura de seguridad para puerta",
        "mejor bombin de seguridad",
        "el mejor bombin de seguridad",
        "nuki smart lock 3.0",
        "cerradura electrónica",
        "cilindro de seguridad",
        "cerradura de seguridad puerta",
        "cerradura de seguridad anti bumping",
        "cerradura inteligente xiaomi",
    ],
    n_topics=3,
    n_subtopics=3,
    max_tokens=20_000,
    model="openai/gpt-3.5-turbo",
)

# Pretty-printed JSON, ready to copy.
print(json.dumps(q, indent=2))

In [None]:
# Example payload for topic assignment, with the taxonomy in its nested form:
# {"topics": [{"topic": ..., "subtopics": [...]}, ...]}.
# The texts are taken from the extraction payload `q` built in the previous cell.
tts = dict(
    model="openai/gpt-3.5-turbo",
    texts=q["texts"],
    topics=dict(
        topics=[
            dict(
                topic="Smart Locks",
                subtopics=[
                    "cerradura de seguridad",
                    "google smart lock",
                    "nuki smart lock 3.0 pro",
                ],
            ),
            dict(
                topic="Security Features",
                subtopics=[
                    "bombin de seguridad",
                    "cilindro de seguridad",
                    "cerradura de seguridad puerta",
                ],
            ),
        ]
    ),
)

# Pretty-printed JSON, ready to copy.
print(json.dumps(tts, indent=2))

In [None]:
# Example payload for topic assignment, with the taxonomy as a plain mapping of
# topic name -> list of subtopics. The texts are spelled out in full here rather
# than taken from `q`. NOTE: rebinds `tts` from the previous cell.
tts = dict(
    model="openai/gpt-3.5-turbo",
    texts=[
        "cerradura de seguridad",
        "google smart lock",
        "smart lock google",
        "nuki smart lock 3.0 pro",
        "smartlock",
        "smart lock",
        "cerradura inteligente",
        "bombin de seguridad",
        "bombin de alta seguridad",
        "bombín de seguridad",
        "cerradura de seguridad para puerta exterior",
        "cerradura de seguridad para puerta",
        "mejor bombin de seguridad",
        "el mejor bombin de seguridad",
        "nuki smart lock 3.0",
        "cerradura electrónica",
        "cilindro de seguridad",
        "cerradura de seguridad puerta",
        "cerradura de seguridad anti bumping",
        "cerradura inteligente xiaomi",
    ],
    topics={
        "Smart Locks": [
            "cerradura de seguridad",
            "google smart lock",
            "nuki smart lock 3.0 pro",
        ],
        "Security Features": [
            "bombin de seguridad",
            "cilindro de seguridad",
            "cerradura de seguridad puerta",
        ],
    },
)

# Pretty-printed JSON, ready to copy.
print(json.dumps(tts, indent=2))