In [1]:
import openai
import json
from dotenv import load_dotenv
from typing import Any 
from GPT_crawler_utils import get_text_snapshot, load_prompt

In [3]:
load_dotenv()
client = openai.Client()
class PromptFunction:
    """
    A class representing a reusable prompt function in POP.
    """
    def __init__(self, prompt: str):
        """
        Initializes a new prompt function with a prompt template.
        
        Args:
            prompt (str): The base prompt template for the function.
        """
        self.prompt = prompt
        self.temperature = 0.7

    def execute(self, *args, **kwargs) -> str:
        """
        Executes the prompt function with dynamic argument injection.

        Args:
            *args: Positional arguments to add before, after, or inject into the prompt.
            **kwargs: Key-value arguments to replace placeholders in the prompt or provide additional context.

        Returns:
            str: The AI-generated response.
        """
        use_model = kwargs.pop("USE_MODEL", "gpt-4o-mini")
        system_instruction = kwargs.pop("SYSTEM_INSTRUCTION", None)
        response_format = kwargs.pop("RESPONSE_FORMAT", None)
        # Step 1: Inject user arguments into the prompt
        prompt = self._prepare_prompt(*args, **kwargs)

        # Step 2: Call AI with the prepared prompt
        return self._call_ai(prompt, use_model=use_model, system_instruction=system_instruction, response_format=response_format)

    def _prepare_prompt(self, *args, **kwargs) -> str:
        """
        Prepares the prompt by dynamically injecting arguments.

        Args:
            *args: Positional arguments to add to the prompt.
            **kwargs: Key-value arguments for placeholder replacement or context.

        Returns:
            str: The modified prompt ready for execution.
        """
        # Default injection modes: Before, After, or Inject placeholders
        before = kwargs.pop("ADD_BEFORE", None)
        after = kwargs.pop("ADD_AFTER", None)
        

        prompt = self.prompt
        ## If a placeholder is provided, replace it with the value
        for key, value in kwargs.items():
            prompt = prompt.replace(f"{{{key}}}", str(value))

        # Add "before" text if provided
        prompt = f"{before}\n{prompt}" if before else prompt
        # Add "after" text if provided
        prompt = f"{prompt}\n{after}" if after else prompt
        # Append positional arguments to the end
        if args:
            prompt = f"{prompt}\n" + "\n".join(args)

        return prompt
    
    def _call_ai(self, formatted_prompt: str = None, use_model: str = "gpt-4o-mini", response_format: dict = None, system_instruction = None) -> Any:
        """
        Internal method to call OpenAI API using the chat completion API.

        Args:
            formatted_prompt (str): The prepared prompt for execution.
        
        Returns:
            Any: The API response.
        """
        if not formatted_prompt:
            formatted_prompt = self.prompt
        system_instruction = "" if system_instruction is None else system_instruction
        # Define request payload
        request_payload = {
            "model": use_model,
            "messages": [
                {"role": "system", "content": f"You are a helpful assistant that perform tasks user asked to. Here's addition instruction if any: \n{system_instruction}"},
                {"role": "user", "content": formatted_prompt}
            ],
            "temperature": self.temperature
        }

        if response_format:
            request_payload["response_format"] = {
                "type": "json_schema",
                "json_schema": response_format
            }

        response = client.chat.completions.create(**request_payload)
        return response.choices[0].message.content
    
    def _improve_prompt(self, replace = False, use_prompt = "fabric", instruction = None, system_instruction = None) -> str:
        """
        Improving prompt using the improve_prompt pattern provided by fabric or an user provided metaprompt.

        Args:
            replace (bool): Whether to replace the existing prompt in addition to returning it.
            use_prompt (str): The metaprompt to use for improving the prompt.

        Returns:
            str: The improved prompt.
        """
        if use_prompt == "fabric":
            file = "fabric-improve_prompt.md"
        
        try:
            with open(file, 'r', encoding='utf-8') as f:
                instruction = f.read()
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {file}")
        
        # use instruction to improve prompt
        instruction = "" if instruction is None else instruction
        improved_prompt = self.execute(ADD_BEFORE=f"\naddition instruction:\n{instruction} \nyou must preserve the original placeholder in the improved prompt if there is one.\n\n"+instruction,
                                       USE_MODEL="gpt-4o", 
                                       SYSTEM_INSTRUCTION = system_instruction)
        
        improved_prompt = improved_prompt.split("# OUTPUT\n\n")[-1] if use_prompt == "fabric" else improved_prompt

        if replace:
            self.prompt = improved_prompt
        else:
            return improved_prompt
        
    def set_temperature(self, temperature: float) -> None:
        """
        Sets the temperature for the AI model.

        Args:
            temperature (float): The temperature value to set.
        """
        self.temperature = temperature

    def save(self, file_path: str) -> None:
        """
        Saves the prompt function to a file.

        Args:
            file_path (str): The file path to save the prompt function to.
        """
        with open(file_path, "w") as file:
            file.write(self.prompt)

In [20]:
pf = PromptFunction("For a given input, return a haiku about it. {input}")
pf._improve_prompt(replace=True)
print(pf.prompt)

```markdown
### Task

Create a haiku based on the following input:

**Input:** `{input}`

### Instructions

1. Capture the essence of the input in a three-line haiku.
2. Follow the traditional haiku structure with a syllable pattern of 5-7-5.
3. Ensure the haiku evokes imagery or emotion related to the input.

### Example

- **Input:** "Autumn leaves"
- **Haiku:**

  ```
  Leaves fall gracefully  
  Whispering secrets of time  
  Autumn's gentle breath  
  ```
```


In [21]:
print(pf.execute(input="A beautiful sunset"))

**Haiku:**

```
Sunset's gentle glow
Spreading warmth through the cool sky
Beauty fades to night
```


In [22]:
print(pf.execute(ADD_BEFORE = "use shakespear style", input="An apple"))

**Haiku:**

Apple's red delight  
Whispers of nature's sweet gifts  
In crisp morning light


## Test playground

In [81]:
pf = PromptFunction("Return the text content of the provided website text snapshot, do not include any ads or irrelevant information. Reformat the text to make it more readable.")

pf._improve_prompt(replace=True)
print(pf.prompt)

```markdown
**Task:** Extract and reformat the main text content from the provided website text snapshot.

**Instructions:**

1. **Extract Content:** Identify and return only the main text content from the snapshot. Exclude any advertisements or irrelevant information.

2. **Reformat for Readability:** Reformat the extracted text to enhance readability. This may include adjusting paragraph breaks, correcting any formatting issues, and ensuring the text flows logically.

3. **Output:** Present the reformatted text in a clean and organized manner.

**Note:** Ensure that the final output is free from any non-essential elements and focuses solely on delivering the core content in an easily digestible format.
```


In [85]:
url = "https://www.bbc.com/news/articles/cdd0vp95v9yo"
json_schema = {
    "type": "object",
    "properties": {
        "content": {
            "type": "string"
        }
    }
}
print(pf.execute(get_text_snapshot(url), response_format=json_schema))

**Title:** Car crashes through front of Iceland store in Smethwick

A car has smashed through the front of an Iceland supermarket on a high street in the West Midlands, coming to rest inside the store. The incident occurred on Bearwood Road in Bearwood, Smethwick, which was subsequently closed. The incident happened at about 19:55 GMT, after the store had closed for the day according to the store's online opening hours.

The fire service, which dispatched two appliances to the scene, reported that there had been no rescues. West Midlands Police also attended the scene. National Express West Midlands reported that five bus services had been diverted due to the incident and apologized for any inconvenience to passengers.


### Test on Generalization of a story crawler prompt

In [4]:
content_finder = PromptFunction(load_prompt("2024-11-19-content_finder.md"))
get_title_and_url = PromptFunction(load_prompt("2024-11-19-get_title_and_url.md"))
get_content = PromptFunction(load_prompt("2024-11-19-get_content.md"))

In [58]:
content_finder.execute(get_text_snapshot("www.bbc.com/news/articles"), USE_MODEL="gpt-4o")

'```json\n{\n    "error": "Refused: No suitable bedtime stories found."\n}\n```'

In [78]:
system_instruction = "Make this prompt more general so it can be used for any website for a given topic to crawl, not just bedtime story for kids."
general_content_finder = content_finder._improve_prompt(use_prompt="fabric", system_instruction=system_instruction)
general_get_title_and_url = get_title_and_url._improve_prompt(use_prompt="fabric", system_instruction=system_instruction)
general_get_content = get_content._improve_prompt(use_prompt="fabric", system_instruction=system_instruction)

general_content_finder = PromptFunction(general_content_finder)
general_get_title_and_url = PromptFunction(general_get_title_and_url)
general_get_content = PromptFunction(general_get_content)

print(general_content_finder.prompt)

```markdown
Given a webpage URL, your task is to analyze its content and identify categories relevant to the specified topic. Follow these instructions carefully:

1. **Extract Categories**:
   - Examine the webpage to identify categories relevant to the topic provided.

2. **Criteria for Selection**:
   - Categories should be broad enough to cover multiple subtopics but specific enough to offer a clear theme or genre.
   - Ensure the categories are present on the website with accessible URLs.
   - Each category should lead to a page that elaborates on the subject matter.

3. **Handling Unsuitable Content**:
   - If the webpage does not align with the specified topic or contains minimal relevant content, classify it as unsuitable.
   - For webpages with no suitable content, use the response format: `{"error": "Refused: No suitable content found."}`

4. **Error Handling**:
   - If the webpage URL is invalid or the content is inaccessible, provide an appropriate error message in the resp

In [79]:
general_content_finder.save("content_finder.md")
general_get_title_and_url.save("get_title_and_url.md")
general_get_content.save("get_content.md")

In [5]:
content_finder = PromptFunction(load_prompt("content_finder.md"))
get_title_and_url = PromptFunction(load_prompt("get_title_and_url.md"))
get_content = PromptFunction(load_prompt("get_content.md"))

In [27]:
response_format = {
        "name": "world_news_articles",
        "schema": {
            "type": "object",
            "properties": {
                "news_categories": {
                    "type": "object"
                }
            },
            "additionalProperties": False,
            "required": ["news_categories"],
        }
    }

content_finder.set_temperature(1)
categories = content_finder.execute(get_text_snapshot("www.bbc.com/news/articles", links_at_end=True), 
                                            USE_MODEL="gpt-4o", 
                                            ADD_BEFORE = "I'm looking for world news articles of different countries.",
                                            RESPONSE_FORMAT=response_format)
print(categories)

{"news_categories":{"US & Canada":"https://www.bbc.com/news/us-canada","UK":"https://www.bbc.com/news/uk","Africa":"https://www.bbc.com/news/world/africa","Asia":"https://www.bbc.com/news/world/asia","Australia":"https://www.bbc.com/news/world/australia","Europe":"https://www.bbc.com/news/world/europe","Latin America":"https://www.bbc.com/news/world/latin_america","Middle East":"https://www.bbc.com/news/world/middle_east","Israel-Gaza War":"https://www.bbc.com/news/topics/c2vdnvdg6xxt","War in Ukraine":"https://www.bbc.com/news/war-in-ukraine"}}


In [21]:
json.loads(categories).get("news_categories", )

{'News': 'https://www.bbc.com/news',
 'Sport': 'https://www.bbc.com/sport',
 'Business': 'https://www.bbc.com/business',
 'Innovation': 'https://www.bbc.com/innovation',
 'Culture': 'https://www.bbc.com/culture',
 'Arts': 'https://www.bbc.com/arts',
 'Travel': 'https://www.bbc.com/travel',
 'Earth': 'https://www.bbc.com/future-planet',
 'Israel-Gaza War': 'https://www.bbc.com/news/topics/c2vdnvdg6xxt',
 'War in Ukraine': 'https://www.bbc.com/news/war-in-ukraine',
 'US Election': 'https://www.bbc.com/news/topics/cj3ergr8209t',
 'US & Canada': 'https://www.bbc.com/news/us-canada',
 'UK': 'https://www.bbc.com/news/uk',
 'Africa': 'https://www.bbc.com/news/world/africa',
 'Asia': 'https://www.bbc.com/news/world/asia',
 'Australia': 'https://www.bbc.com/news/world/australia',
 'Europe': 'https://www.bbc.com/news/world/europe',
 'Latin America': 'https://www.bbc.com/news/world/latin_america',
 'Middle East': 'https://www.bbc.com/news/world/middle_east'}

In [28]:
categories = json.loads(categories).get("news_categories", {})
# get the first category
category = list(categories.keys())[0]
category_url = categories[category]

print(f"{category}: {category_url}")

US & Canada: https://www.bbc.com/news/us-canada


In [39]:
response_format = {
        "name": "news_articles",
        "schema": {
            "type": "object",
            "properties": {
                "title": {
                    "type": "string"
                },
                "url": {
                    "type": "string"
                }
            },
            "additionalProperties": False,
            "required": ["title", "url"],
        }
    }
snapshot = get_text_snapshot(category_url, links_at_end=True)
snapshot_links = snapshot[snapshot.find("Links/Buttons:"):]
titles_and_urls = get_title_and_url.execute(snapshot_links, 
                          ADD_BEFORE = f"I'm looking for news articles on {category}, please provide me with the title and URL of the articles.",
                          USE_MODEL="gpt-4o-mini",
                          RESPONSE_FORMAT=response_format)
titles_and_urls

'{"titles_and_urls":[{"title":"Trudeau says he is ready to work with Trump amid tariff threats","url":"https://www.bbc.com/news/articles/cj6kj2752jlo"},{"title":"NFL star handcuffed before game sees his traffic ticket dropped","url":"https://www.bbc.com/news/articles/c4gz14q32gxo"},{"title":"\'Like a golden ticket\' - Menendez brothers case sparks frenzy in LA","url":"https://www.bbc.com/news/articles/cdxy5d2q4y4o"},{"title":"Judge rejects effort to stop transgender college volleyball player competing","url":"https://www.bbc.com/news/articles/cvgxjmnnxwno"},{"title":"US universities warn foreign students on Trump immigration crackdown","url":"https://www.bbc.com/news/articles/czxvz4re5y1o"},{"title":"Triumph over legal cases seals Trump\'s comeback","url":"https://www.bbc.com/news/articles/c39nmy17em2o"},{"title":"US woman jailed for fatally shooting neighbour through door","url":"https://www.bbc.com/news/articles/c5ygd59q7l1o"},{"title":"President Biden pardons turkeys at White House"

In [40]:
next_page = json.loads(titles_and_urls).get("next_page", "")
titles_and_urls = json.loads(titles_and_urls).get("titles_and_urls", [])
titles_and_urls

[{'title': 'Trudeau says he is ready to work with Trump amid tariff threats',
  'url': 'https://www.bbc.com/news/articles/cj6kj2752jlo'},
 {'title': 'NFL star handcuffed before game sees his traffic ticket dropped',
  'url': 'https://www.bbc.com/news/articles/c4gz14q32gxo'},
 {'title': "'Like a golden ticket' - Menendez brothers case sparks frenzy in LA",
  'url': 'https://www.bbc.com/news/articles/cdxy5d2q4y4o'},
 {'title': 'Judge rejects effort to stop transgender college volleyball player competing',
  'url': 'https://www.bbc.com/news/articles/cvgxjmnnxwno'},
 {'title': 'US universities warn foreign students on Trump immigration crackdown',
  'url': 'https://www.bbc.com/news/articles/czxvz4re5y1o'},
 {'title': "Triumph over legal cases seals Trump's comeback",
  'url': 'https://www.bbc.com/news/articles/c39nmy17em2o'},
 {'title': 'US woman jailed for fatally shooting neighbour through door',
  'url': 'https://www.bbc.com/news/articles/c5ygd59q7l1o'},
 {'title': 'President Biden pard

In [42]:
article = titles_and_urls[0].get("title", "")
article_url = titles_and_urls[0].get("url", "")

print(f"Title: {article}\nURL: {article_url}")

Title: Trudeau says he is ready to work with Trump amid tariff threats
URL: https://www.bbc.com/news/articles/cj6kj2752jlo


In [47]:
response_format = {
        "name": "article_content",
        "schema": {
            "type": "object",
            "properties": {
                "content": {
                    "type": "string"
                }
            },
            "additionalProperties": False,
            "required": ["content"],
        }
    }

content = get_content.execute(get_text_snapshot(article_url), 
                                ADD_BEFORE = f"I'm looking for the content of the article titled {article}.",
                                USE_MODEL="gpt-4o-mini",
                                RESPONSE_FORMAT = response_format)
content

'{"title":"Trudeau says he is ready to work with Trump amid tariff threats","author":"James FitzGerald and Nadine Yousif","content":"Canadian Prime Minister Justin Trudeau has said his country is prepared to work with the US in \\"constructive ways\\" after President-elect Donald Trump vowed to introduce 25% tariffs on goods coming from Canada.\\n\\nThe tariff proposal made by Trump has caused deep concern among Canadian officials, including provincial leaders.\\n\\nTrump also said he would impose duties on Mexico and an extra 10% on goods coming from China.\\n\\nTrudeau sought to reassure Canadians and will hold a meeting on Wednesday with provincial and territorial leaders to discuss how to move forward.\\n\\n\\"This is a relationship that we know takes a certain amount of working on, and that\'s what we\'ll do,\\" Trudeau told reporters on Tuesday.\\n\\nHe and Trump spoke over the phone on Monday evening, after the president-elect announced the proposed tariffs in a post on Truth So

In [49]:
title = json.loads(content).get("title", "")
author = json.loads(content).get("author", "")
content = json.loads(content).get("content", "")

print(f"Title: {title}\nAuthor: {author}\nContent: {content}")

Title: Trudeau says he is ready to work with Trump amid tariff threats
Author: James FitzGerald and Nadine Yousif
Content: Canadian Prime Minister Justin Trudeau has said his country is prepared to work with the US in "constructive ways" after President-elect Donald Trump vowed to introduce 25% tariffs on goods coming from Canada.

The tariff proposal made by Trump has caused deep concern among Canadian officials, including provincial leaders.

Trump also said he would impose duties on Mexico and an extra 10% on goods coming from China.

Trudeau sought to reassure Canadians and will hold a meeting on Wednesday with provincial and territorial leaders to discuss how to move forward.

"This is a relationship that we know takes a certain amount of working on, and that's what we'll do," Trudeau told reporters on Tuesday.

He and Trump spoke over the phone on Monday evening, after the president-elect announced the proposed tariffs in a post on Truth Social.

The two discussed trade and borde