In [1]:
# import sys
# !{sys.executable} -m pip install --upgrade pip
# !{sys.executable} -m pip install requests
# !{sys.executable} -m pip install tiktoken
# !{sys.executable} -m pip install python-dotenv
# !{sys.executable} -m pip install phidata
# !{sys.executable} -m pip install groq
# !{sys.executable} -m pip install htmldate
# !{sys.executable} -m pip install pyshorteners

In [2]:
import os
# Read/write json
import json

In [3]:
# Token ceiling for the agent input: run_agent asserts that the token count
# of the article text is strictly below this value
GROQ_TOKEN_LIMIT = 6000
# Configuration filename
CONFIG_FILENAME = 'config.json'
# List of articles we have processed
ARTICLES_FILENAME = 'articles.json'
# Date format the article publish date is validated against (datetime.strptime)
DATE_FORMAT = '%Y-%m-%d'

In [4]:
# To count tokens
import tiktoken

def count_tokens(input_string: str) -> int:
    """
    Counts how many cl100k_base tokens the given string encodes to
    Args:
        input_string (str): text whose tokens are counted
    Returns:
        int: length of the cl100k_base token sequence for input_string
    """
    encoding = tiktoken.get_encoding('cl100k_base')
    return len(encoding.encode(input_string))

In [5]:
from model import Article, ArticleList

def add_article(article:Article) -> None:
    """
    Adds an article to the articles list file unless an entry with the same
    key is already stored there
    Args:
        article (Article): an article to add to the articles list
    """
    # Load the stored list, or start from an empty one the first time
    if os.path.isfile(ARTICLES_FILENAME):
        # Read as UTF-8 explicitly so it always matches the write below
        with open(ARTICLES_FILENAME, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        article_list = ArticleList.model_validate(json_data)
    else:
        article_list = ArticleList(articles=[])

    # The key identifies an article, so an existing key means nothing to add
    if any(item.key == article.key for item in article_list.articles):
        print(f'Skip appending to {ARTICLES_FILENAME} as key {article.key} exists')
        return

    article_list.articles.append(article)
    # indent for pretty printing
    json_output = article_list.model_dump_json(indent=4)

    # Write as UTF-8 rather than the platform default encoding so that
    # non-ASCII text (e.g. in a title) cannot fail to encode
    with open(ARTICLES_FILENAME, 'w', encoding='utf-8') as file:
        file.write(json_output)
    print(f'Appended to {ARTICLES_FILENAME} with key {article.key}')

In [6]:
# To read configuration file
import configparser
# To check a string is an URL or not
from urllib.parse import urlparse
# Http call to Jina AI
import requests
# To get the published date
from htmldate import find_date
# Read json
import json
# To check the date format
from datetime import datetime
# To shorten URL
import pyshorteners
# Article model
from model import Article

def scrape_jina_ai(timeout: float = 60.0) -> str:
    """
    Fetches the configured article through Jina AI, saves the LLM friendly
    text under llm/in and registers the article in the articles list
    Args:
        timeout (float): seconds to wait for the Jina AI response
    Returns:
        key to the article in the articles list file
    Raises:
        AssertionError: the url is not valid, or the publish date is missing
            or doesn't match DATE_FORMAT
        requests.HTTPError: Jina AI answered with an error status
        requests.Timeout: Jina AI didn't answer within the timeout
    """
    #1. Get the configuration data including url
    with open(CONFIG_FILENAME, 'r') as file:
        config = json.load(file)

    # Setup section
    setup_section = config['setup']
    article_url = setup_section['article_url']
    # Article publish date, defaults to none if not provided
    publish_date = setup_section.get('publish_date')
    # Title, defaults to empty if not provided
    article_title = setup_section.get('title', '')

    #2. Check the configured value has both a scheme and a host
    result = urlparse(article_url)
    assert all([result.scheme, result.netloc]), f'{article_url} is not a valid URL'

    #3. Shorten the url
    short_url = pyshorteners.Shortener().tinyurl.short(article_url)

    #4. The last path segment of the shortened url is the article key
    article_key = urlparse(short_url).path.split('/')[-1]

    #5. Get the article date if not explicitly set in the config
    if not publish_date:
        # Look for the publish date from the url
        publish_date = find_date(article_url)

    # We must have the date as it will be used in watchlist to remove duplicates
    assert publish_date is not None, 'missing publish date'
    # It must conform to our format
    try:
        datetime.strptime(publish_date, DATE_FORMAT)
    except ValueError as error:
        # Keep the AssertionError type but preserve the parsing error as the cause
        raise AssertionError(f'Date {publish_date} is not valid for format {DATE_FORMAT}') from error

    #6. The filename to save the LLM friendly format
    input_file = os.path.join('llm', 'in', f'{article_key}.text')
    if os.path.isfile(input_file):
        print(f'Skip {input_file} as it exists')
    else:
        contents = requests.get(f'https://r.jina.ai/{article_url}', timeout=timeout)
        # Fail on an error status: otherwise the error page would be saved and
        # every later run would skip the download because the file exists
        contents.raise_for_status()
        # The target directory may not exist yet on a first run
        os.makedirs(os.path.dirname(input_file), exist_ok=True)
        with open(input_file, 'w') as file:
            file.write(contents.text)
        print(f'Written LLM friendly output to {input_file}')

    #7. An article to add to the list of articles
    article = Article(key=article_key, url=short_url, title=article_title, pub_date=publish_date)
    add_article(article=article)

    #8. Return the key of the article added to articles list
    return article.key

In [7]:
# Scrape the article configured in CONFIG_FILENAME; the returned key is
# what the later cells pass to run_agent
key = scrape_jina_ai()
# Bare expression so the notebook displays the key
key

Skip llm/in/29ww2d5a.text as it exists
Skip appending to articles.json as key 29ww2d5a exists


'29ww2d5a'

In [8]:
# To load modules
from model import StockList

# AI agent modules
from phi.agent import Agent, RunResponse
from phi.model.groq import Groq

# To read environment property file
from dotenv import load_dotenv

def run_agent(key:str) -> RunResponse:
    """
    Builds the agent described in the configuration file and runs it on the
    LLM friendly text saved for the given article key
    Args:
        key (str): the key to the Article to run the agent
    Returns:
        RunResponse: agent run response
    """
    # The configuration file supplies the agent settings
    with open(CONFIG_FILENAME, 'r') as config_file:
        settings = json.load(config_file)

    # The article text is expected at llm/in/<key>.text and must exist
    input_file = os.path.join('llm', 'in', f'{key}.text')
    assert os.path.isfile(input_file), f"{input_file} doesn't exist"

    with open(input_file, 'r') as text_file:
        article_text = text_file.read()

    # Only inputs strictly below the token limit are accepted
    token_count = count_tokens(input_string=article_text)
    assert token_count < GROQ_TOKEN_LIMIT,\
        f'{token_count} exceeds the free {GROQ_TOKEN_LIMIT} token limit'

    # The response goes to llm/out under the input file's base name
    stem, _ = os.path.splitext(os.path.basename(input_file))
    response_file = os.path.join('llm', 'out', f'{stem}.json')

    # Assemble the agent from the 'agent' section of the configuration
    agent_settings = settings['agent']
    agent = Agent(
        name=agent_settings['name'],
        role=agent_settings['role'],
        model=Groq(id=agent_settings['model_id']),
        instructions=agent_settings['instructions'],
        markdown=True,
        show_tool_calls=True,
        # debug_mode=True,
        save_response_to_file=response_file,
        response_model=StockList,
        response_format={'type': 'json_object'},
    )

    # Environment variables from the .env file are loaded just before the run
    load_dotenv()
    return agent.run(article_text)

In [9]:
from phi.utils.pprint import pprint_run_response
# Add the suffix to the key if you are splitting the file
suffix = None #'1'
# With a suffix the agent runs on '<key>-<suffix>', otherwise on the key itself
response:RunResponse = run_agent(key=f'{key}-{suffix}' if suffix else key)

In [10]:
# Display the agent run response with the pprint helper imported from phi.utils.pprint
pprint_run_response(response)