In [1]:
import os

# Move the working directory to the parent of the directory the kernel started in.
# The target is computed only once per kernel session: deriving it from the current
# directory on every run would climb one more level each time this cell is re-executed.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/hugging_face_agents_course


 Find all Batman filming locations in the world, calculate the cargo plane transfer time to each of them, and represent them on a map, with a color varying by cargo plane transfer time. Also represent some supercar factories with the same cargo plane transfer time.

In [2]:
# Imports and environment setup; the cargo plane transfer-time tool itself is defined in the next cell.
import math
from typing import Optional, Tuple

from smolagents import tool
import os
from PIL import Image
from smolagents import CodeAgent, GoogleSearchTool, HfApiModel, VisitWebpageTool
from dotenv import load_dotenv
from smolagents.utils import encode_image_base64, make_image_url
from smolagents import OpenAIServerModel, AzureOpenAIServerModel

load_dotenv()

True

In [3]:
@tool
def calculate_cargo_travel_time(
    origin_coords: Tuple[float, float],
    destination_coords: Tuple[float, float],
    cruising_speed_kmh: Optional[float] = 750.0,  # Average speed for cargo planes
) -> float:
    """
    Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.

    Args:
        origin_coords: Tuple of (latitude, longitude) in decimal degrees for the starting point
        destination_coords: Tuple of (latitude, longitude) in decimal degrees for the destination
        cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes, also used when None is passed)

    Returns:
        float: The estimated travel time in hours, rounded to 2 decimal places

    Raises:
        ValueError: If cruising_speed_kmh is zero or negative

    Example:
        >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
        >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
    """
    # Local import keeps the function body self-contained (it does not rely on
    # a module-level `math` being present wherever the tool is executed).
    import math

    DEFAULT_CRUISING_SPEED_KMH = 750.0

    # The parameter is Optional: treat an explicit None like an omitted argument
    # instead of failing on the division below.
    if cruising_speed_kmh is None:
        cruising_speed_kmh = DEFAULT_CRUISING_SPEED_KMH
    if cruising_speed_kmh <= 0:
        raise ValueError(
            f"cruising_speed_kmh must be positive, got {cruising_speed_kmh}"
        )

    # Extract coordinates, converted from degrees to radians
    lat1, lon1 = map(math.radians, origin_coords)
    lat2, lon2 = map(math.radians, destination_coords)

    # Earth's radius in kilometers
    EARTH_RADIUS_KM = 6371.0

    # Calculate great-circle distance using the haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin raise a math domain error.
    a = min(1.0, a)
    c = 2 * math.asin(math.sqrt(a))
    distance = EARTH_RADIUS_KM * c

    # Add 10% to account for non-direct routes and air traffic controls
    actual_distance = distance * 1.1

    # Calculate flight time
    # Add 1 hour for takeoff and landing procedures
    flight_time = (actual_distance / cruising_speed_kmh) + 1.0

    # Round to two decimals for a compact, readable result
    return round(flight_time, 2)

In [4]:
# Model shared by the single-agent baseline (`agent1`) and the `web_agent` defined below.
model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-7B-Instruct")

In [5]:
# Prompt for the single-agent baseline: it is passed unchanged to `agent1.run(task)` further down.
task = """Can you find all real-world filming locations used in Batman movies across the globe, calculate the approximate cargo plane transfer time from each of those locations to Gotham (assumed to be at 40.7128° N, 74.0060° W — the coordinates of New York City), and present the results in a Pandas DataFrame? Additionally, can you find a list of global supercar manufacturing factories and identify those that would require the same cargo plane transfer time as the Batman filming locations? Please ensure all data points (especially geographic coordinates and distances) are verified by visiting the source URLs. Use parallel web searches (in a loop if needed) to collect comprehensive data"""

In [6]:
# Single-agent baseline: one CodeAgent that holds the web search tool, the page visitor
# and the travel-time tool all at once.
agent1 = CodeAgent(
    model=model,
    tools=[GoogleSearchTool(), VisitWebpageTool(), calculate_cargo_travel_time],
    # The task asks for a Pandas DataFrame, so pandas is added to the agent's authorized imports.
    additional_authorized_imports=["pandas"],
)

In [7]:
# Run the single-agent baseline on `task`; the returned value is kept for inspection in the next cell.
result = agent1.run(task)

In [8]:
# Show the answer returned by the single-agent run.
print(result)

[['King Kong Studios', 1.01, nan], ['Sonoma County, California', 7.06, nan], ['Nairobi', 18.08, nan], ['London', 9.17, nan], [nan, 10.75, "Lamborghini, Sant'Agata Bolognese"], [nan, 10.83, 'Ferrari, Maranello'], [nan, 10.09, 'Bugatti, Molsheim'], [nan, 9.18, 'McLaren, Woking'], [nan, 9.18, 'Aston Martin, Woking']]


✌️ Splitting the task between two agents
Multi-agent structures make it possible to separate memories between different sub-tasks, with two major benefits:

Each agent is more focused on its core task, thus more performant
Separating memories reduces the count of input tokens at each step, thus reducing latency and cost.
Let's create a team with a dedicated web search agent, managed by another agent.

The manager agent needs plotting capabilities to write its final report, so let us give it access to additional imports, including matplotlib, plus geopandas and shapely for spatial plotting.

In [9]:
# Create a web agent to browse the web and find information.
# It is later handed to the manager agent through `managed_agents=[web_agent]`.
web_agent = CodeAgent(
    model=model,
    tools=[
        GoogleSearchTool(provider="serpapi"),
        VisitWebpageTool(),
        calculate_cargo_travel_time,
    ],
    # name/description: presumably how the manager agent identifies and selects this
    # agent — confirm against the smolagents documentation.
    name="web_agent",
    description="Browses the web to find information",
    verbosity_level=0,
    max_steps=10,
)

In [10]:
# Model used by the manager agent, the final-answer check and the vision agent below.
# Endpoint and key are read from environment variables; the base URL falls back to the
# public OpenAI endpoint when OPENAI_API_BASE is not set.
# NOTE(review): os.getenv returns None when OPENAI_API_KEY is missing — confirm the key is
# present in the environment before running the cells that call this model.
reasoning_model = OpenAIServerModel(
    model_id="gpt-4o",
    api_base=os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [11]:
# import gc
# del reasoning_model
# gc.collect()

In [12]:
# reasoning_model = AzureOpenAIServerModel(
#     model_id="gpt-4o",
#     azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
#     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
#     api_version=os.getenv("OPENAI_API_VERSION"),
# )

In [13]:
# messages = [{"role": "user", "content": "Explain quantum mechanics in simple terms."}]
# response = reasoning_model(messages)

In [14]:
# print(response.content)

In [15]:
def check_reasoning_and_plot(final_answer, agent_memory):
    """Ask the reasoning model to review the agent's steps and the saved plot.

    Intended for use in `final_answer_checks`: the saved image and a summary of
    the agent's steps are sent to `reasoning_model`, whose feedback decides
    whether the final answer is accepted.

    Args:
        final_answer: The answer proposed by the agent. It is not inspected
            here; only the saved plot and the agent's steps are reviewed.
        agent_memory: The agent's memory; `get_succinct_steps()` is called on it
            and the result is embedded in the review prompt.

    Returns:
        True when the model's feedback does not contain "FAIL".

    Raises:
        AssertionError: If `saved_map.png` does not exist in the working directory.
        Exception: Carrying the model's feedback when that feedback contains "FAIL".
    """
    filepath = "saved_map.png"
    # Explicit raise rather than an `assert` statement so the check is not
    # stripped under `python -O`; the exception type and message are unchanged.
    if not os.path.exists(filepath):
        raise AssertionError("Make sure to save the plot under saved_map.png!")

    # The context manager releases the file handle once the image is encoded.
    with Image.open(filepath) as image:
        image_url = make_image_url(encode_image_base64(image))

    # Adjacent literals are concatenated verbatim, so each sentence carries its
    # own trailing space to keep the prompt readable for the model.
    prompt = (
        f"Here is a user-given task and the agent steps: {agent_memory.get_succinct_steps()}. Now here is the plot that was made. "
        "Please check that the reasoning process and plot are correct: do they correctly answer the given task? "
        "First list reasons why yes/no, then write your final decision: PASS in caps lock if it is satisfactory, FAIL if it is not. "
        "Don't be harsh: if the plot mostly solves the task, it should pass. "
        "To pass, a plot should be made using px.scatter_map and not any other method (scatter_map looks nicer)."
    )
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt,
                },
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                },
            ],
        }
    ]
    output = reasoning_model(messages).content
    print("Feedback: ", output)
    # Any occurrence of "FAIL" in the feedback rejects the answer.
    if "FAIL" in output:
        raise Exception(output)
    return True

In [16]:
# Manager agent: delegates web research to `web_agent`, does the reasoning and plotting
# itself, and has its final answer validated by `check_reasoning_and_plot`.
manager_agent = CodeAgent(
    model=reasoning_model,
    tools=[calculate_cargo_travel_time],
    managed_agents=[web_agent],
    # Entries must be importable module names: Pillow is imported as `PIL`
    # (there is no module called `pillow`).
    additional_authorized_imports=[
        "geopandas",
        "plotly",
        "shapely",
        "json",
        "pandas",
        "numpy",
        "matplotlib.pyplot",
        "PIL",
    ],
    planning_interval=5,
    verbosity_level=2,
    final_answer_checks=[check_reasoning_and_plot],
    max_steps=10,
)

In [17]:
# Presumably renders the manager agent's structure (its tools and managed agents) —
# confirm against the smolagents documentation.
manager_agent.visualize()

In [18]:
# Run the manager agent. The example in the prompt saves the figure as 'saved_map.png',
# the same file name the task text requests and `check_reasoning_and_plot` looks for.
op = manager_agent.run(
    """
Locate all real-world filming locations used in Batman movies worldwide and calculate the estimated cargo plane transfer time from each of those locations to Gotham (assumed to be at coordinates 40.7128° N, 74.0060° W, equivalent to New York City).

In addition, identify global supercar manufacturing facilities that have approximately the same cargo plane transfer time to Gotham as the filming locations. Ensure that at least 6 total data points are collected across filming and factory locations combined.

Visualize these locations on a world map using a scatter plot, where:
- Each point corresponds to a filming location or supercar factory.
- The color of each point reflects the estimated cargo plane travel time.
- Locations are labeled by name or city.

Save the generated map as 'saved_map.png'.

Here's an example of how to plot and return a map:
import plotly.express as px
df = px.data.carshare()
fig = px.scatter_map(df, lat="centroid_lat", lon="centroid_lon", text="name", color="peak_hour", size=100,
     color_continuous_scale=px.colors.sequential.Magma, size_max=15, zoom=1)
fig.show()
fig.write_image("saved_map.png")
final_answer(fig)

Important: Never try to manually parse strings with code — just print and examine them directly when needed.
Your final task is to return the final answer for all the places found, including the Batman filming locations and supercar factories, in a Pandas DataFrame.
"""
)


Feedback:  The plot attempts to meet the task requirements, but there are some issues:

Reasons why it meets the requirements:

1. **Visualization Method**: The plot uses `px.scatter_mapbox`, which is nice for displaying location-based data on a map, aligning with the task requirements.
2. **Travel Time Coloring**: The points are colored based on the estimated travel time, as requested in the task.
3. **Labeling**: Locations are labeled by name or city, making it easy to identify them visually.

Reasons why it does not fully meet the requirements:

1. **Data Completeness**: The task specifies that both Batman filming locations and supercar manufacturing facilities should be plotted, but the process largely relied on supercar factories due to difficulties in retrieving complete filming location data.
2. **Number of Points**: The task requires a combination of filming and factory locations amounting to at least six data points, but it's unclear if a sufficient number of filming locations

In [19]:
# Print the value returned by the manager agent's run.
print(op)

We've encountered some challenges in fulfilling the original task that required both Batman filming locations and supercar manufacturing locations for map plotting. Due to data limitations and errors in execution, the primary solution concentrated on supercar manufacturing facilities' data.

To provide a clear path forward, here is a summary of the completed portions, along with further steps and refined strategies conducive to ensuring task success:

### Summary of Completed Work:
- **Collected Geocoordinates**: Successfully retrieved coordinates for supercar manufacturing cities.
- **Calculated Travel Times**: Computed cargo plane travel times from these locations to Gotham (New York City).
- **Data Compilation**: Collated the data into a clear and structured DataFrame.

### Unmet Task Elements:
- **Filming Locations**: Data retrieval for filming locations faced critical limitations due to access and extraction issues.
- **Visual Map Plot**: The plot visualization did not execute due

##### Vision Agents with smolagents

In this example, imagine Alfred, the butler at Wayne Manor, is tasked with verifying the identities of the guests attending the party. As you can imagine, Alfred may not be familiar with everyone arriving. To help him, we can use an agent that verifies their identity by searching for visual information about their appearance using a VLM. This will allow Alfred to make informed decisions about who can enter. Let’s build this example!

In this approach, images are passed to the agent at the start and stored as task_images alongside the task prompt. The agent then processes these images throughout its execution.

Consider the case where Alfred wants to verify the identities of the superheroes attending the party. He already has a dataset of images from previous parties with the names of the guests. Given a new visitor’s image, the agent can compare it with the existing dataset and make a decision about letting them in.

In this case, a guest is trying to enter, and Alfred suspects that this visitor might be The Joker impersonating Wonder Woman. Alfred needs to verify their identity to prevent anyone unwanted from entering.

In [20]:
from PIL import Image
import requests
from io import BytesIO

image_urls = [
    "https://upload.wikimedia.org/wikipedia/commons/e/e8/The_Joker_at_Wax_Museum_Plus.jpg",
    "https://upload.wikimedia.org/wikipedia/en/9/98/Joker_%28DC_Comics_character%29.jpg",
    "https://cdn.pixabay.com/photo/2023/06/03/07/01/wonder-woman-8037130_1280.png",
]

# Browser-like User-Agent sent with every request; built once because it does not vary per URL.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

images = []
for url in image_urls:
    # A timeout keeps an unresponsive host from hanging the cell indefinitely.
    http_response = requests.get(url, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of handing an error page to PIL as image bytes.
    http_response.raise_for_status()
    image = Image.open(BytesIO(http_response.content)).convert("RGB")
    images.append(image)

In [22]:
# Instantiate the agent: no tools are registered, so it works only from the task text
# and the images passed to `run`.
agent = CodeAgent(
    tools=[],
    model=reasoning_model,
    max_steps=20,
    verbosity_level=2
)

# The downloaded PIL images are handed over through the `images` argument together with the task prompt.
response = agent.run(
    """
    Describe the costume and makeup that the comic character in these photos is wearing and return the description.
    Tell me if the guest is The Joker or Wonder Woman and provide a brief explanation of the look.
    Use the images provided to analyze the costume and makeup and return a detailed description for each image in the form of python string.
    The description should include colors, patterns, and any notable features of the costume and makeup.
    Make sure to use the images provided in the task.
    """,
    images=images
)

In [23]:
# Bare expression: the notebook displays the value returned by `agent.run(...)` as the cell output.
response

{'first_image': '\nThe character is wearing white makeup covering the entire face, with dark, dramatic blue eyeshadow and exaggerated red lips. \nThe eyebrows are accentuated in a dark, arched style. The costume consists of a purple jacket and a yellow shirt with a large, bright purple tie.\n',
 'second_image': '\nThe character features green hair and a white face, showcasing an exaggerated, sinister smile with red lips. \nThe costume includes a dark suit with a white shirt and a flower on the lapel. The character holds a card, possibly indicating a humorous yet menacing demeanor.\n',
 'third_image': '\nThe character is wearing a red and gold armor-like costume, with a star-emblazoned tiara on the forehead. \nThe makeup is bold, featuring dark eyeliner and a strong red lipstick. The costume includes a red flowing garment and gold accessories, conveying a powerful, warrior-like appearance.\n',
 'character_associations': {'first_and_second': 'The Joker',
  'third': 'Wonder Woman'},
 'exp

TODO:
1. This code is part of the smolagents course and demonstrates how to create a web browser agent that can browse the web, find information, and perform calculations. Refer to <url>https://huggingface.co/agents-course/notebooks/blob/main/unit2/smolagents/vision_web_browser.py</url> for the complete code.

############################################### END OF THE FILE ########################################################