In [99]:
#basic imports
from langchain_core.tools import tool,Tool
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
import openai

import os

# special imports
from langchain_community.tools import DuckDuckGoSearchRun ###simple search tool
from langchain_community.tools import DuckDuckGoSearchResults ### returns snipeet,title,link
from langchain_community.utilities import GoogleSerperAPIWrapper
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
import requests
from langchain_experimental.utilities import PythonREPL


load_dotenv() 

True

In [None]:
# --- Basic operations --- #

@tool
def multiply(a: float, b: float) -> float:
    """Multiplies two numbers.
    Args:
        a (float): the first number
        b (float): the second number
    """
    return a * b


@tool
def add(a: float, b: float) -> float:
    """Adds two numbers.
    Args:
        a (float): the first number
        b (float): the second number
    """
    return a + b


@tool
def subtract(a: float, b: float) -> int:
    """Subtracts two numbers.
    Args:
        a (float): the first number
        b (float): the second number
    """
    return a - b


@tool
def divide(a: float, b: float) -> float:
    """Divides two numbers.
    Args:
        a (float): the first float number
        b (float): the second float number
    """
    if b == 0:
        raise ValueError("Cannot divided by zero.")
    return a / b


@tool
def modulus(a: int, b: int) -> int:
    """Get the modulus of two numbers.
    Args:
        a (int): the first number
        b (int): the second number
    """
    return a % b


@tool
def power(a: float, b: float) -> float:
    """Get the power of two numbers.
    Args:
        a (float): the first number
        b (float): the second number
    """
    return a**b


@tool
def largest(a:list[float]) -> float;
    """Get the largest number in a list.
    Args:
        a (list[float]): the list of numbers
    """
    return max(a)

@tool
def smallest(a:list[float]) -> float;
    """Get the smallest number in a list.
    Args:
        a (list[float]): the list of numbers
    """
    return min(a)
@tool
def average(a:list[float]) -> float;
    """Get the average of a list of numbers.
    Args:
        a (list[float]): the list of numbers
    """
    return sum(a)/len(a)




In [2]:
# --- Advanced operations --- #

# duckduckgo search tool
simple_search = DuckDuckGoSearchRun()


# duckduckgo search tool returns snipeet,title,link
search = DuckDuckGoSearchResults(output_format="list")


# google search tool 
search = GoogleSerperAPIWrapper()
# Need to explicitly convert search function to a tool
google_search = Tool(
    name="Google Search",
    func=lambda q: str(search.results(q)),  # returns list of dicts with 'link'
    description="Search Google and get structured results including links.",
)


#python interpreter
python_repl = PythonREPL()
repl_tool = Tool(
    name="python_repl",
    description="A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
    func=python_repl.run,
)

In [3]:
repl_tool.invoke("print(1+1)")

Python REPL can execute arbitrary code. Use with caution.


'2\n'

In [None]:

from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
import requests
from playwright.async_api import async_playwright
import nest_asyncio
import asyncio
from langchain.tools import Tool

nest_asyncio.apply()

async def get_webpage_content_async(url: str) -> str:
    try:
        # Check for PDF
        head = requests.head(url, timeout=5)
        if url.lower().endswith(".pdf") or "application/pdf" in head.headers.get("Content-Type", "").lower():
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            pdf = PdfReader(BytesIO(response.content))
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
            return text[:20000] or "No readable text extracted from PDF."

        # Playwright browser with stealth settings
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)  # Headful helps bypass Cloudflare
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
                viewport={'width': 1280, 'height': 800},
                java_script_enabled=True
            )
            page = await context.new_page()
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.wait_for_timeout(5000)  # wait for Cloudflare challenge

            content = await page.content()
            await browser.close()

        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000] or "No readable text extracted from webpage."

    except Exception as e:
        return f"Error fetching content: {e}"

# Wrapper for Jupyter
def get_webpage_content_sync(url: str) -> str:
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(get_webpage_content_async(url))

# LangChain tool
get_webpage_content = Tool(
    name="get_webpage_content",
    func=get_webpage_content_sync,
    description="Fetch readable text content from webpages or PDFs, handling anti-bot protections."
)


In [None]:
get_webpage_content.invoke("https://www.universetoday.com/articles/astronomers-watch-a-black-hole-wake-up-in-real-time")

  self._linkage_fixer(parent)


'Astronomers Watch a Black Hole Wake Up in Real Time - Universe Today\nUniverse Today\nHome\nVideos\nPodcast\nNewsletter\nJoin the Club\nRSS Feed\nAstronomers Watch a Black Hole Wake Up in Real Time\nBy Carolyn Collins Petersen\nApril 19, 2025\nA giant black hole awakens with brilliant bursts of optical and x-ray light. Courtesy ESA.\nYou never know exactly when a central supermassive black hole is going to power up and start gobbling matter. Contrary to the popular view that these monsters are constantly devouring nearby stars and gas clouds, it turns out they spend part of their existence dormant and inactive. New observations from the European Space Agency\'s XMM-Newton spacecraft opened a window on the "turn on event" for one of these monsters in a distant galaxy.\nThe supermassive black hole at the heart of the galaxy SDSS1335+0728 has been sleeping quietly for decades. Then, it suddenly lit up in 2019, and astronomers caught the outbursts in both optical and x-ray light. The gala

In [51]:
get_webpage_content.invoke("https://journal.finfar.org/articles/dragons-are-tricksy-the-uncanny-dragons-of-childrens-literature/")

'“Dragons are Tricksy”: The Uncanny Dragons of Children’s Literature |\nMenu\nSkip to content\nHome\nJournal\nCurrent Issue\nArchive\nFor Authors\nEditorial Policies\nSubmission Guidelines\nEditors and Contact Info\nAdvisory Board\nPublisher\nArchive\n2014\nFafnir 1/2014\nFafnir 2/2014\nFafnir 3/2014\nFafnir 4/2014\n2015\nFafnir 1/2015\nFafnir 2/2015\nFafnir 3/2015\nFafnir 4/2015\n2016\nFafnir 1/2016\nFafnir 2/2016\nFafnir 3/2016\nFafnir 4/2016\n2017\nFafnir 1/2017\nFafnir 2/2017\nFafnir 3–4/2017\n2018\nFafnir 1/2018\nFafnir 2/2018\n2019\nFafnir 1/2019\nFafnir 2/2019\n2020\nFafnir 1/2020\nFafnir 2/2020\n2021\nFafnir 1/2021\nFafnir 2/2021\n2022\nFafnir 1/2022\nFafnir 2/2022\n2023\nFafnir 1/2023\nFafnir 2/2023\nFafnir – Nordic Journal of Science Fiction and Fantasy Research\n, Volume 1, Issue 2, pages 41–54.\nEmily Midkiff\n“Dragons are Tricksy”:\nThe Uncanny Dragons of Children’s Literature\nAbstract\n: As early as the sixties, scholars of children’s literature have noted a trend to sof

In [45]:
result = get_webpage_content.func("https://www.universetoday.com/articles/astronomers-watch-a-black-hole-wake-up-in-real-time")
print(result[:10000])  # preview only first 1000 chars


Just a moment...
www.universetoday.com
Verifying you are human. This may take a few seconds.
www.universetoday.com needs to review the security of your connection before proceeding.
Verification successful
Waiting for www.universetoday.com to respond...
Enable JavaScript and cookies to continue
Ray ID:
93e9781dbe5e93ad
Performance & security by
Cloudflare


In [46]:
import asyncio
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
import requests
from playwright.async_api import async_playwright
from langchain.tools import Tool

async def get_webpage_content_async(url: str) -> str:
    try:
        # Check for PDF
        head = requests.head(url, timeout=5)
        if url.lower().endswith(".pdf") or "application/pdf" in head.headers.get("Content-Type", "").lower():
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            pdf = PdfReader(BytesIO(response.content))
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
            return text[:20000] or "No readable text extracted from PDF."

        # Use Playwright Async
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto(url, timeout=15000)
            content = await page.content()
            await browser.close()

        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000] or "No readable text extracted from webpage."

    except Exception as e:
        return f"Error fetching content: {e}"

# Synchronous wrapper for LangChain Tool
def get_webpage_content_sync(url: str) -> str:
    try:
        return asyncio.run(get_webpage_content_async(url))
    except RuntimeError:
        # If inside an already running event loop (like Jupyter), fallback using nest_asyncio
        import nest_asyncio
        nest_asyncio.apply()
        return asyncio.get_event_loop().run_until_complete(get_webpage_content_async(url))

# LangChain tool
get_webpage_content = Tool(
    name="get_webpage_content",
    func=get_webpage_content_sync,
    description=(
        "Use this tool to fetch metadata or links from a webpage or PDF URL. "
        "Returns the first 20000 characters of webpage or PDF content."
    )
)

In [47]:
get_webpage_content.invoke("https://www.universetoday.com/articles/astronomers-watch-a-black-hole-wake-up-in-real-time")

'Just a moment...\nwww.universetoday.com\nVerifying you are human. This may take a few seconds.\nwww.universetoday.com needs to review the security of your connection before proceeding.\nVerification successful\nWaiting for www.universetoday.com to respond...\nEnable JavaScript and cookies to continue\nRay ID:\n93e97a431eb6f892\nPerformance & security by\nCloudflare'

In [30]:
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO

# --- Webpage content extraction --- #
def get_webpage_content(url: str) -> str:
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/113.0.0.0 Safari/537.36"
            )
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        content_type = response.headers.get("Content-Type", "").lower()

        # Check if it's a PDF
        if "application/pdf" in content_type or url.lower().endswith(".pdf"):
            pdf = PdfReader(BytesIO(response.content))
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""
            return text[:20000]  # Truncate for safety

        # Else assume it's HTML
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000]

    except Exception as e:
        return f"Error fetching content: {e}"
    

# def extract_web_content(url: str) -> str:
#     return asyncio.run(get_webpage_content_async(url))



get_webpage_content = Tool(
    name="get_webpage_content",
    func=get_webpage_content,
    description=(
        "Use this tool to fetch metadata or links from a webpage or PDF URL. "
        "Returns the first 5000 characters of webpage or PDF content. "
        # "DO NOT use this tool to load or analyze large files (CSV, PDF, Excel, etc.). "
        # "Instead, if the page contains a downloadable file link, extract just the file URL and pass it to the Python tool."
    )
)

In [29]:
get_webpage_content.invoke("https://www.universetoday.com/articles/astronomers-watch-a-black-hole-wake-up-in-real-time")

'Error fetching content: 403 Client Error: Forbidden for url: https://www.universetoday.com/articles/astronomers-watch-a-black-hole-wake-up-in-real-time'

In [33]:
chat=ChatOpenAI()

In [36]:
chat.invoke("What is the capital of India?")

parser=StrOutputParser()

parser.invoke(chat.invoke("What is the capital of India?"))



'The capital of India is New Delhi.'

In [17]:
from toolkit import tool_list

tool_list[10]

Tool(name='DuckDuckGo Search', description="Search the web using DuckDuckGo and return a list of relevant results. Useful for answering general knowledge questions or finding recent news.The output is a list of dictionaries, each containing 'title', 'snippet', and 'link' keys.The 'link' key contains the URL of the result, which can be used to fetch more information.", func=<bound method BaseTool.run of DuckDuckGoSearchResults(api_wrapper=DuckDuckGoSearchAPIWrapper(region='wt-wt', safesearch='moderate', time='y', max_results=5, backend='auto', source='text'), output_format='list')>)

In [19]:
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

In [None]:
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

wikipedia_search=Tool(
    name="wikipedia_search",
    func=wikipedia.run,
    description=(
        "search argument should be at least 3 words long. "
        "Fetches summaries from Wikipedia. Useful for general knowledge or factual questions about entities, events, concepts, etc."
    "Prioritze this wikipedia search tool over web search and get web content for general knowledge questions. "
    )

)


In [26]:
wikipedia_search.invoke("mily Midkiff June 2014 article Hreidmar's sons journal dragon depiction")

'No good Wikipedia Search Result was found'

In [3]:
# Google search tool
search = GoogleSerperAPIWrapper()

# Need to explicitly convert search function to a tool
google_search_tool = Tool(
    name="Google_Search",
    func=lambda q: search.results(q)['organic'][:3], # Limit to top 3 results
    description="Search web using Google and get structured results that includes links, which can be used to fetch more information.",
)

In [4]:
google_search_tool.invoke("Emily Midkiff June 2014 article Hreidmar's sons journal dragon depiction")

[{'title': "“Dragons are Tricksy”: The Uncanny Dragons of Children's Literature |",
  'link': 'https://journal.finfar.org/articles/dragons-are-tricksy-the-uncanny-dragons-of-childrens-literature/',
  'snippet': "This article argues that children's literature dragons have been not been entirely softened and satirized.",
  'attributes': {'Missing': "Hreidmar's | Show results with:Hreidmar's"},
  'position': 1},
 {'title': '[PDF] Adaptive In-conversation Team Building for Language Model Agents',
  'link': 'https://arxiv.org/pdf/2405.19425?',
  'snippet': "Identify the journal named for one of Hreidmar's sons that guarded his house. 2. Locate Emily Midkiff's June 2014 article in that journal. 3 ...",
  'date': 'Mar 2, 2025',
  'position': 2},
 {'title': 'm-ric/agents_small_benchmark · Datasets at Hugging Face',
  'link': 'https://huggingface.co/datasets/m-ric/agents_small_benchmark',
  'snippet': "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that g

In [48]:
os.getenv("SPACE_ID")

In [49]:
print(os.getenv("SPACE_ID"))

None


In [53]:
from toolkit import tool_list

In [59]:
print(tool_list[:])

[StructuredTool(name='multiply', description='Multiplies two numbers.\n    Args:\n        a (float): the first number\n        b (float): the second number', args_schema=<class 'langchain_core.utils.pydantic.multiply'>, func=<function multiply at 0x115fae9e0>), StructuredTool(name='add', description='Adds two numbers.\n    Args:\n        a (float): the first number\n        b (float): the second number', args_schema=<class 'langchain_core.utils.pydantic.add'>, func=<function add at 0x116157eb0>), StructuredTool(name='subtract', description='Subtracts two numbers.\n    Args:\n        a (float): the first number\n        b (float): the second number', args_schema=<class 'langchain_core.utils.pydantic.subtract'>, func=<function subtract at 0x116157a30>), StructuredTool(name='divide', description='Divides two numbers.\n    Args:\n        a (float): the first float number\n        b (float): the second float number', args_schema=<class 'langchain_core.utils.pydantic.divide'>, func=<function

In [57]:
print(i for i in tool_list)

<generator object <genexpr> at 0x124c4b0d0>


In [53]:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import extract


@tool
def get_youtube_transcript(page_url: str) -> str:
    """Get the transcript of a YouTube video
    Args:
        page_url (str): YouTube URL of the video
    """
    try:
        # get video ID from URL
        video_id = extract.video_id(page_url)

        # get transcript
        ytt_api = YouTubeTranscriptApi()
        transcript = ytt_api.fetch(video_id)

        # keep only text
        txt = '\n'.join([s.text for s in transcript.snippets])
        return txt[:20000]
    except Exception as e:
        return f"get_youtube_transcript failed: {e}"

In [57]:
get_youtube_transcript.invoke('https://www.youtube.com/watch?v=1htKBjuUWec')



"Wow this coffee's great I was just\nthinking that\nyeah is that cinnamon chicory\ntea oak\n[Music]\nisn't that hot\nextremely"

In [None]:
from openai import OpenAI

@tool
def automatic_speech_recognition(file_url: str, file_extension: str) -> str:
    """Transcribe an audio file to text
    Args:
        file_url (str): the URL to the audio file
        file_extension (str): the file extension, e.g. mp3
    """
    try:
        
        response = requests.get(file_url)
        response.raise_for_status()
        # write to disk
        file_extension = file_extension.replace('.','')
        with open(f'tmp.{file_extension}', 'wb') as file:
            file.write(response.content)

        audio_file = open(f'tmp.{file_extension}', "rb")
        client = OpenAI()
        transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )
        
        return transcription.text

        # return 'sample'

    except Exception as e:
        return f"automatic_speech_recognition failed: {e}"
    

automatic_speech_recognition.invoke({'file_url':"https://raw.githubusercontent.com/prateekralhan/Speech2Text-for-Long-Audio-Files/master/audio_wav/genevieve.wav",'file_extension': '.wav'})


'This dynamic workshop aims to provide up-to-date information on pharmacological approaches, issues, and treatment in the geriatric population to assist in preventing medication-related problems, appropriately and effectively managing medications and compliance, period. The concept of polypharmacy, parentheses, taking multiple types of drugs, parentheses, will also be discussed, comma, as this is a common issue that can impact adverse side effects in the geriatric population, period. Participants will leave with the knowledge and considerations of common drug interaction and how to minimize effects that limit function, period. Summit Professional Education is approved provider of continuing education, period. This course is offered for six units, period. This course contains content classified under both the domain of occupational therapy and professional issues, period.'