In [8]:
import os
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment so
# the os.getenv(...) lookups in the next cell can see them.
load_dotenv()

True

In [9]:
# --- Credentials and settings pulled from the environment -------------------
# Every value is None when the corresponding variable is not set.

# OpenAI
# NOTE(review): the name lacks the "AI" of the conventional OPENAI_API_KEY and
# this variable is not referenced anywhere else in the visible notebook --
# confirm which variable name the .env file actually defines.
OPEN_API_KEY = os.environ.get("OPEN_API_KEY")

# LangSmith tracing
LANGSMITH_TRACING = os.environ.get("LANGSMITH_TRACING")
LANGSMITH_API_KEY = os.environ.get("LANGSMITH_API_KEY")
LANGSMITH_PROJECT = os.environ.get("LANGSMITH_PROJECT")
LANGSMITH_ENDPOINT = os.environ.get("LANGSMITH_ENDPOINT")

# Tavily web search (passed to TavilyClient further down)
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")

In [10]:
from langchain_openai import ChatOpenAI

# Chat model shared by the notebook. temperature=0 asks for the least random
# sampling. No api_key argument is passed here.
# NOTE(review): presumably the key is picked up from the environment loaded by
# load_dotenv() -- confirm the expected variable is defined in .env.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

In [11]:
from tavily import TavilyClient

# Initialize the Tavily client used by tavily_search_tool below.
# TAVILY_API_KEY comes from os.getenv and is None when the variable is unset.
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)


In [12]:
from pydantic import BaseModel, Field


from typing import TypedDict, List
from typing import Annotated
from operator import add
from typing import Literal

In [13]:
class TavilySearchOutput(BaseModel):
    """One search hit kept from a Tavily response: title, URL and text snippet."""
    title: str  # taken from the hit's "title" key
    url: str  # taken from the hit's "url" key
    content: str  # taken from the hit's "content" key


In [14]:

class AgentState(TypedDict):
    """State dictionary passed to (and updated by) the notebook's tool functions."""
    query: str  # free-text search term read by tavily_search_tool
    tavilySearchOutput: List[TavilySearchOutput]  # dataset links returned by tavily_search_tool

In [23]:
# This function takes your search term, looks it up on three dataset-heavy sites, throws away useless or duplicate links, and gives you back a clean list of dataset pages you can actually use.

def tavily_search_tool(state: AgentState):
    """Search Kaggle, Hugging Face and GitHub for datasets matching ``state["query"]``.

    One Tavily search is issued per site. Hits that do not look like dataset
    pages are dropped, duplicates are removed, the rest is scored with a simple
    keyword heuristic, and at most 10 links are returned. The best hit of every
    site that produced one is always included so the result stays mixed.

    A failed or malformed search for one site is logged as a warning and
    skipped; it never aborts the whole lookup.

    Args:
        state: Agent state; only the ``"query"`` key is read.

    Returns:
        A dict with the single key ``"tavilySearchOutput"`` mapping to a list
        of up to 10 ``TavilySearchOutput`` items (empty when nothing usable
        was found).
    """
    import logging  # function-scope import keeps this cell runnable on its own

    logger = logging.getLogger(__name__)
    max_links = 10
    query = state["query"]
    # Hoisted out of relevance_score: the token set is the same for every hit.
    query_tokens = set(query.lower().split())

    site_queries = {
        "kaggle": f"{query} dataset site:kaggle.com/datasets",
        "huggingface": f"{query} dataset site:huggingface.co/datasets",
        "github": f"{query} dataset site:github.com (dataset OR data)"
    }

    def which_site(url: str) -> str | None:
        """Classify a URL as a kaggle / huggingface / github data link, else None."""
        u = url.lower()
        if "kaggle.com/datasets/" in u:
            return "kaggle"
        if "huggingface.co/datasets/" in u:
            return "huggingface"
        # GitHub has no dedicated dataset section, so require a data-looking path.
        if "github.com" in u and any(p in u for p in ["/data", "/dataset", "/datasets", "/blob/", "/tree/", "/raw/", ".csv", ".xlsx", ".json"]):
            return "github"
        return None

    def relevance_score(site: str, title: str, url: str, content: str) -> float:
        """Heuristic score: file-like URL, data words in the snippet, query overlap, site prior."""
        t = (title or "").lower()
        u = (url or "").lower()
        c = (content or "").lower()
        score = 0.0
        # Strong signals of downloadable/data-bearing resources
        if any(x in u for x in [".csv", ".xlsx", ".json", "/download", "/resolve/", "/raw/"]):
            score += 3.0
        if any(x in c for x in ["csv", "xlsx", "json", "download"]):
            score += 2.0
        # Query keyword overlap (simple substring match per token)
        for tok in query_tokens:
            if tok in t or tok in c:
                score += 0.5
        # Site priors (dataset-centric sites get a small boost)
        if site in ("kaggle", "huggingface"):
            score += 0.5
        return score

    def as_text(value) -> str:
        """Coerce a possibly missing / null / non-string field to str for the model."""
        if isinstance(value, str):
            return value
        return "" if value is None else str(value)

    candidates: list[tuple[str, TavilySearchOutput, float]] = []
    seen_keys: set[str] = set()

    # Collect per-site, compute scores
    for site, site_query in site_queries.items():
        try:
            response = tavily_client.search(query=site_query, search_depth="basic", max_results=max_links)
        except Exception as exc:
            # Best effort: one failing site must not sink the others, but the
            # failure has to be visible (e.g. a missing or invalid API key).
            logger.warning("Tavily search for %s failed (%r): %s", site, site_query, exc)
            continue
        if not isinstance(response, dict) or "results" not in response:
            logger.warning("Tavily search for %s returned an unexpected payload; skipping", site)
            continue
        for hit in response["results"] or []:
            if not isinstance(hit, dict):
                continue
            url = hit.get("url")
            if not isinstance(url, str) or not url:
                continue
            site_name = which_site(url)
            if site_name is None:
                continue
            # Treat "page", "page/" and "page#anchor" as the same link.
            dedup_key = url.split("#", 1)[0].rstrip("/")
            if dedup_key in seen_keys:
                continue
            seen_keys.add(dedup_key)
            # A key that is present but null would otherwise fail model validation.
            title = as_text(hit.get("title"))
            content = as_text(hit.get("content"))
            tso = TavilySearchOutput(title=title, url=url, content=content)
            candidates.append((site_name, tso, relevance_score(site_name, title, url, content)))

    # Sort globally by relevance (desc); the sort is stable, so ties keep fetch order.
    candidates.sort(key=lambda item: item[2], reverse=True)

    # Seed with the top hit of each site (when it has one) to ensure a mix.
    selected: list[TavilySearchOutput] = []
    used_urls: set[str] = set()
    for wanted in ("kaggle", "huggingface", "github"):
        best = next((tso for site_name, tso, _ in candidates if site_name == wanted), None)
        if best is not None:
            selected.append(best)
            used_urls.add(best.url)

    # Fill remaining slots by overall relevance
    for _, tso, _ in candidates:
        if len(selected) >= max_links:
            break
        if tso.url in used_urls:
            continue
        selected.append(tso)
        used_urls.add(tso.url)

    return {"tavilySearchOutput": selected}

In [24]:
# Smoke test of the search tool. Only "query" is supplied because that is the
# only key tavily_search_tool reads from the state.
# NOTE(review): "Coronvirus" looks like a typo for "Coronavirus"; it is left
# untouched because it is the exact query string sent to the search API.
state = AgentState(query="Coronvirus diseases")

tavily_search_tool(state)

{'tavilySearchOutput': [TavilySearchOutput(title='Novel Corona Virus 2019 Dataset - Kaggle', url='https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset', content='Content. This dataset has daily level information on the number of affected cases, deaths and recovery from 2019 novel coronavirus. Please note that this is a'),
  TavilySearchOutput(title='xhluca/publichealth-qa · Datasets at Hugging Face', url='https://huggingface.co/datasets/xhluca/publichealth-qa', content='COVID-19 is a new disease, caused be a novel (or new) coronavirus that has not previously been seen in humans. The name of this disease was selected following'),
  TavilySearchOutput(title='awesome-data/coronavirus.md at main - GitHub', url='https://github.com/datasets/awesome-data/blob/main/coronavirus.md', content='This dataset includes time series data tracking the number of people affected by COVID-19 worldwide, including: confirmed tested cases of Coronavirus infection'),
  TavilySearchOu

# Get datasets from Kaggle

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi

# One-off download of a single, hard-coded Kaggle dataset into ./data.
api = KaggleApi()
# NOTE(review): presumably reads the Kaggle credentials from the local Kaggle
# configuration (kaggle.json or environment variables) -- confirm it is set up.
api.authenticate()

api.dataset_download_files(
    # "<owner>/<dataset-name>" identifier of the dataset to fetch
    'sudalairajkumar/novel-corona-virus-2019-dataset',
    path='./data',
    force=True,
    # quiet=False keeps the download progress output visible in the cell
    quiet=False,
    # extract the downloaded archive into `path`
    unzip=True
)
print("done")

Dataset URL: https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset
Downloading novel-corona-virus-2019-dataset.zip to ./data


100%|██████████| 8.52M/8.52M [00:00<00:00, 1.68GB/s]


done





# Tool to download Kaggle Files

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import re

def download_kaggle_files(url: str, data_dir: str = './data') -> dict[str, list[str]]:
    """
    Download a Kaggle dataset by URL, unzip it, and list its CSV / Excel files.

    Only files that this call created or rewrote are reported, so tabular files
    left in ``data_dir`` by earlier downloads of other datasets are not mixed
    into the result. Both lists are sorted so the output is reproducible.

    Args:
        url (str): The Kaggle dataset URL, e.g. 'https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset' or 'https://www.kaggle.com/datasets/einsteindata4u/covid19'
        data_dir (str): Directory to download and extract files to (created if missing).

    Returns:
        dict[str, list[str]]: Dictionary with keys 'csv' and 'excel', each mapping to a
        sorted list of file paths (rooted at ``data_dir``).

    Raises:
        ValueError: If no '<owner>/<dataset>' slug can be parsed from ``url``.
        Errors raised by the Kaggle API client are not caught and propagate unchanged.
    """
    # Extract the "<owner>/<dataset>" slug; stops at the next '/', '?' or '#'.
    m = re.search(r'kaggle\.com/datasets/([^/]+/[^/?#]+)', url)
    if not m:
        raise ValueError(f"Could not parse Kaggle dataset slug from URL: {url}")
    dataset_slug = m.group(1)

    def _snapshot() -> dict[str, tuple[int, int]]:
        """Map every file under data_dir to its (mtime_ns, size) signature."""
        signatures: dict[str, tuple[int, int]] = {}
        for root, _, names in os.walk(data_dir):
            for name in names:
                path = os.path.join(root, name)
                try:
                    info = os.stat(path)
                except OSError:
                    continue  # file vanished or is unreadable; nothing to report
                signatures[path] = (info.st_mtime_ns, info.st_size)
        return signatures

    api = KaggleApi()
    api.authenticate()

    os.makedirs(data_dir, exist_ok=True)
    before = _snapshot()
    api.dataset_download_files(
        dataset_slug,
        path=data_dir,
        force=True,
        quiet=True,
        unzip=True
    )
    after = _snapshot()

    # Files that are new, or whose signature changed, belong to this download.
    touched = [path for path, signature in after.items() if before.get(path) != signature]
    if not touched:
        # Nothing looks modified (e.g. very coarse filesystem timestamps):
        # fall back to reporting everything in data_dir rather than nothing.
        touched = list(after)

    csv_files = sorted(p for p in touched if p.lower().endswith('.csv'))
    excel_files = sorted(p for p in touched if p.lower().endswith(('.xlsx', '.xls')))
    return {"csv": csv_files, "excel": excel_files}

# Example usage: download one dataset into the default ./data directory and
# print the resulting {'csv': [...], 'excel': [...]} mapping of file paths.
tabular_files = download_kaggle_files('https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset')
print(tabular_files)

Dataset URL: https://www.kaggle.com/datasets/sudalairajkumar/novel-corona-virus-2019-dataset
{'csv': ['./data/time_series_covid_19_confirmed_US.csv', './data/time_series_covid_19_recovered.csv', './data/time_series_covid_19_deaths_US.csv', './data/covid_19_data.csv', './data/time_series_covid_19_deaths.csv', './data/time_series_covid_19_confirmed.csv'], 'excel': []}
