In [1]:
# {
#     "freq": {"text": "Time frequency", "values": {"A": "Annual"}},
#     "twrk_hr": {
#         "text": "Taught activities during paid working hours",
#         "values": {"DPH": "During paid hours", "OPH": "Outside paid hours"},
#     },
#     "unit": {
#         "text": "Unit of measure",
#         "values": {"THS": "Thousand", "PC": "Percentage"},
#     },
#     "geo": {
#         "text": "Geopolitical entity (reporting)",
#         "values": {
#             "EU25": "European Union - 25 countries (2004-2006)",
#             "BE": "Belgium",
#             "BG": "Bulgaria",
#             "CZ": "Czechia",
#             "DK": "Denmark",
#             "EE": "Estonia",
#             "IE": "Ireland",
#             "EL": "Greece",
#             "ES": "Spain",
#             "FR": "France",
#             "IT": "Italy",
#             "CY": "Cyprus",
#             "LV": "Latvia",
#             "LT": "Lithuania",
#             "LU": "Luxembourg",
#             "HU": "Hungary",
#             "MT": "Malta",
#             "NL": "Netherlands",
#             "AT": "Austria",
#             "PL": "Poland",
#             "PT": "Portugal",
#             "RO": "Romania",
#             "SI": "Slovenia",
#             "SK": "Slovakia",
#             "FI": "Finland",
#             "SE": "Sweden",
#             "NO": "Norway",
#             "CH": "Switzerland",
#             "UK": "United Kingdom",
#         },
#     },
#     "time": {"text": "Time", "values": {}},
# }

In [2]:
import json
import cohere
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import json
import os

# load the local .env file
from dotenv import load_dotenv

# Load the local .env file first: the COHERE_API_KEY lookup below reads
# os.environ, so this call has to come before it.
load_dotenv()

# Shared Cohere client used by cohere_embedding_function.
# os.environ[...] raises KeyError if COHERE_API_KEY is not set.
co = cohere.Client(os.environ["COHERE_API_KEY"])
# NOTE(review): the comment below says "activeloop hub", but the value has no
# hub-style prefix and is passed as a plain path — presumably a local
# directory; confirm.
VECTOR_STORE_NAME = "eurostat"  # Name of vector store on activeloop hub
# TOKEN = os.environ["ACTIVELOOP_TOKEN"]


# Vector store handle shared by search_tables.
vector_store = VectorStore(path=VECTOR_STORE_NAME)


# # Potential filter function for vector_store.search


def cohere_embedding_function(
    texts, model="embed-multilingual-v3.0", input_type="search_query"
):
    """Embed one or more texts with the module-level Cohere client ``co``.

    Args:
        texts: A single string or a list of strings. A bare string is wrapped
            in a one-element list before it is sent to the API.
        model: Name of the Cohere embedding model to use.
        input_type: Value forwarded to ``co.embed`` as ``input_type``. The
            default, ``"search_query"``, matches the previous hard-coded
            behaviour; pass another value (e.g. ``"search_document"``) when
            embedding texts for indexing rather than querying.

    Returns:
        The ``embeddings`` attribute of the ``co.embed`` response.
    """
    # Callers such as a vector-store search may pass the raw query string.
    if isinstance(texts, str):
        texts = [texts]

    response = co.embed(texts, model=model, input_type=input_type)
    return response.embeddings


def search_tables(search_string: str, k: int = 10):
    """Query the module-level ``vector_store`` for tables matching a string.

    The query text is embedded with ``cohere_embedding_function`` and the
    ``k`` best matches are requested.

    Args:
        search_string: Free-text query.
        k: Number of results to request from the vector store.

    Returns:
        Whatever ``vector_store.search`` returns for the requested tensors
        ("text", "code", "start_date", "end_date").
    """
    # exec_option is not passed (a former "tensor_db" setting is disabled),
    # so the store's own default is used.
    wanted_tensors = ["text", "code", "start_date", "end_date"]
    return vector_store.search(
        embedding_data=search_string,
        embedding_function=cohere_embedding_function,
        return_tensors=wanted_tensors,
        k=k,
    )


def _unwrap_single_item_list(value):
    """Return the sole element of a one-element list, else ``value`` unchanged."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value


def format_search_results(search_results: dict, include_score: bool = False) -> list:
    """Formats the search results to the format expected by the frontend.

    Converts a column-oriented dict (``{key: [v0, v1, ...]}``) into a list of
    row dicts (``[{key: v0, ...}, {key: v1, ...}, ...]``).

    Args:
        search_results: Mapping from field name to a list of per-result
            values. The mapping is NOT modified.
        include_score: When False (default) the ``"score"`` field, if
            present, is left out of the output rows.

    Returns:
        A list with one dict per result, each carrying the same keys as
        ``search_results`` (minus ``"score"`` unless requested). Values that
        are one-element lists are unwrapped to their single element. An empty
        list is returned when there are no fields to format.

    Raises:
        IndexError: If a field holds fewer values than the first field.
    """
    # Filter into a new dict instead of popping "score" from the caller's
    # object, so the input can still be inspected or reused afterwards.
    columns = {
        key: values
        for key, values in search_results.items()
        if include_score or key != "score"
    }
    if not columns:
        return []

    # The number of results is taken from the first field; all fields are
    # expected to have the same length.
    nbr_of_results = len(next(iter(columns.values())))

    return [
        {key: _unwrap_single_item_list(values[i]) for key, values in columns.items()}
        for i in range(nbr_of_results)
    ]


def search_eurostat(search_string: str, year: int = None, k: int = 10) -> list:
    """Performs a search in Eurostat based on the given search string.

    Args:
        search_string: Free-text query forwarded to ``search_tables``.
        year: Currently unused — accepted for interface compatibility but no
            filtering by year is applied.
        k: Number of results to request.

    Returns:
        The list produced by ``format_search_results`` for the raw search
        output (scores are not included).
    """
    raw_results = search_tables(search_string, k=k)
    # A reranking step, if one is added, belongs here: between the raw search
    # and the formatting.
    return format_search_results(raw_results)


def od_search(search_string: str, k: int = 10):
    """Search Eurostat and attach a data-browser URL to every hit.

    Args:
        search_string: Free-text query forwarded to ``search_eurostat``.
        k: Number of results to request.

    Returns:
        A list of the result dicts from ``search_eurostat``, each extended
        in place with a ``"url"`` key built from its lower-cased ``"code"``.
    """
    url_template = "https://ec.europa.eu/eurostat/databrowser/view/{CODE}/default/table"

    hits = list(search_eurostat(search_string, k=k))
    for hit in hits:
        hit["url"] = url_template.format(CODE=hit["code"].lower())
    return hits

Deep Lake Dataset in eurostat already exists, loading from the storage


In [4]:
# Example query: run a search and list every hit with its data-browser link.
term = "interest rates"

# k is not passed, so od_search's default of 10 results applies.
results = od_search(search_string=term)

# One "title: url" line per result.
for res in results:
    title = res["text"]
    url = res["url"]
    print(f"{title}: {url}")

Money market interest rates: https://ec.europa.eu/eurostat/databrowser/view/enpe_irt_st/default/table
Interest rates - monthly data: https://ec.europa.eu/eurostat/databrowser/view/ei_mfir_m/default/table
3-month-interest rate: https://ec.europa.eu/eurostat/databrowser/view/teimf040/default/table
Day-to-day money market interest rates: https://ec.europa.eu/eurostat/databrowser/view/teimf100/default/table
Tax rate: https://ec.europa.eu/eurostat/databrowser/view/earn_nt_taxrate/default/table
Money market interest rates - annual data: https://ec.europa.eu/eurostat/databrowser/view/irt_st_a/default/table
Money market interest rates - monthly data: https://ec.europa.eu/eurostat/databrowser/view/irt_st_m/default/table
Money market interest rates - quarterly data: https://ec.europa.eu/eurostat/databrowser/view/irt_st_q/default/table
Exchange rates: https://ec.europa.eu/eurostat/databrowser/view/med_ec9/default/table
Loan and deposit one year interest rate: https://ec.europa.eu/eurostat/databro