In [23]:
import csv
import pandas as pd
from pandas.api.types import is_period_dtype

import asyncio
from typing import List, Dict, Optional
from playwright.async_api import async_playwright, Page
from urllib.parse import urljoin
from datetime import datetime
import os
import math
import time
import datetime as dt
from typing import Optional, Tuple
import requests
import duckdb

import asyncio, csv, re
from datetime import datetime
from urllib.parse import urljoin
from playwright.async_api import async_playwright, Page

from dotenv import load_dotenv
load_dotenv()

True

# üß† Projeto: Enriquecimento de Base Anal√≠tica com Web Scraping e API Financeira

## üéØ Contexto

Uma **fintech de investimentos** precisa enriquecer sua base anal√≠tica com informa√ß√µes externas do mercado para apoiar decis√µes estrat√©gicas.
Como Engenheira de Dados, foi desenvolvido um **pipeline de dados** que coleta informa√ß√µes p√∫blicas de **not√≠cias** e **s√©ries financeiras**, armazena localmente em um **banco DuckDB**, e permite posterior explora√ß√£o via SQL e dashboards.

---

## üß© Objetivo

Construir um pipeline completo de **coleta, transforma√ß√£o e carga (ETL)** que una:

* **Web Scraping** de not√≠cias econ√¥micas e geopol√≠ticas (BBC News);
* **API P√∫blica** de dados financeiros (FRED e CoinGecko);
* **Integra√ß√£o anal√≠tica** em banco local **DuckDB**.

---

## ‚öôÔ∏è Stack Utilizada

| Etapa         | Tecnologia                        | Descri√ß√£o                                          |
| ------------- | --------------------------------- | -------------------------------------------------- |
| Coleta Web    | `Playwright` + `asyncio`          | Scraping ass√≠ncrono de p√°ginas de not√≠cias da BBC  |
| Coleta API    | `requests`, `pandas`              | Consumo de APIs FRED (Federal Reserve) e CoinGecko |
| Armazenamento | `DuckDB`                          | Banco anal√≠tico local com tr√™s tabelas             |
| Ambiente      | `Python 3.9+`, `Jupyter Notebook` | Execu√ß√£o e an√°lise                                 |
| Persist√™ncia  | `.duckdb`, `.parquet`, `.csv`     | Formatos intermedi√°rios                            |

---

## üåê Fontes de Dados

### üîπ Not√≠cias (Web Scraping ‚Äì BBC News)

* Fonte: [BBC News ‚Äì US-Canada](https://www.bbc.com/news/us-canada)
* Coletadas **100 not√≠cias** contendo t√≠tulo, resumo, link e data de coleta.
* Campos armazenados:

  ```
  ['title', 'url', 'summary', 'collected_at']
  ```
* Objetivo: capturar contexto geopol√≠tico e eventos com impacto em mercados.

### üîπ S√©ries Financeiras (APIs P√∫blicas)

| Fonte     | S√©rie          | Descri√ß√£o                                   | Per√≠odo  |
| --------- | -------------- | ------------------------------------------- | -------- |
| FRED      | `DCOILBRENTEU` | Pre√ßo di√°rio do petr√≥leo Brent (USD/barril) | 6+ meses |
| FRED      | `DEXUSUK`      | Taxa USD/GBP (invertida para GBP/USD)       | 6+ meses |
| CoinGecko | `BTC/USD`      | Cota√ß√£o di√°ria do Bitcoin                   | 6+ meses |

Os dados foram padronizados em base di√°ria cont√≠nua, com c√°lculo de retornos em janelas de 1, 3 e 5 dias (`r1`, `r3`, `r5`).

---

## üóÑÔ∏è Modelagem de Dados no DuckDB

### Tabelas criadas:

| Tabela          | Descri√ß√£o                       | Principais Campos                          |
| --------------- | ------------------------------- | ------------------------------------------ |
| **prices**      | S√©ries hist√≥ricas dos ativos    | `instr`, `date`, `close`, `r1`, `r3`, `r5` |
| **news_bbc**    | Not√≠cias coletadas via scraping | `title`, `url`, `summary`, `collected_at`  |
| **instruments** | Metadados dos instrumentos      | `instr_id`, `symbol`, `name`, `class`      |

```sql
-- Exemplo de schema no DuckDB
DESCRIBE prices;
DESCRIBE news_bbc;
DESCRIBE instruments;
```

---

## üìä Resultados

* **100 not√≠cias** coletadas da BBC News.
* **3 instrumentos** (Brent, GBP/USD, BTC/USD) com **211 dias** de dados cada.
* **3 tabelas anal√≠ticas** armazenadas no DuckDB (`prices`, `news_bbc`, `instruments`).
* Pipeline totalmente reprodut√≠vel e modular, pronto para expans√£o com novos t√≥picos ou ativos.

---

## üßæ Estrutura Final

```
üìÇ projeto_etl_fintech/
‚îÇ
‚îú‚îÄ‚îÄ market.duckdb                 # Banco anal√≠tico local
‚îú‚îÄ‚îÄ prices.parquet                # Dados de pre√ßos
‚îú‚îÄ‚îÄ bbc_israel_gaza_war.csv       # Not√≠cias coletadas
‚îú‚îÄ‚îÄ etl_pipeline.ipynb            # Notebook principal
‚îî‚îÄ‚îÄ requirements.txt              # Depend√™ncias fixas
```

---

## ‚úÖ Conclus√£o

O projeto integra dados n√£o estruturados (not√≠cias) e estruturados (s√©ries econ√¥micas), simulando um fluxo real de engenharia de dados.
Com as tabelas organizadas no DuckDB, √© poss√≠vel executar consultas SQL r√°pidas e realizar an√°lises temporais sobre o impacto de eventos geopol√≠ticos nos ativos financeiros.

## Configs

In [24]:
DUCKDB_PATH = os.getenv("DUCKDB_PATH")

In [25]:
# =========================
# Configs
# =========================
# 100 √∫ltimas not√≠cias do BBC US & Canada

START_URL = os.getenv("NEWS_SOURCE")
TARGET = 100
OUTCSV = "bbc_us_canada_latest_updates.csv"
BASE = "https://www.bbc.com"

In [26]:
# =========================
# Configs
# =========================
# Per√≠odo alvo (>= 6 meses); pego ~210 dias\

END = dt.date.today()
START = END - dt.timedelta(days=210)  # ~7 meses

FRED_API_KEY = os.getenv("FRED_API_KEY")  # .env
FRED_BASE = "https://api.stlouisfed.org/fred/series/observations"
FRED_SERIES = {
    "BRENT": "DCOILBRENTEU",  # Brent Europe, di√°rio
    "GBPUSD": "DEXUSUK",      # Taxa USD/GBP di√°ria 
}

COINGECKO_BASE = "https://api.coingecko.com/api/v3"
COINGECKO_COIN = "bitcoin"
COINGECKO_VS = "usd"

## Web scraping de not√≠cias 

In [27]:
# =========================
# Helpers
# =========================

def abs_url(href): return urljoin(BASE, href or "")

async def accept_cookies(page: Page):
    for sel in (
        '[data-testid="cookie-banner"] button:has-text("Accept")',
        'button:has-text("I Agree")','button:has-text("Agree")','#bbccookies-continue-button',
    ):
        b = page.locator(sel).first
        if await b.count() and await b.is_visible():
            await b.click(); break

async def wait_heading(page: Page):
    await page.wait_for_selector('h2[data-testid="alaska-title"]', timeout=15000)

async def extract_latest_updates_on_page(page: Page):
    # Extrator geom√©trico: pega links /news/ VISUALMENTE entre o heading e a pagina√ß√£o
    js = """
    () => {
      const BASE = 'https://www.bbc.com';
      const head = document.querySelector('h2[data-testid="alaska-title"]');
      if (!head) return {items: [], debug: {reason: 'no heading'}};
      const headBottom = head.getBoundingClientRect().bottom + window.scrollY;

      // tenta achar a barra de pagina√ß√£o (nav ou container com bot√µes 1,2,3)
      let pag = document.querySelector('nav[aria-label*="Pagination" i]') ||
                Array.from(document.querySelectorAll('nav, div, section'))
                  .find(n => /Go to page/i.test(n.textContent||'') || /\b1\b.*\b2\b.*\b3\b/.test(n.textContent||''));
      let pagTop = Infinity;
      if (pag) pagTop = pag.getBoundingClientRect().top + window.scrollY;

      const links = Array.from(document.querySelectorAll('a[href*="/news/"]'));
      const filtered = [];

      for (const a of links) {
        const r = a.getBoundingClientRect();
        const y = r.top + window.scrollY;
        if (y > headBottom && y < pagTop) {
          // t√≠tulo
          let title = (a.querySelector('h3,h2')?.textContent || a.textContent || '').trim().replace(/\s+/g,' ');
          if (!title || title.length < 5) continue;

          // item container p/ achar resumo/time
          const container = a.closest('li, article, div[role="listitem"], div, section') || a;
          const p = container.querySelector('p');
          const time = container.querySelector('time');

          const summary = (p?.textContent || '').trim().replace(/\s+/g,' ');
          const rel = (time?.textContent || '').trim();

          try {
            const url = new URL(a.getAttribute('href'), BASE).toString();
            filtered.push({title, url, summary, relative_date: rel});
          } catch {}
        }
      }

      // dedupe por URL e remove duplicados do mesmo t√≠tulo
      const seen = new Set();
      const items = [];
      for (const it of filtered) {
        if (!seen.has(it.url)) { seen.add(it.url); items.push(it); }
      }
      return {items, debug: {headBottom, pagTop, totalLinks: links.length, kept: items.length}};
    }
    """
    res = await page.evaluate(js)
    items = res["items"]
    iso = datetime.now().isoformat()
    for it in items: it["collected_at"] = iso
    print(f'‚Üí between heading/pagination: {res["debug"]["kept"]} of {res["debug"]["totalLinks"]}')
    return items

async def get_max_page(page: Page) -> int:
    # l√™ todos "Go to page N" e pega o maior
    nums = set()
    btns = page.locator('button[aria-label^="Go to page "]')
    for i in range(await btns.count()):
        lbl = await btns.nth(i).get_attribute("aria-label")
        m = re.search(r"(\d+)$", lbl or "")
        if m: nums.add(int(m.group(1)))
    # fallback: n√∫meros vis√≠veis no paginador
    nav = page.locator("nav").filter(has_text=re.compile(r"\b1\b"))
    if await nav.count():
        txt = " ".join(await nav.first.all_text_contents())
        for n in re.findall(r"\b\d+\b", txt):
            nums.add(int(n))
    return max(nums) if nums else 1

async def click_page_n(page: Page, n: int) -> bool:
    # garante que o paginador est√° na tela
    await page.mouse.wheel(0, 99999)
    # 1) for√ßa o clique via JS no aria-label "Go to page n"
    ok = await page.evaluate("""
    (n) => {
      const byAria = Array.from(document.querySelectorAll('button[aria-label^="Go to page "]'))
        .find(b => (b.getAttribute('aria-label')||'').trim().endsWith(String(n)));
      if (byAria) { byAria.click(); return true; }
      // fallback por texto vis√≠vel = n
      const byText = Array.from(document.querySelectorAll('nav button, button'))
        .find(b => (b.textContent||'').trim() === String(n));
      if (byText) { byText.click(); return true; }
      return false;
    }
    """, n)
    if ok:
        await page.wait_for_load_state("domcontentloaded")
        await page.wait_for_timeout(900)
        return True

    # 2) fallback: clica no chevron ">" (pr√≥xima)
    chevron = page.locator('nav button[aria-label*="next" i], nav button:has-text(">"), nav button:has-text("‚Ä∫")').first
    if await chevron.count():
        await chevron.click()
        await page.wait_for_load_state("domcontentloaded")
        await page.wait_for_timeout(900)
        return True

    return False

async def scrape_latest_updates(target: int = TARGET):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(START_URL, timeout=60_000)
        await accept_cookies(page)
        await wait_heading(page)

        results, seen = [], set()
        page_no = 1
        max_page = await get_max_page(page)

        while len(results) < target and page_no <= max_page:
            # rola um pouco p/ garantir render
            await page.mouse.wheel(0, 2200); await asyncio.sleep(0.3)
            batch = await extract_latest_updates_on_page(page)
            print(f"P√°gina {page_no} ‚Üí {len(batch)} itens")
            for it in batch:
                if it["url"] in seen: continue
                seen.add(it["url"]); results.append(it)
                if len(results) >= target: break
            if len(results) >= target: break
            page_no += 1
            if page_no > max_page: break
            # leva o paginador ao viewport e clica no n√∫mero
            await page.mouse.wheel(0, 9_999); await asyncio.sleep(0.2)
            if not await click_page_n(page, page_no):
                print(f"‚ö†Ô∏è n√£o consegui clicar na p√°gina {page_no}")
                break

        await browser.close()
        return results[:target]

async def save_csv(rows, path=OUTCSV):
    cols = ["title","url","summary","relative_date","collected_at"]
    with open(path,"w",newline="",encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols); w.writeheader()
        for r in rows: w.writerow({k:r.get(k,"") for k in cols})


In [28]:
# =========================
# Execu√ß√£o
# =========================
data = await scrape_latest_updates(100)
await save_csv(data, OUTCSV)

‚Üí between heading/pagination: 9 of 46
P√°gina 1 ‚Üí 9 itens
‚Üí between heading/pagination: 0 of 37
P√°gina 2 ‚Üí 0 itens
‚ö†Ô∏è n√£o consegui clicar na p√°gina 3


In [29]:
news = pd.DataFrame(data)

In [30]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          9 non-null      object
 1   url            9 non-null      object
 2   summary        9 non-null      object
 3   relative_date  9 non-null      object
 4   collected_at   9 non-null      object
dtypes: object(5)
memory usage: 492.0+ bytes


### Persist√™ncia

In [31]:
# --- 1) manter s√≥ as colunas com valor ---
news = news[["title", "url", "summary", "collected_at"]].copy()
news["collected_at"] = pd.to_datetime(news["collected_at"], utc=True, errors="coerce")
news = news.drop_duplicates(subset=["url"]).reset_index(drop=True)

# --- 2) conectar ao banco local ---
con = duckdb.connect(DUCKDB_PATH)

# --- 3) criar a tabela de not√≠cias com apenas as colunas ---
con.execute("""
CREATE TABLE IF NOT EXISTS news_bbc (
    title         VARCHAR,
    url           VARCHAR,
    summary       VARCHAR,
    collected_at  TIMESTAMP
);
""")

# --- 4) inserir os dados ---
con.register("tmp_news", news)
con.execute("""
INSERT INTO news_bbc
SELECT title, url, summary, collected_at
FROM tmp_news;
""")

<_duckdb.DuckDBPyConnection at 0x10dc9a4f0>

### Verifica√ß√£o

In [32]:
# --- 5) checar resultado ---
print(con.execute("SELECT COUNT(*) AS n FROM news_bbc").df())
print(con.execute("SELECT * FROM news_bbc LIMIT 5").df())

con.close()

     n
0  209
                                               title  \
0    Businesses are running out of pennies in the US   
1  SNAP benefits: When will the US government shu...   
2  US judges say Trump administration must contin...   
3  Top Republican rebuffs Trump calls to axe fili...   
4  Watch: Pet monkey gets loose inside US Hallowe...   

                                              url  \
0  https://www.bbc.com/news/articles/c20556ly45eo   
1  https://www.bbc.com/news/articles/cew4gnyw8rlo   
2  https://www.bbc.com/news/articles/cr433x9zqq4o   
3  https://www.bbc.com/news/articles/c1d0qwx5z2vo   
4    https://www.bbc.com/news/videos/c70jj362x9yo   

                                             summary  \
0  Find a penny, pick it up, then what? Now the U...   
1  The programme helps 40 million low-income Amer...   
2  In two separate rulings, US judges said the pl...   
3  Ending the long-standing rule would allow Repu...   
4  The owner of the acrobatic primate told police.

## API com dados de petr√≥leo

In [33]:
# =========================
# Helpers
# =========================

def _retry_get(url: str, params: dict = None, max_tries: int = 5, sleep_base: float = 1.0):
    for i in range(max_tries):
        r = requests.get(url, params=params, timeout=30)
        if r.status_code == 200:
            return r
        time.sleep(sleep_base * (2**i))
    r.raise_for_status()

def _reindex_full_range(df: pd.DataFrame, start: dt.date, end: dt.date, date_col="date", value_cols=None):
    """Garante cobertura di√°ria START‚ÜíEND com bfill+ffill."""
    if value_cols is None:
        value_cols = [c for c in df.columns if c != date_col]
    full = pd.DataFrame({"date": pd.date_range(start, end, freq="D").date})
    out = full.merge(df, on="date", how="left")
    # Corrige tipos num√©ricos
    for c in value_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
        out[c] = out[c].bfill().ffill()  # preenche come√ßo e meio
    return out

def fetch_fred_series_strict(series_id, start: dt.date, end: dt.date, api_key: str) -> pd.DataFrame:
    base = "https://api.stlouisfed.org/fred/series/observations"
    params = {
        "series_id": series_id,
        "api_key": api_key,
        "file_type": "json",
        "observation_start": start.isoformat(),
        "observation_end": end.isoformat(),
    }
    r = _retry_get(base, params=params)
    data = r.json().get("observations", [])
    df = pd.DataFrame(data)[["date", "value"]] if data else pd.DataFrame(columns=["date","value"])
    df["date"] = pd.to_datetime(df["date"]).dt.date
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = _reindex_full_range(df, start, end, value_cols=["value"])
    return df.rename(columns={"value": "close"})

def fetch_coingecko_btc_strict(start: dt.date, end: dt.date, vs_currency="usd") -> pd.DataFrame:
    base = "https://api.coingecko.com/api/v3/coins/bitcoin/market_chart"
    days = (end - start).days + 5
    r = _retry_get(base, params={"vs_currency": vs_currency, "days": days})
    js = r.json()
    p = pd.DataFrame(js.get("prices", []), columns=["ts_ms", "close"])
    if p.empty:
        p = pd.DataFrame(columns=["date", "close"])
    else:
        p["date"] = pd.to_datetime(p["ts_ms"], unit="ms").dt.date
        p = p.sort_values("ts_ms").groupby("date", as_index=False).tail(1)[["date","close"]]
        p = p[(p["date"] >= start) & (p["date"] <= end)]
    p = _reindex_full_range(p, start, end, value_cols=["close"])
    return p

def ensure_min_6_months(df, start, end, date_col="date"):
    if df.empty:
        raise AssertionError("DataFrame vazio.")
    span = (df[date_col].max() - df[date_col].min()).days
    if span < 180:
        raise AssertionError(f"Menos de 6 meses: {span} dias.")
    # no m√°ximo 1% de buracos (ap√≥s reindex + bfill/ffill deve ser 0)
    expected = set(pd.date_range(start, end, freq="D").date)
    got = set(df[date_col].values)
    missing = expected - got
    if len(missing) > len(expected) * 0.01:
        raise AssertionError(f"Muitas datas faltando ({len(missing)}).")

In [34]:
# =========================
# Execu√ß√£o
# =========================

if __name__ == "__main__":
    print(f"Coletando de {START} at√© {END} (~{(END-START).days} dias)")

    # ---- FRED: Brent & DEXUSUK ----
    brent = fetch_fred_series_strict("DCOILBRENTEU", START, END, FRED_API_KEY)  # Brent
    dex = fetch_fred_series_strict("DEXUSUK", START, END, FRED_API_KEY)         # USD/GBP
    gbpusd = dex.assign(close=lambda d: 1.0 / d["close"]).copy()                # GBP/USD

    # ---- CoinGecko: BTC/USD ----
    btc = fetch_coingecko_btc_strict(START, END, "usd")

    # ---- Valida√ß√£o ----
    for name, df in [("BRENT", brent), ("GBPUSD", gbpusd), ("BTCUSD", btc)]:
        ensure_min_6_months(df, START, END)
        print(f"[OK] {name}: {df['date'].min()} ‚Üí {df['date'].max()} | {len(df)} linhas")

    # ---- Consolida para salvar ----
    prices = pd.concat(
        [
            brent.assign(instr="BRENT"),
            gbpusd.assign(instr="GBPUSD"),
            btc.assign(instr="BTCUSD"),
        ],
        ignore_index=True,
    ).sort_values(["instr", "date"])

    print(prices.groupby("instr").head(3))
    print(prices.groupby("instr").tail(3))

Coletando de 2025-04-04 at√© 2025-10-31 (~210 dias)
[OK] BRENT: 2025-04-04 ‚Üí 2025-10-31 | 211 linhas
[OK] GBPUSD: 2025-04-04 ‚Üí 2025-10-31 | 211 linhas
[OK] BTCUSD: 2025-04-04 ‚Üí 2025-10-31 | 211 linhas
           date         close   instr
0    2025-04-04     68.360000   BRENT
1    2025-04-05     66.130000   BRENT
2    2025-04-06     66.130000   BRENT
422  2025-04-04  83163.987574  BTCUSD
423  2025-04-05  83852.007654  BTCUSD
424  2025-04-06  83595.885502  BTCUSD
211  2025-04-04      0.773575  GBPUSD
212  2025-04-05      0.785608  GBPUSD
213  2025-04-06      0.785608  GBPUSD
           date          close   instr
208  2025-10-29      65.520000   BRENT
209  2025-10-30      65.520000   BRENT
210  2025-10-31      65.520000   BRENT
630  2025-10-29  112950.348633  BTCUSD
631  2025-10-30  110046.669258  BTCUSD
632  2025-10-31  109553.033725  BTCUSD
419  2025-10-29       0.751993  GBPUSD
420  2025-10-30       0.751993  GBPUSD
421  2025-10-31       0.751993  GBPUSD


### Persist√™ncia

In [35]:
if isinstance(getattr(prices.index, "dtype", None), pd.PeriodDtype):
    prices = prices.copy()
    prices.index = prices.index.to_timestamp()           # para Timestamp
    prices = prices.reset_index().rename(columns={"index":"date"})

for c in prices.columns:
    if isinstance(prices[c].dtype, pd.PeriodDtype):
        prices[c] = prices[c].dt.to_timestamp()

prices["date"] = pd.to_datetime(prices["date"]).dt.date   # date puro
prices["instr"] = prices["instr"].astype(str)
for c in ["close","r1","r3","r5"]:
    if c in prices:
        prices[c] = pd.to_numeric(prices[c], errors="coerce").astype("float64")

In [36]:
# salva em parquet (opcional)
prices.to_parquet("prices.parquet", index=False)

# conecta ao banco local
con = duckdb.connect(DUCKDB_PATH)

# cria a tabela se n√£o existir
con.execute("""
DROP TABLE IF EXISTS prices;
CREATE TABLE IF NOT EXISTS prices (
    date DATE,
    close DOUBLE,
    instr VARCHAR
);
""")

# registra o DataFrame do pandas como uma "view" tempor√°ria
con.register("tmp_prices", prices)

# insere os dados na tabela
con.execute("""
INSERT INTO prices
SELECT * FROM tmp_prices;
""")

# confirma e fecha
con.close()
print("‚úÖ Dados salvos na tabela 'prices' do banco market.duckdb")

‚úÖ Dados salvos na tabela 'prices' do banco market.duckdb


### Verifica√ß√£o

In [37]:
con = duckdb.connect(DUCKDB_PATH)

con.execute("select instr, min(date) as min_date, max(date) as max_date, count(*) as n_rows from prices group by instr").df()

Unnamed: 0,instr,min_date,max_date,n_rows
0,BRENT,2025-04-04,2025-10-31,211
1,GBPUSD,2025-04-04,2025-10-31,211
2,BTCUSD,2025-04-04,2025-10-31,211


## Tabela intrumentos

In [38]:
# --- 3¬™ tabela: instruments ---
con.execute("""
CREATE TABLE IF NOT EXISTS instruments (
    instr_id VARCHAR PRIMARY KEY,
    symbol   VARCHAR,
    name     VARCHAR,
    class    VARCHAR
);
""")
con.register("tmp_instr", pd.DataFrame([
    {"instr_id":"BRENT",  "symbol":"DCOILBRENTEU",      "name":"Brent (FRED)",                  "class":"commodity"},
    {"instr_id":"GBPUSD", "symbol":"DEXUSUK (inverted)","name":"GBP/USD (from DEXUSUK ‚Äì FRED)", "class":"fx"},
    {"instr_id":"BTCUSD", "symbol":"CoinGecko BTC/USD", "name":"Bitcoin (CoinGecko)",           "class":"crypto"},
]))
con.execute("DELETE FROM instruments WHERE instr_id IN (SELECT instr_id FROM tmp_instr)")
con.execute("INSERT INTO instruments SELECT * FROM tmp_instr")

print(con.execute("SELECT * FROM instruments").df())
con.close()

  instr_id              symbol                           name      class
0    BRENT        DCOILBRENTEU                   Brent (FRED)  commodity
1   GBPUSD  DEXUSUK (inverted)  GBP/USD (from DEXUSUK ‚Äì FRED)         fx
2   BTCUSD   CoinGecko BTC/USD            Bitcoin (CoinGecko)     crypto


## Banco

In [39]:
con = duckdb.connect(DUCKDB_PATH)

# lista todas as tabelas
print(con.execute("SHOW TABLES").df())

# mostra o esquema completo (colunas e tipos)
print(con.execute("DESCRIBE prices").df())
print(con.execute("DESCRIBE news_bbc").df())
print(con.execute("DESCRIBE instruments").df())

con.close()

          name
0  instruments
1     news_bbc
2       prices
  column_name column_type null   key default extra
0        date        DATE  YES  None    None  None
1       close      DOUBLE  YES  None    None  None
2       instr     VARCHAR  YES  None    None  None
    column_name column_type null   key default extra
0         title     VARCHAR  YES  None    None  None
1           url     VARCHAR  YES  None    None  None
2       summary     VARCHAR  YES  None    None  None
3  collected_at   TIMESTAMP  YES  None    None  None
  column_name column_type null   key default extra
0    instr_id     VARCHAR   NO   PRI    None  None
1      symbol     VARCHAR  YES  None    None  None
2        name     VARCHAR  YES  None    None  None
3       class     VARCHAR  YES  None    None  None
