In [None]:
import re
import warnings
from io import StringIO

import httpx
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# Suppress the FutureWarning whose message starts with
# "Passing literal html to 'read_html'" (the deprecation notice about handing
# read_html raw HTML text instead of a file-like object).
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="Passing literal html to 'read_html'"
)

In [None]:
# Load the company list, keeping only the ticker symbol and company name columns.
all_companies = pd.read_csv('./dados/buyers/tickets.csv')[['Ticker', 'Nome']]

In [None]:
def get_company_data(ticker: str) -> pd.DataFrame:
    """Scrape one company's page on dadosdemercado.com.br into a long-format table.

    The page at ``/acoes/<ticker>`` is downloaded, every ``<table>`` found inside a
    ``div.table-container`` is parsed (Brazilian number format) and handed to
    ``consolidar_tabelas``. A single extra row with ``relatorio == 'Site'`` carries
    the link found next to the "Site do RI" label.

    Args:
        ticker: Ticker symbol; it is lower-cased for the URL only and stored
            as given in the ``ticker`` column.

    Returns:
        DataFrame with columns ``nome, ticker, relatorio, data, nome_coluna,
        valor``: the "Site" row first, followed by the consolidated tables
        (if the page had any).

    Raises:
        httpx.HTTPStatusError: if the server answers with a 4xx/5xx status.
        httpx.HTTPError: on transport-level failures.
    """
    resp = httpx.get(f'https://www.dadosdemercado.com.br/acoes/{ticker.lower()}')
    # Fail with a clear HTTP error instead of an obscure parsing error on an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'html')

    # The company name is the parenthesised part of the <h1>; fall back to the
    # whole heading (or the ticker) when the heading has an unexpected shape.
    heading = soup.find('h1')
    heading_text = heading.text if heading is not None else ''
    name_match = re.search(r'\(([^()]+)\)', heading_text)
    name = name_match.group(1) if name_match else (heading_text.strip() or ticker)

    # Link shown next to the "Site do RI" label, when present.
    site_div = soup.select_one('div:has(> span:-soup-contains("Site do RI"))')
    link = site_div.find("a") if site_div is not None else None
    href = link.get("href") if link is not None else None
    # NOTE(review): replace() removes *every* "ri." occurrence, not only a leading
    # "ri." sub-domain, so unrelated hosts containing "ri." are altered too - confirm intent.
    url_ri = href.replace('ri.', '') if href else None

    tabelas = {}
    for div in soup.find_all("div", class_="table-container"):
        tabela_bs = div.find("table")
        if tabela_bs is None:
            continue
        table_id = div.get("id") or f"table_{len(tabelas)+1}"
        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        tabelas[table_id] = pd.read_html(
            StringIO(str(tabela_bs)),
            decimal=',',
            thousands='.',
            header=0,
            flavor="bs4",
        )[0]

    row = {
        'nome': name,
        'ticker': ticker,
        'relatorio': 'Site',
        'data': '2025',  # hard-coded reference year for the site row
        'nome_coluna': 'Site do RI',
        'valor': url_ri,
    }

    frames = [pd.DataFrame([row])]
    # Keep the "Site" row even when the page has no usable table.
    if tabelas:
        df_consolidado = consolidar_tabelas(tabelas, ticker, name)
        if not df_consolidado.empty:
            frames.append(df_consolidado)

    return pd.concat(frames, ignore_index=True)


# Magnitude suffixes accepted by parse_valor and the multiplier each one applies.
UNIDADES = {"B": 1e9, "M": 1e6, "mi": 1e6, "Mil": 1e3}


def _br_to_float(txt: str) -> float:
    """Convert Brazilian-formatted text ('1.234,5') to float; raises ValueError."""
    return float(txt.replace(".", "").replace(",", "."))


def parse_valor(x, *, pct_as: str = "frac"):
    """Parse a scraped table cell into a float.

    Handles, in order: values ``float()`` already accepts; blank markers
    (``""``, ``-``, ``N/A`` ...); accounting negatives in parentheses;
    percentages; magnitude suffixes from ``UNIDADES``; plain Brazilian
    numbers (``.`` thousands, ``,`` decimal); and finally the first numeric
    run found inside free text.

    Args:
        x: Cell value (number, string, ``None`` or anything else).
        pct_as: ``"frac"`` returns percentages as fractions (``"5%"`` -> 0.05);
            any other value returns the percentage number itself (5.0).

    Returns:
        The parsed float, or ``np.nan`` when the value is missing or cannot be
        interpreted as a number.
    """
    if x is None:
        return np.nan

    # Numbers and strings that float() understands directly are returned as they are.
    try:
        return float(x)
    except (ValueError, TypeError):
        pass

    if not isinstance(x, str):
        return np.nan

    # "--" is treated as zero; U+2212 (typographic minus) is normalised to "-".
    txt = x.replace('--', '0').strip().replace("\u2212", "-")
    if txt in {"", "-", "—", "–", "N/A", "NA"}:
        return np.nan

    # Accounting notation: "(123)" means -123.
    is_neg = txt.startswith("(") and txt.endswith(")")
    if is_neg:
        txt = txt[1:-1].strip()

    if txt.endswith("%"):
        try:
            num = _br_to_float(txt.rstrip("%"))
        except ValueError:
            print(f"Erro ao converter porcentagem: {txt}")
            return np.nan
        # Work on the magnitude so a leading "-" is applied once, not twice.
        magnitude = abs(num) / 100.0 if pct_as == "frac" else abs(num)
        return -magnitude if (is_neg or "-" in txt) else magnitude

    sinal = -1 if (is_neg or txt.startswith("-")) else 1
    txt = txt.lstrip("+-")

    for suf, mult in UNIDADES.items():
        if txt.endswith(suf):
            try:
                return sinal * _br_to_float(txt[:-len(suf)]) * mult
            except ValueError:
                # Text that merely ends with a suffix letter: use the generic fallback.
                break

    try:
        return sinal * _br_to_float(txt)
    except ValueError:
        pass

    # Last resort: take the first numeric run embedded in the text.
    m = re.search(r"[-+]?\d[\d.,]*", txt)
    if m:
        try:
            return sinal * _br_to_float(m.group())
        except ValueError:
            return np.nan
    return np.nan

def melt_tabela(df: pd.DataFrame, relatorio: str, ticker: str, nome: str, col_idx_nome: int = 0) -> pd.DataFrame:
    """Reshape a wide table into long format, one row per (label, column) pair.

    The index is first moved into a regular column. The column at position
    ``col_idx_nome`` is then used as the row label (``nome_coluna``); every
    other column header ends up in ``data`` and its cell in ``valor``.
    ``relatorio``, ``ticker`` and ``nome`` are repeated on every row.

    Returns:
        DataFrame with columns ``nome, ticker, relatorio, data, nome_coluna, valor``.
    """
    flat = df.reset_index(drop=False)
    label_col = flat.columns[col_idx_nome]

    melted = flat.melt(id_vars=[label_col], var_name="data", value_name="valor")
    melted = melted.rename(columns={label_col: "nome_coluna"})
    melted = melted.assign(relatorio=relatorio, ticker=ticker, nome=nome)

    ordem = ['nome', "ticker", "relatorio", "data", "nome_coluna", "valor"]
    return melted[ordem]

def consolidar_tabelas(tabelas: dict[str, pd.DataFrame], ticker: str, nome: str) -> pd.DataFrame:
    """Combine all scraped tables of one company into a single long-format frame.

    Every table except the one keyed ``"releases"`` is indexed by its ``Conta``
    column (when it has one), has each cell converted with ``parse_valor``, and
    is reshaped with ``melt_tabela``. The pieces are concatenated and sorted by
    ticker, period and report name.

    Args:
        tabelas: Mapping of report id -> wide DataFrame. The input frames are
            not modified.
        ticker: Ticker symbol written to every output row.
        nome: Company name written to every output row.

    Returns:
        DataFrame with columns ``nome, ticker, relatorio, data, nome_coluna,
        valor`` and a fresh 0..n-1 index; empty (same columns) when no table
        is usable.
    """
    colunas = ['nome', "ticker", "relatorio", "data", "nome_coluna", "valor"]
    registros = []

    for relatorio, df in tabelas.items():
        if relatorio == "releases":
            continue

        # set_index returns a new frame, so the caller's table keeps its index.
        if 'Conta' in df.columns:
            df = df.set_index('Conta')

        df = df.map(parse_valor)
        registros.append(melt_tabela(df, relatorio, ticker, nome))

    # pd.concat raises on an empty list; return an empty, correctly shaped frame.
    if not registros:
        return pd.DataFrame(columns=colunas)

    full_df = pd.concat(registros, ignore_index=True)

    # Sort key: drop parenthesised notes and spaces, and turn quarter labels
    # into a sortable form (1T2025 -> 2025Q1). astype(str) keeps this working
    # when column headers were parsed as non-string labels.
    data_ord = (full_df["data"].astype(str)
                .str.replace(r"\(.*\)", "", regex=True)
                .str.replace(" ", "")
                .map(lambda s: re.sub(r"(\d)T(\d{4})", r"\2Q\1", s)))
    full_df = (full_df.assign(data_ord=data_ord)
               .sort_values(["ticker", "data_ord", "relatorio"])
               .drop(columns="data_ord"))

    return full_df.reset_index(drop=True)


In [None]:
from tqdm import tqdm

# Scrape every listed company. A failure for one ticker is reported and
# skipped so a single bad page does not abort the whole run.
# A DataFrame has no .tolist(); the ticker symbols live in the 'Ticker' column.
tickers = all_companies['Ticker'].dropna().tolist()
results = []
for ticker in tqdm(tickers, desc="Processing companies"):
    try:
        results.append(get_company_data(ticker.lower()))
    except Exception as e:
        print(f"Error processing {ticker}: {e}")

# Persist the combined long-format table for the cells that follow.
df = pd.concat(results, ignore_index=True)
df.to_csv('./dados/buyers/companies_data.csv', index=False)

In [None]:
# Read back the CSV written by the scraping cell and display it.
buyers = pd.read_csv('./dados/buyers/companies_data.csv')
buyers

In [None]:
# Lower-case the Ticker column in place; the scraped rows store lower-case
# tickers, and the merge in a later cell joins on that column.
all_companies['Ticker'] = all_companies['Ticker'].str.lower()

In [None]:
# Attach the company name from the ticker list to every scraped row, then
# discard the duplicated join key.
buyers = (
    buyers
    .merge(all_companies, how='left', left_on='ticker', right_on='Ticker')
    .drop(columns=['Ticker'])
)

In [None]:
# Keep only the output columns, in their final order.
buyers = buyers.loc[:, ['Nome', 'ticker', 'relatorio', 'data', 'nome_coluna', 'valor']]

In [None]:
# Export the per-company "Site" rows.
buyers.loc[buyers['relatorio'].eq('Site')].to_csv('./dados/buyers/companies.csv', index=False)

In [None]:
# Export every row that is not a "Site" row.
buyers.loc[buyers['relatorio'].ne('Site')].to_csv('./dados/buyers/companies_financials.csv', index=False)