# 1. Aquisição e tratamento dos dados

## Importar bibliotecas

In [48]:
! pip install xlrd
! pip install openpyxl
! pip install pydantic

Collecting openpyxl
  Using cached openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
Collecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9


In [42]:
import pandas as pd
import os
import yfinance as yf

from pydantic import BaseModel
from typing import Optional, List

In [None]:
class AquisicaoDadosFundamentos(BaseModel):
    """
    Loads per-company fundamentals from the Excel workbooks found in ``balancos_dir``.

    The company code is taken from the file name by stripping ``balanco_`` and
    ``.xls``. For each workbook, sheet 0 and sheet 1 are read (presumably the
    balance sheet and the income statement/DRE, judging by the method names)
    and stacked into a single DataFrame stored in ``fund_by_code[code]``.
    """
    # Directory containing the workbooks.
    balancos_dir: Optional[str] = "../dados/balancos/"
    # Maps company code -> DataFrame of fundamentals (filled by `run`).
    fund_by_code: Optional[dict] = {}
    # Company codes derived from the file names (filled by `__init__`).
    codes: List[str] = []

    def __init__(self, **kwargs):
        """Validate the fields, then fill ``codes`` by listing ``balancos_dir``."""
        super().__init__(**kwargs)
        self.codes = self.get_code_list()

    def run(self):
        """Load sheet 0 and then sheet 1 of every workbook; return ``fund_by_code``."""
        self.get_balancos_by_code()
        self.get_dre_by_code()
        return self.fund_by_code

    def _list_balanco_files(self) -> List[str]:
        """List the ``.xls`` workbooks of ``balancos_dir``, skipping stray entries
        (e.g. ``.ipynb_checkpoints``) that would otherwise break ``pd.read_excel``."""
        return [file for file in os.listdir(self.balancos_dir) if file.endswith(".xls")]

    def _code_from_file(self, file: str) -> str:
        """Derive the company code from a workbook file name."""
        return file.replace("balanco_", "").replace(".xls", "")

    def _read_sheet(self, file: str, code: str, sheet_name: int) -> pd.DataFrame:
        """Read one sheet and reshape it so that it is indexed by its first column."""
        sheet = pd.read_excel(os.path.join(self.balancos_dir, file), sheet_name=sheet_name)
        # Put the company code in cell (0, 0): once row 0 becomes the header,
        # the first column is named after the code.
        sheet.iloc[0, 0] = code
        # Promote row 0 to header and drop it from the data.
        sheet.columns = sheet.iloc[0]
        sheet = sheet[1:]
        # The first column (now named `code`) becomes the index.
        return sheet.set_index(code)

    def get_code_list(self):
        """Return the company codes of all workbooks in ``balancos_dir``."""
        return [self._code_from_file(file) for file in self._list_balanco_files()]

    def get_balancos_by_code(self) -> None:
        """Store sheet 0 of every workbook in ``fund_by_code[code]`` (printing each code)."""
        for file in self._list_balanco_files():
            code = self._code_from_file(file)
            print(code)
            self.fund_by_code[code] = self._read_sheet(file, code, sheet_name=0)

    def get_dre_by_code(self) -> None:
        """Append the rows of sheet 1 of every workbook to ``fund_by_code[code]``."""
        for file in self._list_balanco_files():
            code = self._code_from_file(file)
            dre = self._read_sheet(file, code, sheet_name=1)
            balanco = self.fund_by_code.get(code)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
            # If sheet 0 was not loaded yet, keep the sheet-1 rows alone instead of
            # raising KeyError.
            frames = [dre] if balanco is None else [balanco, dre]
            self.fund_by_code[code] = pd.concat(frames)

    


# Build the loader (its constructor lists `balancos_dir` to fill `codes`)
# and load the fundamentals of every company into a dict keyed by code.
ad = AquisicaoDadosFundamentos()
fundamentos_by_code = ad.run()


In [54]:
class AquisicaoDadosCotacoes(BaseModel):
    """
    Loads historical quotes from a single Excel file and splits them per company.

    ``run`` returns a dict mapping each value of the ``Empresa`` column to the
    rows of that company, after dropping the companies listed in ``codes``
    whose rows contain any null value.
    """
    # Excel file holding the quotes of all companies.
    cotacoes_file: Optional[str] = "../dados/cotacoes/Cotacoes.xlsx"
    # Company codes to check for nulls; replaced by the surviving codes afterwards.
    codes: List[str] = []
    # Maps company code -> DataFrame with that company's quote rows.
    cotacoes: Optional[dict] = {}

    def run(self):
        """Load the quotes, drop companies with nulls and return ``cotacoes``."""
        self.get_cotacoes_from_excel()
        self.tratar_dados_nulos()
        return self.cotacoes

    def get_cotacoes_from_excel(self) -> None:
        """
        Read the spreadsheet with the historical quotes of the IBOV stocks and
        store one DataFrame per distinct value of the 'Empresa' column.
        Prints how many companies were loaded.
        """
        cotacoes_df = pd.read_excel(self.cotacoes_file)

        for empresa in cotacoes_df['Empresa'].unique():
            self.cotacoes[empresa] = cotacoes_df.loc[cotacoes_df['Empresa']==empresa, :]

        print(len(self.cotacoes))

    def tratar_dados_nulos(self) -> None:
        """
        Remove the companies in ``codes`` whose quotes contain null values, then
        reset ``codes`` to the companies still present in ``cotacoes`` and print
        how many remain.
        """
        for empresa in self.codes:
            # A code may have no quotes at all; skip it instead of raising KeyError.
            cotacao = self.cotacoes.get(empresa)
            if cotacao is not None and cotacao.isnull().values.any():
                self.cotacoes.pop(empresa)
        self.codes = list(self.cotacoes.keys())
        print(len(self.codes))

# Load the quotes, checking the codes found in the fundamentals for null values.
ad_cotacoes = AquisicaoDadosCotacoes(codes=ad.codes)
cotacoes_by_code = ad_cotacoes.run()


77
65


In [61]:
# Remove the fundamentals of every company that has no entry in cotacoes_by_code.
# A plain set difference is used: the symmetric difference (^) would also include
# codes that exist only in cotacoes_by_code, and popping those from
# fundamentos_by_code raises KeyError.
codes_to_be_removed_from_fund = list(set(fundamentos_by_code.keys()) - set(cotacoes_by_code.keys()))
for code in codes_to_be_removed_from_fund:
    fundamentos_by_code.pop(code)

# Report success only when both dicts ended up with exactly the same codes.
if cotacoes_by_code.keys() == fundamentos_by_code.keys():
    print("Fundamentos com cotacoes com itens nulos removidos com sucesso")
    print(len(fundamentos_by_code.keys()))


Fundamentos com cotacoes com itens nulos removidos com sucesso
65


In [66]:
def juntar_fundamentos_com_cotacoes():
    """
    Reshape each fundamentals table and attach the quote of each reporting date.

    For every company in ``fundamentos_by_code`` the table is transposed (so the
    dates become the index), the index is parsed as ``dd/mm/YYYY`` dates, and the
    "Adj Close" column of ``cotacoes_by_code[code]`` is joined on matching dates
    (inner join on the index). The result replaces the entry in
    ``fundamentos_by_code``. Tables that already have an "Adj Close" column are
    left untouched and a message is printed instead.
    """
    for code in fundamentos_by_code.keys():
        fundamentos = fundamentos_by_code[code]

        # Guard against running the treatment twice on the same table.
        if "Adj Close" in fundamentos.columns:
            print("Tratamento ja executado")
            continue

        # Dates go from column labels to a datetime index.
        por_data = fundamentos.T
        por_data.index = pd.to_datetime(por_data.index, format="%d/%m/%Y")

        # Quotes indexed by date, keeping only the adjusted close.
        fechamento = cotacoes_by_code[code].set_index("Date")[["Adj Close"]]

        # Keep only the dates present in both tables.
        combinado = por_data.merge(fechamento, left_index=True, right_index=True)
        combinado.index.name = code

        fundamentos_by_code[code] = combinado

# Attach the quotes to every fundamentals table (updates fundamentos_by_code).
juntar_fundamentos_com_cotacoes()

### Ajeitando colunas

In [73]:
# PETR4's columns are used as the reference column set.
columns = list(fundamentos_by_code["PETR4"].columns)

def remover_empresa_colunas_diff(columns_ref: List[str]):
    """
    Drop from ``fundamentos_by_code`` every company whose set of columns differs
    from the reference set, then print how many companies remain.

    Args:
        columns_ref: column labels of the reference stock; compared as a set,
            so column order is irrelevant.
    """
    colunas_esperadas = set(columns_ref)

    # Collect first, delete afterwards: the dict cannot change size while iterated.
    divergentes = [
        code
        for code, tabela in fundamentos_by_code.items()
        if set(tabela.columns) != colunas_esperadas
    ]
    for code in divergentes:
        del fundamentos_by_code[code]

    print(len(fundamentos_by_code))

# Keep only the companies whose columns match the reference set.
remover_empresa_colunas_diff(columns)

61


In [95]:
def select_columns_for_calc(codes):
    """
    Reduce each company's table to the columns needed for the calculations.

    For every code in ``codes``, ``fundamentos_by_code[code]`` is replaced by a
    frame with the five selected columns renamed to short identifiers; every
    column except the last one (``cotacao``) is converted to ``float``.

    Args:
        codes: iterable of company codes present in ``fundamentos_by_code``.
    """
    # Source column -> short name. Order matters: the quote column stays last.
    novos_nomes = {
        "Ativo Total": "ativo_total",
        "Patrimônio Líquido": "patrimonio_liq",
        "Receita Líquida de Vendas e/ou Serviços": "receita_liq",
        "Lucro/Prejuízo do Período": "lucro_op",
        "Adj Close": "cotacao",
    }
    colunas_para_calculos = list(novos_nomes)
    colunas_numericas = [novos_nomes[coluna] for coluna in colunas_para_calculos[:-1]]

    for code in codes:
        df_c = fundamentos_by_code[code][colunas_para_calculos].rename(columns=novos_nomes)
        # astype returns a new frame with real float columns. Assigning through
        # `df.iloc[:, :-1] = ...` writes in place on recent pandas and leaves
        # object columns as object.
        df_c = df_c.astype({coluna: float for coluna in colunas_numericas})
        fundamentos_by_code[code] = df_c

# Apply the column selection to every company still in fundamentos_by_code.
codes = fundamentos_by_code.keys()
select_columns_for_calc(codes)

In [102]:
def calcular_dupont():
    """
    Compute the DuPont-style ratios for every company in ``fundamentos_by_code``.

    The ratios are derived from the columns ``ativo_total``, ``patrimonio_liq``,
    ``receita_liq`` and ``lucro_op``:

    - AF  = ativo_total / patrimonio_liq
    - RA  = lucro_op / ativo_total
    - GA  = receita_liq / ativo_total
    - ML  = lucro_op / receita_liq
    - RPL = AF * RA
    - ROE = lucro_op / patrimonio_liq
    - ROA = GA * ML

    The ratio columns are added to the stored table, and the dict entry is then
    replaced by a frame holding only the seven ratios plus ``cotacao``.
    """
    for code in fundamentos_by_code.keys():
        tabela = fundamentos_by_code[code]
        ativo, patrimonio = tabela["ativo_total"], tabela["patrimonio_liq"]
        receita, lucro = tabela["receita_liq"], tabela["lucro_op"]

        alavancagem = ativo / patrimonio
        retorno_ativo = lucro / ativo
        giro = receita / ativo
        margem = lucro / receita

        # Insertion order defines the column order of the result.
        indicadores = {
            "RPL": alavancagem * retorno_ativo,
            "ROE": lucro / patrimonio,
            "ROA": giro * margem,
            "AF": alavancagem,
            "RA": retorno_ativo,
            "GA": giro,
            "ML": margem,
        }
        for nome, serie in indicadores.items():
            tabela[nome] = serie

        fundamentos_by_code[code] = tabela[list(indicadores) + ["cotacao"]]

# Replace every table by its ratio columns plus the quote (updates fundamentos_by_code).
calcular_dupont()

In [117]:

# Audit missing values: count nulls per column across all companies and
# collect the companies that have at least one null.
columns = fundamentos_by_code["PETR4"].columns

valores_vazios = dict.fromkeys(columns, 0)  # column -> total number of nulls
total_linhas = 0                            # total number of rows over all companies
empresa_with_null = []                      # may hold a company once per affected column

for empresa in fundamentos_by_code:
    tabela = fundamentos_by_code[empresa]
    total_linhas += tabela.shape[0]
    for coluna in columns:
        qtde_vazios = pd.isnull(tabela[coluna]).sum()
        if qtde_vazios > 0:
            empresa_with_null.append(empresa)
        valores_vazios[coluna] += qtde_vazios

print(valores_vazios)
print(total_linhas)
print(set(empresa_with_null))  # set() removes the repeated entries

# Remove the companies with null values, except ABEV3, which is kept because
# its null is in the last row.
for empresa in set(empresa_with_null):
    if empresa != "ABEV3":
        fundamentos_by_code.pop(empresa)

print(len(fundamentos_by_code))

{'RPL': 3, 'ROE': 3, 'ROA': 35, 'AF': 1, 'RA': 3, 'GA': 3, 'ML': 3, 'cotacao': 0}
2004
{'BRAP4', 'ABEV3', 'CSAN3', 'RADL3'}
58


In [None]:
# Snapshot of the codes that survived the cleaning steps so far.
codes = list(fundamentos_by_code.keys())
codes

## Criação de rótulos - coluna target

In [132]:
def get_ibov_from_yahoo(data_inicial: str = "2012-12-20", data_final: str = "2021-09-20"):
    """
    Download the history of the ^BVSP ticker from Yahoo Finance.

    Args:
        data_inicial: value passed as ``start`` to ``yf.download``.
        data_final: value passed as ``end`` to ``yf.download``.

    Returns:
        The DataFrame returned by ``yf.download``, with single-level column
        labels (including "Adj Close").
    """
    # NOTE(review): recent yfinance releases default to auto_adjust=True, which
    # drops the "Adj Close" column that the later cells rely on. Request it explicitly.
    ibov = yf.download("^BVSP", start=data_inicial, end=data_final, auto_adjust=False)

    # NOTE(review): recent yfinance releases may return (field, ticker) MultiIndex
    # columns even for a single ticker. Keep only the field level so that
    # "Adj Close" remains a plain column label.
    if isinstance(ibov.columns, pd.MultiIndex):
        ibov.columns = ibov.columns.get_level_values(0)

    return ibov

# Download the IBOV (^BVSP) history; requires network access.
ibov = get_ibov_from_yahoo()

[*********************100%***********************]  1 of 1 completed


In [133]:
import numpy as np

In [None]:
def get_ibov_by_fundamentos_dates(ibov):
    """
    Add an ``IBOV`` column to every table in ``fundamentos_by_code``.

    The index of the PETR4 table is used as the set of reporting dates. Dates
    missing from ``ibov`` are inserted and filled with the last previous known
    value (forward fill). The "Adj Close" column is then merged into each table
    as ``IBOV`` (inner join on the index).

    Args:
        ibov: DataFrame indexed by date with an "Adj Close" column. It is not
            modified.
    """
    datas_fundamentos = fundamentos_by_code["PETR4"].index

    # Reindex on the (sorted) union of both date sets instead of enlarging the
    # caller's frame row by row; the new dates start as NaN and are forward-filled.
    todas_as_datas = ibov.index.union(datas_fundamentos).rename(ibov.index.name)
    ibov_por_data = (
        ibov.reindex(todas_as_datas)
        .ffill()
        .rename(columns={"Adj Close": "IBOV"})
    )

    for code in fundamentos_by_code:
        tabela = fundamentos_by_code[code]
        # Already merged (e.g. the cell was re-run): merging again would create
        # IBOV_x / IBOV_y columns.
        if "IBOV" in tabela.columns:
            continue
        fundamentos_by_code[code] = tabela.merge(
            ibov_por_data[["IBOV"]], left_index=True, right_index=True
        )

# Attach the IBOV column to every company's table (updates fundamentos_by_code).
get_ibov_by_fundamentos_dates(ibov)

# Number of companies, followed by a sample table for visual inspection.
print(len(fundamentos_by_code))
fundamentos_by_code["PETR4"]

## Criar coluna resultado baseada em cotação e IBOV

- Cotação e IBOV: considerar a variação até o próximo trimestre
- Comprar (1): variação da cotação acima da variação do IBOV (`resultado > 0`; o código não exige a margem de 2% na compra)
- Vender (0): todos os demais casos, incluindo variação da cotação mais de 2% abaixo da variação do IBOV (`resultado < -0.02`)

In [140]:
def criar_coluna_decisao(limiar_compra: float = 0.0, limiar_venda: float = -0.02):
    """
    Add the label column ``decisao`` to every table in ``fundamentos_by_code``.

    For each company (rows sorted by date) the following columns are added:

    - ``cotacao_var``: change of ``cotacao`` from the current row to the next one
    - ``IBOV_var``: change of ``IBOV`` from the current row to the next one
    - ``resultado``: ``cotacao_var - IBOV_var``
    - ``decisao``: 1 when ``resultado > limiar_compra``; 0 otherwise

    Args:
        limiar_compra: ``resultado`` must be strictly above this value to be
            labelled 1. The default (0.0) keeps the original behaviour.
        limiar_venda: ``resultado`` strictly below this value is labelled 0.
            The default (-0.02) keeps the original behaviour.

    Rows that match neither condition — values between the two thresholds, and
    the last row, whose ``resultado`` is NaN because there is no next row — also
    receive 0, the fall-through value of ``np.select``.

    NOTE(review): the default thresholds are asymmetric (0 vs -0.02), and rows
    between them are indistinguishable from "sell" rows; confirm this is intended.
    """
    for empresa in fundamentos_by_code:
        # sort_index returns a new frame, so the stored table is not modified in place.
        df = fundamentos_by_code[empresa].sort_index()

        # shift(-1) aligns each row with the value of the following row.
        df["cotacao_var"] = df["cotacao"].shift(-1) / df["cotacao"] - 1
        df["IBOV_var"] = df["IBOV"].shift(-1) / df["IBOV"] - 1
        df["resultado"] = df["cotacao_var"] - df["IBOV_var"]

        condicoes = [
            df["resultado"] > limiar_compra,
            df["resultado"] < limiar_venda,
        ]
        valores = [1, 0]

        # default=0 is np.select's implicit fall-through value, written out explicitly.
        df["decisao"] = np.select(condicoes, valores, default=0)

        fundamentos_by_code[empresa] = df

# Add the variation, result and label columns to every table (updates fundamentos_by_code).
criar_coluna_decisao()



In [141]:
# Inspect one company's table, now including the label column.
fundamentos_by_code["PETR4"]

Unnamed: 0,RPL,ROE,ROA,AF,RA,GA,ML,cotacao,IBOV,cotacao_var,IBOV_var,resultado,decisao
2012-12-31,0.022582,0.022582,0.011432,1.97539,0.011432,0.108313,0.105545,16.124966,60952.0,-0.056352,-0.075469,0.019117,1
2013-03-31,0.022935,0.022935,0.011064,2.072856,0.011064,0.10432,0.106061,15.216284,56352.0,-0.105504,-0.157847,0.052343,1
2013-06-30,0.018332,0.018332,0.008279,2.214321,0.008279,0.098296,0.084223,13.610906,47457.0,0.135436,0.102851,0.032585,1
2013-09-30,0.009946,0.009946,0.004476,2.222109,0.004476,0.102449,0.043689,15.454315,52338.0,-0.069717,-0.015878,-0.053839,0
2013-12-31,0.018053,0.018053,0.008342,2.164071,0.008342,0.107612,0.077523,14.376887,51507.0,-0.076112,-0.021201,-0.054911,0
2014-03-31,0.015213,0.015213,0.006724,2.262442,0.006724,0.101675,0.066134,13.282627,50415.0,0.168312,0.054607,0.113706,1
2014-06-30,0.013748,0.013748,0.006196,2.218935,0.006196,0.102825,0.060257,15.518258,53168.0,0.046269,0.01783,0.028439,1
2014-09-30,-0.015642,-0.015642,-0.006545,2.389925,-0.006545,0.10834,-0.060412,16.236279,54116.0,-0.44555,-0.075929,-0.36962,0
2014-12-31,-0.086126,-0.086126,-0.033527,2.56882,-0.033527,0.107188,-0.312792,9.002206,50007.0,-0.036889,0.022857,-0.059746,0
2015-03-31,0.017547,0.017547,0.006407,2.738851,0.006407,0.089372,0.071685,8.67012,51150.0,0.313664,0.037752,0.275913,1


In [142]:
# Drop the columns that were only needed to build the label, keeping the ratios
# and `decisao`. errors="ignore" makes the cell safe to re-run: without it, a
# second execution raises KeyError because the columns are already gone.
for empresa in fundamentos_by_code:
    fundamentos_by_code[empresa] = fundamentos_by_code[empresa].drop(
        ["cotacao", "cotacao_var", "IBOV", "IBOV_var", "resultado"], axis=1, errors="ignore"
    )
print(fundamentos_by_code["PETR4"].shape)

(33, 8)


In [144]:
# Shallow copy: a new dict that references the same DataFrame objects.
copia_fund = fundamentos_by_code.copy()

In [None]:
# Stack the rows of every company into a single table.
# Each table loses its first and last rows and its date index before stacking.
# NOTE(review): the last row has no following quarter to compare against; the
# reason for dropping the first row is not visible here — confirm.
frames = []
for code in copia_fund:
    copia_fund[code] = copia_fund[code][1:-1].reset_index(drop=True)
    frames.append(copia_fund[code])

# DataFrame.append was removed in pandas 2.0 and copied the data on every
# iteration; a single pd.concat builds the same table in one pass.
bd = pd.concat(frames) if frames else pd.DataFrame()


In [150]:
# Replace the repeated per-company row numbers with one continuous index, then display.
bd.reset_index(drop=True, inplace=True)
bd

Unnamed: 0,RPL,ROE,ROA,AF,RA,GA,ML,decisao
0,-0.105253,-0.105253,-0.016222,6.488153,-0.016222,0.172112,-0.094254,0
1,-0.104673,-0.104673,-0.018332,5.709798,-0.018332,0.275444,-0.066555,0
2,-0.027526,-0.027526,-0.004677,5.884993,-0.004677,0.164357,-0.028459,1
3,-0.032876,-0.032876,-0.005308,6.193049,-0.005308,0.263596,-0.020139,1
4,-0.018787,-0.018787,-0.002987,6.288936,-0.002987,0.277509,-0.010765,1
...,...,...,...,...,...,...,...,...
1669,0.030807,0.030807,0.009171,3.359034,0.009171,0.132155,0.069398,1
1670,0.031875,0.031875,0.009289,3.431660,0.009289,0.140275,0.066217,1
1671,0.024589,0.024589,0.008690,2.829619,0.008690,0.114133,0.076137,0
1672,0.020095,0.020095,0.007375,2.724519,0.007375,0.114956,0.064159,1


## Salvar dataframe em csv

In [151]:
# Write the final table to CSV; with the default arguments the index is written as the first column.
bd.to_csv("../out/database_dupont.csv")

## Salvar dataframe em joblib

In [155]:
! pip install joblib
import joblib

joblib.dump(bd, "../out/database_dupont.joblib")



['../out/database_dupont.joblib']