In [None]:
import pprint
import pdfplumber
import pandas as pd 
import numpy as np
from decimal import Decimal

from typing import List
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass

In [None]:
class Pdf():
    """Flat record of the values to be read from one PDF.

    Every field is declared as a class-level ``str`` attribute defaulting to
    ``""``; an instance overrides a value by plain attribute assignment.

    NOTE(review): the English glosses in the comments below are inferred from
    the Portuguese field names only -- confirm them against the bill layout.
    """

    # Identification -- presumably installation, customer and meter numbers.
    numero_instalacao: str = ""
    numero_cliente: str = ""
    numero_medidor: str = ""
    # Tariff classification -- presumably tariff group and tariff modality.
    grupo_tarifario: str = ""
    modalidade_tarifaria: str = ""
    # Presumably contracted demand; "ep"/"efp" look like peak/off-peak -- TODO confirm.
    dem_cont_ep: str = ""
    dem_cont_efp: str = ""
    # Presumably the invoiced amount for the free-market (ACL) portion -- TODO confirm.
    valor_fat_acl: str = ""
    # Presumably the PIS / COFINS / ICMS tax values -- TODO confirm format (with or without "%").
    pis: str = ""
    cofins: str = ""
    icms: str = ""
    # Per-period charges ("ep"/"efp" suffixes); exact meaning of each prefix is
    # not established anywhere in this file -- TODO confirm.
    usd_ep: str = ""
    usd_efp: str = ""
    ufer_ep: str = ""
    ufer_efp: str = ""
    ufdr_ep: str = ""
    ufdr_efp: str = ""
    # Other line items -- meaning not established in this file; TODO confirm.
    cip_cosip: str = ""
    subv_tar: str = ""
    cde_covid_ep: str = ""
    cde_covid_efp: str = ""
    fat_car_ep: str = ""
    fat_car_efp: str = ""

    def to_csv(self):
        """Placeholder for CSV export; not implemented yet, so it returns ``None``."""
        ...

In [None]:
class Converter(metaclass=ABCMeta):
    """Base class for converters that cut tables out of a PDF via explicit lines.

    The helpers below read two instance attributes that a subclass must set
    before calling them:

    * ``self.pdf``    -- the opened PDF. Only ``pdf.pages[i].extract_tables(...)``
      and ``pdf.pages[i].to_image(...)`` are used here.
    * ``self.tables`` -- mapping ``"<name><page>" -> (vertical, horizontal)``.
      The trailing digits of each key are the zero-based page index and the two
      lists are the explicit vertical / horizontal boundaries of that table.
    """

    # Kept for backward compatibility; the methods below read ``self.tables``.
    table = ...

    @staticmethod
    def _table_settings(vertical: list, horizontal: list) -> dict:
        """Build the table-finder settings shared by extraction and debugging."""
        return {
            "vertical_strategy": "explicit",
            "horizontal_strategy": "explicit",
            "snap_tolerance": 3,
            "explicit_vertical_lines": vertical,
            "explicit_horizontal_lines": horizontal,
        }

    @staticmethod
    def _page_index(table_name: str) -> int:
        """Return the page index encoded as the trailing digits of ``table_name``.

        Reading all trailing digits (not just the last character) keeps keys
        such as ``"totals_11"`` on page 11 instead of page 1.

        Raises:
            ValueError: if ``table_name`` does not end with at least one digit.
        """
        digits = table_name[len(table_name.rstrip("0123456789")):]
        if not digits:
            raise ValueError(
                f"table name {table_name!r} must end with its page index, e.g. 'header_0'"
            )
        return int(digits)

    @staticmethod
    def _line(top, bottom, x0, x1, height, width) -> dict:
        """Assemble the line dictionary returned by ``vline`` and ``hline``."""
        return {
            "top": Decimal(top),
            "bottom": Decimal(bottom),
            "x0": Decimal(x0),
            "x1": Decimal(x1),
            "height": Decimal(height),
            "width": Decimal(width),
            "object_type": "line"
        }

    def convert_table(self, vertical: list, horizontal: list, page: int):
        """Extract the first table found on ``page`` and return it as a DataFrame.

        Args:
            vertical: explicit vertical lines (numbers or line dicts).
            horizontal: explicit horizontal lines (numbers or line dicts).
            page: zero-based index into ``self.pdf.pages``.

        Returns:
            A ``pandas.DataFrame`` with rows that are entirely empty removed.
            Remaining empty cells are ``''`` and the original row labels are kept.

        Raises:
            IndexError: if ``page`` is out of range or no table is found.
        """
        found = self.pdf.pages[page].extract_tables(self._table_settings(vertical, horizontal))
        if not found:
            raise IndexError(f"no table found on page {page} with the given explicit lines")

        frame = pd.DataFrame(found[0])
        # '' -> NaN so dropna can discard fully empty rows, then NaN -> '' again.
        return frame.replace('', np.nan).dropna(how="all").replace(np.nan, '')

    def to_img(self, vertical: list, horizontal: list, page: int, resolution=120):
        """Render ``page`` with the table finder's view of the given lines drawn on it.

        Uses the same settings as ``convert_table``, so the picture shows the
        cells that extraction would use.
        """
        image = self.pdf.pages[page].to_image(resolution=resolution)
        return image.debug_tablefinder(self._table_settings(vertical, horizontal))

    def get_dataframe(self) -> List[List[pd.DataFrame]]:
        """Extract every table in ``self.tables``, grouped by page.

        Returns:
            A list with one entry per page of ``self.pdf``. Each entry is the
            list of DataFrames extracted from that page, in ``self.tables``
            iteration order (empty when no table is configured for the page).

        Raises:
            ValueError: if a key of ``self.tables`` does not end with a page index.
            IndexError: if a page index is out of range or a table is not found.
        """
        frames_by_page = [[] for _ in range(len(self.pdf.pages))]

        for name in self.tables:
            spec = self.tables[name]
            page = self._page_index(name)
            frames_by_page[page].append(self.convert_table(spec[0], spec[1], page))

        return frames_by_page

    def vline(self, x: float, top: float = 0, bottom: float = 900, *, height: float = 3, width: float = 3):
        """Describe a vertical line at ``x`` running from ``top`` to ``bottom``.

        Returns a line dictionary (``object_type == "line"``) whose values are
        ``Decimal``. ``height`` and ``width`` are stored as given; both default to 3.
        """
        return self._line(top, bottom, x, x, height, width)

    def hline(self, y: float, left: float = 0, right: float = 600, *, height: float = 3, width: float = 3):
        """Describe a horizontal line at ``y`` running from ``left`` to ``right``.

        Returns a line dictionary (``object_type == "line"``) whose values are
        ``Decimal``. ``height`` and ``width`` are stored as given; both default to 3.
        """
        return self._line(y, y, left, right, height, width)

    @abstractmethod
    def to_dataclass(self):
        """Convert the extracted tables into the subclass's result object."""
        ...

In [36]:
class EnelConverter(Converter):
    """Converter for ENEL PDFs: opens the file and declares where its tables sit.

    Intentionally a plain class rather than a ``@dataclass``: it declares no
    fields, so a generated ``__eq__`` would make every instance compare equal
    and would disable hashing.

    Can be used as a context manager so the PDF is closed on exit::

        with EnelConverter(path) as enel:
            tables = enel.to_dataclass()
    """

    # TODO: tidy up __init__
    def __init__(self, file_path: str):
        """Open ``file_path`` and register the explicit lines of each table.

        Each key of ``self.tables`` ends with the zero-based page index; each
        value is ``(vertical_lines, horizontal_lines)``. Entries may be plain
        coordinates or partial-length lines built with ``self.vline``.
        """
        self.pdf = pdfplumber.open(file_path)

        self.tables = {
            "dados_cliente_0": (
                [43.3, self.vline(150, 47.8, 56), self.vline(160, 37, 47.8), 280],
                [37, 47.8, 56, 73, 81, 96],
            ),
            "dados_medidor_0": (
                [419, 551],
                [36, 46, 56, 66, 76],
            ),
            "classificacao_unidade_0": (
                [43.3, self.vline(79, 108, 117), self.vline(132, 108, 117), self.vline(147, 131, 140), 280],
                [108, 117, 131, 140, 149],
            ),
        }

    def __enter__(self):
        """Enter the ``with`` block and return this converter."""
        print("Reading PDF")
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Close the PDF when the ``with`` block ends.

        The three exception arguments are required by the context-manager
        protocol; they default to ``None`` so a direct ``obj.__exit__()`` call
        keeps working. Returns ``None``, so exceptions raised inside the block
        still propagate.
        """
        print("Closing PDF")
        self.pdf.close()

    # TODO: open the file when this method is used and close it on the way out
    def to_dataclass(self):
        """Extract every configured table and return them grouped by page.

        Despite the name, no ``Pdf`` record is populated yet: the return value
        is exactly what ``get_dataframe()`` produces (one list of DataFrames per
        page). The field mapping below is a disabled draft.
        """
        df = self.get_dataframe()

        # --- Draft mapping into a Pdf record (disabled, work in progress) ---
        # pdf = Pdf()
        # pdf.numero_instalacao = df[0][0][1]
        # pdf.grupo_tarifario = df[0][1][1][0].split("-")[1][0:2]
        # pdf.modalidade_tarifaria = "Verde" if "verde" in df[0][1][1][0].lower() else "Azul"
        # pdf.n_medidor = df[0][2][1]   # NOTE(review): Pdf declares `numero_medidor`, not `n_medidor`
        # pdf.numero_cliente = df[0][2][2]

        # # does not work yet
        # pos_desconto_acl = [pos for pos, char in enumerate(df[0][3][1]) if 'desconto energia acl' in char.lower()]
        # pdf.valor_fat_acl = df[0][3][7][pos_desconto_acl]

        # # value comes with a "%" sign
        # pdf.pis = df[0][3][12][0].split('\n')[1]
        # # value comes with a "%" sign
        # pdf.cofins = df[0][3][13][0].split('\n')[1]
        # # ICMS comes without the "%" sign
        # pdf.icms = df[1][3][9][1]

        # if pdf.modalidade_tarifaria == "Azul":

        #     pos_usd_p = [pos for pos, char in enumerate(df[1][3][1]) if 'uso sist distr ponta' in char.lower()][0]
        #     pdf.usd_ep = df[1][3][4][pos_usd_p]

        #     pos_usd_fp = [pos for pos, char in enumerate(df[1][3][1]) if 'uso sist distr f ponta' in char.lower()][0]
        #     pdf.usd_efp = df[1][3][4][pos_usd_fp]

        # else:
        #     pos_cde_p = [pos for pos, char in enumerate(df[2][1][2]) if 'cde covid ponta' in char.lower()][0]
        #     pdf.cde_covid_ep = df[2][1][3][pos_cde_p]
        #     pos_cde_fp = [pos for pos, char in enumerate(df[2][1][2]) if 'cde covid fpont' in char.lower()][0]
        #     pdf.cde_covid_ep = df[2][1][3][pos_cde_fp]   # NOTE(review): probably meant `cde_covid_efp`


        # pdf.dem_cont_ep = df[2][0][1][1]
        # pdf.dem_cont_efp = df[2][0][1][2]

        return df

In [37]:
# Open the sample ENEL PDF (the path is relative to the working directory).
# EnelConverter.__init__ opens the file and nothing in this notebook closes it;
# call enel.pdf.close() when finished.
enel = EnelConverter("in/enel/Barbosa ENEL_SP loja 19 abr21.pdf")

In [None]:
# Extract once and reuse: every to_dataclass() call re-reads all tables from the PDF.
page0_tables = enel.to_dataclass()[0]

# display() renders all three; bare expressions only render when they are the
# last line of the cell, so the first two would otherwise be discarded.
display(
    page0_tables[0][0],  # installation number (n_instalacao)
    page0_tables[0][1],  # customer number (n_cliente)
    page0_tables[1][0],  # meter number (n_medidor)
)

In [41]:
# Page 0 -> third configured table ("classificacao_unidade_0") -> column 1 -> row label 0.
enel.to_dataclass()[0][2][1][0]  # subgroup

'A4'