# In [0]:
import json
import pyspark.sql.functions as F
import re
import sys
import traceback
from datetime import datetime
from delta.tables import DeltaTable
from enum import Enum, unique
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from types import TracebackType
from typing import List, Type, TypedDict

class Framework:
    
    @unique
    class LoadType(str, Enum):
        """Load strategies that can be requested from `write_delta_table`.

        Members mix in `str`, so each one compares equal to its plain string
        value. In the code visible here, `write_delta_table` only dispatches
        APPEND_ALL and UPSERT; any other member ends in NotImplementedError
        (unless the target is not yet a Delta table, in which case the load is
        switched to APPEND_ALL).
        """
        OVERWRITE_TABLE = "OVERWRITE_TABLE"
        OVERWRITE_PARTITION = "OVERWRITE_PARTITION"
        # Appends the incoming DataFrame to the target table.
        APPEND_ALL = "APPEND_ALL"  
        APPEND_NEW = "APPEND_NEW"
        # Requires at least one key column to be supplied by the caller.
        UPSERT = "UPSERT"

    @unique
    class RawFileFormat(str, Enum):
        """File formats accepted by `read_landing_zone_dataframe`.

        The member value is lower-cased and handed to `spark.read.format(...)`.
        """
        PARQUET = "PARQUET"
        DELTA = "DELTA"
        ORC = "ORC"
        # CSV is the only format for which the reader skips the cast-to-string step.
        CSV = "CSV"

    @unique
    class RunStatus(str, Enum):
        """Outcome stored under the "status" key of a `ReturnObject`."""
        SUCCEEDED = "SUCCEEDED"
        FAILED = "FAILED"
    
    @unique
    class SchemaEvolutionMode(str, Enum):
        """Selects the schema-handling branch of `_write_table_using_append_all`.

        Members mix in `str`, so each one compares equal to its plain string
        value.
        """
        # No extra writer option is set for this mode.
        FAIL_ON_SCHEMA_MISMATCH = "FAIL_ON_SCHEMA_MISMATCH"
        # The Delta writer is given the "mergeSchema" option.
        ADD_NEW_COLUMNS = "ADD_NEW_COLUMNS"
        # Intended to drop DataFrame columns that the existing target table lacks.
        IGNORE_NEW_COLUMNS = "IGNORE_NEW_COLUMNS"
        # The Delta writer is given the "overwriteSchema" option.
        OVERWRITE_SCHEMA = "OVERWRITE_SCHEMA"


    class ReturnObject(TypedDict):
        """Shape of the result dictionary produced by `_build_return_object`.

        The dictionary is what `exit_with_object` serializes to JSON when the
        notebook exits.
        """
        # A `RunStatus` value ("SUCCEEDED" or "FAILED").
        status: str
        # Name of the object the run targeted; `write_delta_table` passes
        # "<schema_name>.<table_name>".
        target_object: str
        num_records_read: int
        num_records_loaded: int
        # Computed as num_records_read - num_records_loaded.
        num_records_errored_out: int
        # Short error description, truncated to 8000 characters by the builder.
        error_message: str
        # Longer error text; callers in this class pass the formatted traceback.
        error_details: str

    
    def check_workspace_env()-> str:
        """Return the lakehouse root path configured for the current workspace.

        The workspace is identified through the Spark conf entry
        ``spark.databricks.clusterUsageTags.clusterOwnerOrgId``. Workspaces that
        are not listed fall back to the DBFS mount path; raising a ValueError
        for them was considered earlier but is not active.

        Note: this is a plain function (no ``cls``/``self``, no decorator)
        because it is invoked directly while the class body is being executed.
        """
        workspace_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterOwnerOrgId")

        # Workspace id -> storage root for that workspace.
        roots_by_workspace = {
            'xxxxxxxxx': 'abfss://xxxxxxxx.dfs.core.windows.net/xxxxxxxx',
            'yyyyyyyyy': 'abfss://xxxxxxxx.dfs.core.windows.net/xxxxxxxx',
            'zzzzzzzz': 'abfss://xxxxxxxx.dfs.core.windows.net/xxxxxxxx',
        }
        return roots_by_workspace.get(workspace_id, 'dbfs:/mnt/lakehouse/bronze')
    
    # Root locations of the lakehouse layers, resolved once when the class body
    # is executed. Every layer is resolved through the same
    # `check_workspace_env()` call, so no layer-specific path is derived here.
    # NOTE(review): confirm whether each layer is meant to have its own root.
    LAKEHOUSE_LANDING_ROOT =  check_workspace_env()
    LAKEHOUSE_BRONZE_ROOT  =  check_workspace_env()
    LAKEHOUSE_SILVER_ROOT  =  check_workspace_env()
    LAKEHOUSE_GOLD_ROOT    =  check_workspace_env()
    
    ###print(LAKEHOUSE_LANDING_ROOT)
    
    @classmethod
    def _build_return_object(
        cls,
        status: RunStatus,
        target_object: str,
        num_records_read: int = 0,
        num_records_loaded: int = 0,
        error_message: str = "",
        error_details: str = "",
    ) -> ReturnObject:
        """Assemble the result dictionary describing the last run or error.

        Args:
            status: Outcome of the run.
            target_object: Name of the object the run targeted.
            num_records_read: Number of records read from the source.
            num_records_loaded: Number of records written to the target.
            error_message: Short error text; only the first 8000 characters
                are kept.
            error_details: Longer error text, stored unchanged.

        Returns:
            A dictionary with the `ReturnObject` keys, where
            ``num_records_errored_out`` is ``num_records_read`` minus
            ``num_records_loaded``.
        """
        result = {}
        result["status"] = status
        result["target_object"] = target_object
        result["num_records_read"] = num_records_read
        result["num_records_loaded"] = num_records_loaded
        # Records that were read but did not make it into the target.
        result["num_records_errored_out"] = num_records_read - num_records_loaded
        # Keep the short message bounded in size.
        result["error_message"] = error_message[:8000]
        result["error_details"] = error_details
        return result
                
    @classmethod
    def exit_with_object(cls, results: ReturnObject):
        """Exit the notebook, returning `results` serialized as a JSON string.

        Args:
            results: Result dictionary to hand back through
                ``dbutils.notebook.exit``.
        """
        serialized_results = json.dumps(results)
        dbutils.notebook.exit(serialized_results)
    
    
    @classmethod
    def exit_with_last_exception(cls):
        """Exit the notebook with a FAILED result built from the active exception.

        Intended to be called from inside an ``except`` block. The exception
        currently being handled is read from ``sys.exc_info()`` and turned into
        a result dictionary, which is then passed to `exit_with_object`.

        If no exception is being handled, a generic message is reported
        instead of failing on the missing exception type.
        """
        exc_type, exc_value, _ = sys.exc_info()

        if exc_type is None:
            # Called outside of an ``except`` block: there is nothing to
            # describe, and ``exc_type.__name__`` would raise AttributeError.
            error_message = "UnknownError: no active exception"
            error_details = ""
        else:
            error_message = f"{exc_type.__name__}: {exc_value}"
            error_details = traceback.format_exc()

        results = cls._build_return_object(
            status=cls.RunStatus.FAILED,
            # No target is known at this point; kept as None so the JSON
            # payload still carries ``null`` for this key.
            target_object=None,
            error_message=error_message,
            error_details=error_details,
        )
        cls.exit_with_object(results)
        

        
    @classmethod
    def read_landing_zone_dataframe(
        cls,
        file_format: RawFileFormat,
        location: str,
        delimiter: str = ';',
    ) -> DataFrame:
        """Read a landing-zone dataset into a DataFrame of string columns.

        Args:
            file_format: Format of the source files (a `RawFileFormat` member
                or its string value); it is lower-cased and passed to
                ``spark.read.format``.
            location: Path to load.
            delimiter: Field delimiter passed to the reader as the
                "delimiter" option.

        Returns:
            The loaded DataFrame. For every format other than CSV, each
            non-string column is cast to string.

        On any error the notebook is exited through
        `exit_with_last_exception` instead of propagating the exception.
        """
        try:
            df = (
                spark.read
                .format(file_format.lower())
                .option("header", True)
                .option("escape", "\"")
                .option("mergeSchema", True)
                .option("delimiter", delimiter)
                .load(location)
            )

            # Cast every non-string column to string. CSV is skipped here;
            # no schema inference is requested from the reader above.
            if file_format != cls.RawFileFormat.CSV:
                non_string_columns = [col for col, dtype in df.dtypes if dtype != "string"]
                for column in non_string_columns:
                    df = df.withColumn(column, F.col(column).cast("string"))

            return df

        except Exception:
            # `Exception` rather than a bare except, so SystemExit and
            # KeyboardInterrupt are not intercepted.
            cls.exit_with_last_exception()
            
    @classmethod
    def generate_bronze_table_location(
        cls,
        table_name: str,
    ) -> str:
        """Build the storage path of a bronze table.

        Args:
            table_name: Name of the table; must not be None, empty or
                whitespace-only.

        Returns:
            ``"<LAKEHOUSE_BRONZE_ROOT>/<table_name>/"``.

        An invalid `table_name` raises ValueError internally; that error (like
        any other) ends the notebook through `exit_with_last_exception`.
        """
        try:
            # Only `table_name` is validated: it is the only parameter of this
            # method. The previous check also referenced an undefined
            # `schema_name`, which made every call fail with NameError.
            if not table_name or not table_name.strip():
                raise ValueError("Caminho não pode conter brancos ou nulo, verifique !")
            return f"{cls.LAKEHOUSE_BRONZE_ROOT}/{table_name}/"

        except Exception:
            cls.exit_with_last_exception()


    @classmethod
    def write_delta_table(
        cls,
        df: DataFrame,
        location: str,
        schema_name: str,
        table_name: str,
        load_type: LoadType,
        key_columns: List[str] = [],
        partition_columns: List[str] = [],
        schema_evolution_mode: SchemaEvolutionMode = SchemaEvolutionMode.ADD_NEW_COLUMNS,
    ) -> ReturnObject:
        """Write a DataFrame as a Delta table and register it in the metastore.

        Args:
            df: Data to write.
            location: Storage path of the Delta table.
            schema_name: Database in which the table is registered.
            table_name: Name under which the table is registered.
            load_type: Requested load strategy. Only APPEND_ALL and UPSERT are
                implemented here. If `location` is not yet a Delta table, the
                load is switched to APPEND_ALL regardless of the request.
            key_columns: Key columns for UPSERT; must be non-empty for that
                load type. The default list is never mutated.
            partition_columns: Partition columns, used by APPEND_ALL. The
                default list is never mutated.
            schema_evolution_mode: Schema handling passed on to the writer.

        Returns:
            A `ReturnObject` with status SUCCEEDED, or FAILED with the error
            message and traceback. Exceptions are not propagated.

        Side effects:
            Enables ``spark.databricks.delta.optimizeWrite.enabled`` on the
            session, and creates the database and table if they do not exist.
        """
        # NOTE(review): these counters are never updated anywhere in this
        # method, so the returned object always reports 0 records read and
        # loaded. Populating them needs a Spark action (e.g. a count) —
        # confirm the intended source before adding one.
        num_records_read = 0
        num_records_loaded = 0

        try:
            # A merge needs an existing table: fall back to a plain append the
            # first time the table is written.
            if load_type != cls.LoadType.APPEND_ALL and not DeltaTable.isDeltaTable(spark, location):
                print("Delta table ainda não existe. Altere load_type para APPEND_ALL e reexecute")
                load_type = cls.LoadType.APPEND_ALL

            # Optimized writes, to avoid producing many small files.
            spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", True)

            # Dispatch on the load type.
            if load_type == cls.LoadType.APPEND_ALL:
                cls._write_table_using_append_all(
                    df=df,
                    location=location,
                    partition_columns=partition_columns,
                    schema_evolution_mode=schema_evolution_mode,
                )
            elif load_type == cls.LoadType.UPSERT:
                if not key_columns:
                    raise ValueError("Nenhuma coluna foi especificada para o upsert")

                cls._write_table_using_upsert(
                    df=df,
                    location=location,
                    key_columns=key_columns,
                    schema_evolution_mode=schema_evolution_mode,
                )
            else:
                # Carry a message: it becomes `error_message` of the FAILED
                # result, which would otherwise be empty.
                requested = getattr(load_type, "value", load_type)
                raise NotImplementedError(f"load_type '{requested}' não implementado")

            # Register the database and the table over the written location.
            spark.sql(f"CREATE DATABASE IF NOT EXISTS {schema_name};")
            spark.sql(f"CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} USING DELTA LOCATION '{location}';")

            return cls._build_return_object(
                status=cls.RunStatus.SUCCEEDED,
                target_object=f"{schema_name}.{table_name}",
                num_records_read=num_records_read,
                num_records_loaded=num_records_loaded,
            )

        except Exception as e:
            return cls._build_return_object(
                status=cls.RunStatus.FAILED,
                target_object=f"{schema_name}.{table_name}",
                num_records_read=num_records_read,
                num_records_loaded=num_records_loaded,
                error_message=str(e),
                error_details=traceback.format_exc(),
            )
            
    @classmethod
    def _write_table_using_append_all(
        cls,
        df: DataFrame,
        location: str,
        partition_columns: List[str] = [],
        schema_evolution_mode: SchemaEvolutionMode = SchemaEvolutionMode.ADD_NEW_COLUMNS,
    ) -> None:
        """Append a DataFrame to the Delta table at `location`.

        Args:
            df: Data to append.
            location: Storage path of the Delta table.
            partition_columns: Columns to partition by; no partitioning when
                empty. The default list is never mutated.
            schema_evolution_mode: Schema handling:
                FAIL_ON_SCHEMA_MISMATCH sets no extra writer option;
                ADD_NEW_COLUMNS sets "mergeSchema";
                IGNORE_NEW_COLUMNS drops the DataFrame columns that an
                already existing target table does not have;
                OVERWRITE_SCHEMA sets "overwriteSchema".

        Raises:
            NotImplementedError: For any other schema evolution mode.
        """
        # Resolve the schema handling first: IGNORE_NEW_COLUMNS changes `df`,
        # so the writer must only be created afterwards. Building the writer
        # before pruning would write the unpruned DataFrame.
        writer_options = {}
        if schema_evolution_mode == cls.SchemaEvolutionMode.FAIL_ON_SCHEMA_MISMATCH:
            pass
        elif schema_evolution_mode == cls.SchemaEvolutionMode.ADD_NEW_COLUMNS:
            writer_options["mergeSchema"] = True
        elif schema_evolution_mode == cls.SchemaEvolutionMode.IGNORE_NEW_COLUMNS:
            if DeltaTable.isDeltaTable(spark, location):
                # DeltaTable exposes its columns through its DataFrame view.
                table_columns = set(DeltaTable.forPath(spark, location).toDF().columns)
                new_df_columns = [col for col in df.columns if col not in table_columns]
                if new_df_columns:
                    df = df.drop(*new_df_columns)
        elif schema_evolution_mode == cls.SchemaEvolutionMode.OVERWRITE_SCHEMA:
            # NOTE(review): the writer below uses append mode; confirm that
            # "overwriteSchema" has the intended effect outside overwrite mode.
            writer_options["overwriteSchema"] = True
        else:
            # Also covers modes that are not members of SchemaEvolutionMode.
            requested = getattr(schema_evolution_mode, "value", schema_evolution_mode)
            raise NotImplementedError(f"schema_evolution_mode '{requested}' não implementado")

        df_writer = (
            df.write
            .format("delta")
            .mode("append")
        )

        # Partition only when partition columns were given.
        if partition_columns:
            df_writer = df_writer.partitionBy(*partition_columns)

        for option_name, option_value in writer_options.items():
            df_writer = df_writer.option(option_name, option_value)

        # Write the Delta table.
        df_writer.save(location)
        
    ############################################
    ## codigo importado funções já existentes ##
    ############################################
    @classmethod        
    def folder_read_path(
        cls, 
        source_table, 
        source_path_landing
    ) -> list:
        """List the file paths found one folder level below a landing path.

        Each entry directly under `source_path_landing` is treated as a
        sub-folder (named ``year`` in the code) and its files are collected.
        Every collected path is printed once.

        Args:
            source_table: Not used by this method.
            source_path_landing: Landing path whose sub-folders are listed.

        Returns:
            The list of file paths.

        A file whose path does not match the expected name pattern raises
        ValueError internally; that error (like any other) ends the notebook
        through `exit_with_last_exception`.
        """
        try:
            # Same pattern as before, now a raw string so `\d` is not parsed
            # as a string escape. NOTE(review): the "." before "csv" is
            # unescaped and therefore matches any character — confirm.
            file_name_pattern = re.compile(r'\d+X*_\d+X*_*\d*.csv$')

            # Drop the last character of each entry name — presumably the
            # trailing "/" of a directory entry; verify against dbutils.fs.ls.
            years = [entry.name[:-1] for entry in dbutils.fs.ls(source_path_landing)]

            file_list = []
            for year in years:
                # Files inside the sub-folder.
                for file in dbutils.fs.ls(source_path_landing + '/' + year):
                    # An unexpected file name still aborts the listing, but
                    # with an explicit message instead of an AttributeError
                    # on a failed match.
                    if file_name_pattern.search(file.path) is None:
                        raise ValueError(f"Nome de arquivo fora do padrão esperado: {file.path}")
                    file_list.append(file.path)
                    print(file.path)
            return file_list
        except Exception:
            cls.exit_with_last_exception()
            
            
            
            