In [1]:
# Migração de Tabelas Hive para Tabelas Iceberg em ambiente Hadoop com Big Data.

In [2]:
# PASSO 01 - Instalação de Dependências - Certifique-se de ter instalado as bibliotecas necessárias, como PyArrow e Iceberg, no ambiente Python.

In [3]:
!pip install PyArrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install Iceberg

In [4]:
# PASSO 02 - Conectar ao Hive - Usando a biblioteca PyHive para recuperar as tabelas existentes e seus esquemas.

In [6]:
!pip install pyhive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyhive
  Downloading PyHive-0.6.5.tar.gz (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyhive
  Building wheel for pyhive (setup.py) ... [?25l[?25hdone
  Created wheel for pyhive: filename=PyHive-0.6.5-py3-none-any.whl size=51554 sha256=0f25383548f14ad3979db15b1c5b1b39723b9c47567571c7637fe2ea4cdcea2d
  Stored in directory: /root/.cache/pip/wheels/2f/51/26/016e93a30481dee1a91808520eefde1fff4da0804f289ac708
Successfully built pyhive
Installing collected packages: pyhive
Successfully installed pyhive-0.6.5


In [9]:
!pip install thrift

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting thrift
  Downloading thrift-0.16.0.tar.gz (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.16.0-cp310-cp310-linux_x86_64.whl size=399060 sha256=7e704e77232a73881390ad8d65106ab9d67e039f33e5b60b8468b0805a0b23e1
  Stored in directory: /root/.cache/pip/wheels/52/f8/d2/acfd995e8247eb0cad372fa6a640a5fcf279ab2ed7c5c4490e
Successfully built thrift
Installing collected packages: thrift
Successfully installed thrift-0.16.0


In [10]:
from pyhive import hive

In [None]:
import os
from getpass import getpass

# Connect to HiveServer2.
# The password is taken from the HIVE_PASSWORD environment variable, falling
# back to an interactive prompt, so that no secret is stored in the notebook.
hive_password = os.environ.get('HIVE_PASSWORD') or getpass('Hive password: ')

# PyHive's Connection names this argument `password` (there is no `passwd`)
# and only accepts a password when `auth` is 'LDAP' or 'CUSTOM'.
# NOTE(review): confirm which of the two modes the target HiveServer2 uses,
# and that the SASL client packages PyHive needs are installed.
hive_conn = hive.Connection(
    host='localhost',
    port=10000,
    username='hadoop_user',
    password=hive_password,
    auth='LDAP',
)
cursor = hive_conn.cursor()

# Fetch the tables that currently exist in Hive.
# `tables` holds the raw fetchall() result: one row tuple per table.
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()


In [None]:
# STEP 03 - Copy every Hive table into a matching Iceberg table.
import datetime
import re

import pyarrow as pa
from pyarrow import fs
from pyarrow.dataset import dataset
from pyarrow import Table
from pyarrow import schema
# NOTE(review): PyArrow's documented API does not list an `iceberg` submodule,
# so this import (and the iceberg.Table / new_data_file calls below) is
# expected to fail. The PyIceberg package is the likely replacement - confirm
# and port the write path before running this cell.
from pyarrow import iceberg

# Hive primitive type name -> Arrow data type.
_HIVE_PRIMITIVE_TYPES = {
    'tinyint': pa.int8(),
    'smallint': pa.int16(),
    'int': pa.int32(),
    'integer': pa.int32(),
    'bigint': pa.int64(),
    'float': pa.float32(),
    'double': pa.float64(),
    'boolean': pa.bool_(),
    'string': pa.string(),
    'binary': pa.binary(),
    'timestamp': pa.timestamp('us'),
    'date': pa.date32(),
}
_HIVE_DECIMAL = re.compile(r'decimal\((\d+)\s*,\s*(\d+)\)')


def _hive_type_to_arrow(hive_type):
    """Translate a Hive column type string into an Arrow data type.

    Raises:
        ValueError: for types without a mapping (array/map/struct/...), so an
            unsupported column is reported instead of being migrated wrongly.
    """
    normalized = hive_type.strip().lower()
    if normalized in _HIVE_PRIMITIVE_TYPES:
        return _HIVE_PRIMITIVE_TYPES[normalized]
    if normalized.startswith(('varchar', 'char')):
        return pa.string()
    decimal_match = _HIVE_DECIMAL.fullmatch(normalized)
    if decimal_match:
        precision, scale = (int(group) for group in decimal_match.groups())
        return pa.decimal128(precision, scale)
    raise ValueError(f'Unsupported Hive column type: {hive_type!r}')


def _schema_from_describe(describe_rows):
    """Build an Arrow schema from the rows returned by a Hive DESCRIBE.

    Each row is expected to start with (column name, column type).
    """
    fields = []
    for col_name, col_type, *_ in describe_rows:
        # NOTE(review): for partitioned tables DESCRIBE appears to append a
        # blank row followed by '# ...' partition metadata; stop at the first
        # such row so only real columns become fields - confirm on the cluster.
        if not col_name or not col_name.strip() or col_name.lstrip().startswith('#'):
            break
        fields.append((col_name.strip(), _hive_type_to_arrow(col_type)))
    return schema(fields)


def _rows_to_table(rows, arrow_schema):
    """Turn a list of row tuples into an Arrow Table that follows arrow_schema."""
    # Transpose row tuples into columns; an empty result set yields empty columns.
    columns = list(zip(*rows)) if rows else [() for _ in arrow_schema]
    data = {}
    for field, values in zip(arrow_schema, columns):
        if pa.types.is_date32(field.type):
            # NOTE(review): PyHive may hand DATE values back as ISO strings;
            # parse those so Arrow can store them as dates - confirm.
            values = [
                datetime.date.fromisoformat(value) if isinstance(value, str) else value
                for value in values
            ]
        data[field.name] = list(values)
    return Table.from_pydict(data, schema=arrow_schema)


# Configure the HDFS client.
# HadoopFileSystem has no password parameter (the previous `passwd` keyword is
# rejected); authentication comes from the user name or a Kerberos ticket.
hdfs = fs.HadoopFileSystem(host='dominio.com.br', port=8020, user='hadoop_user')

for table_row in tables:
    # fetchall() returns row tuples; the table name is the first column.
    table_name = table_row[0]

    # Read the Hive table's schema.
    cursor.execute(f"DESCRIBE {table_name}")
    hive_schema = cursor.fetchall()

    # Derive the Arrow schema from the Hive schema.
    arrow_schema = _schema_from_describe(hive_schema)

    # Create the matching Iceberg table.
    iceberg_table_location = f'hdfs:/home/data_lakehouse/INSS/dadossaida/iceberg_table/{table_name}'
    iceberg_table = iceberg.Table.create(iceberg_table_location, schema=arrow_schema)

    # Read the Hive table's data.
    # Note: fetchall() loads the whole table into memory at once.
    cursor.execute(f"SELECT * FROM {table_name}")
    hive_data = cursor.fetchall()

    # Convert the Hive rows into an Arrow Table.
    arrow_data = _rows_to_table(hive_data, arrow_schema)

    # Create a new Iceberg data file and write the rows into it.
    new_file = iceberg_table.new_data_file()
    with new_file.new_row_write() as writer:
        writer.write_table(arrow_data)

    # Register the new file with the Iceberg table.
    iceberg_table.new_version().add_file(new_file).commit()




In [None]:
# PASSO 04 - Validação e Testes
# Após a migração, é importante validar os dados migrados para garantir que tudo ocorreu
# conforme o esperado. Você pode consultar e verificar os dados na tabela Iceberg usando a
# biblioteca PyArrow.

In [None]:
# Load the migrated Iceberg table.
# iceberg_table_location is reassigned on every iteration of the migration
# loop, so only the last table that was migrated gets loaded here.
# NOTE(review): `iceberg` comes from `from pyarrow import iceberg` in the
# migration cell; confirm that API exists before relying on this cell.
loaded_table = iceberg.Table.load(iceberg_table_location, hdfs=hdfs)

# Read the Iceberg table's data via to_pandas().
iceberg_data = loaded_table.to_pandas()

# Run tests and validations on the migrated data:
# compare the original data in Hive with the data migrated to the Iceberg tables.