<a href="https://colab.research.google.com/github/rsabilio/ia024-projeto-rag/blob/main/1_extracao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing packages

In [3]:
!pip install -q llama-index llama-parse

# Importing packages

In [36]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
import os
from google.colab import userdata
from llama_parse import LlamaParse
import json
import csv
import pandas as pd

# Patch the notebook's running event loop so nested async calls are allowed.
nest_asyncio.apply()

# Copy the LlamaCloud key from the Colab user secrets into the environment.
# NOTE(review): presumably LlamaParse reads LLAMA_CLOUD_API_KEY itself, since no
# key is passed to it explicitly in this notebook — confirm.
os.environ["LLAMA_CLOUD_API_KEY"] = userdata.get('LLAMA_CLOUD_API_KEY_VAGNER')

# Cloning git repo

In [2]:
# Clonar o repositório GitHub
!git clone https://github.com/rsabilio/ia024-projeto-rag.git

Cloning into 'ia024-projeto-rag'...
remote: Enumerating objects: 5016, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 5016 (delta 5), reused 14 (delta 3), pack-reused 4995[K
Receiving objects: 100% (5016/5016), 1.11 GiB | 30.40 MiB/s, done.
Resolving deltas: 100% (1394/1394), done.
Updating files: 100% (4582/4582), done.


In [55]:
def save_on_git(msg):
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')

    %cd /content/ia024-projeto-rag

    !git remote set-url origin https://{GITHUB_TOKEN}@github.com/rsabilio/ia024-projeto-rag

    !git config --global user.email "ramon.s.abilio@gmail.com"
    !git config --global user.name "Ramon Abilio"

    !git pull origin main

    !git add .

    !git commit -m {msg}

    !git push origin main

# Extracting to JSON

In [None]:
def _write_json_atomically(obj, path):
    """Write ``obj`` to ``path`` as UTF-8 JSON without ever exposing a partial file.

    The data goes to ``<path>.tmp`` first and is moved into place with
    ``os.replace`` only after serialization succeeded. The temporary file is
    removed if anything (including a kernel interrupt) stops the write.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, path)
    finally:
        # After a successful os.replace the temporary file no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def extract_and_save_content(parser, src_dir, dst_dir, log_file):
    """Parse every file under ``src_dir`` and mirror the results as JSON under ``dst_dir``.

    For each source file ``<src_dir>/<rel>/<name>`` the parser output is saved
    to ``<dst_dir>/<rel>/<name>.json`` and a ``[file_path, num_pages]`` row is
    appended to the CSV log. Files whose JSON already exists are skipped, so
    the function can be re-run to resume an interrupted extraction.

    A destination file is only created once the parser result has been
    validated. An empty or malformed result therefore leaves nothing behind,
    and the file is retried on the next run instead of being reported as
    already processed.

    Args:
        parser: Object exposing ``get_json_result(path)``; the result is
            expected to be a non-empty sequence whose first item has a
            ``"pages"`` entry.
        src_dir: Root directory of the files to parse.
        dst_dir: Root directory that receives the JSON files.
        log_file: CSV log path; created with a header row if missing.

    Returns:
        None. Per-file failures are printed and do not stop the run.
    """
    # Create the log with its header row on first use.
    if not os.path.exists(log_file):
        with open(log_file, 'w', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(['file_path', 'num_pages'])

    for root, _dirs, files in os.walk(src_dir):
        relative_path = os.path.relpath(root, src_dir)
        dst_file_dir = os.path.join(dst_dir, relative_path)

        for file in files:
            src_file_path = os.path.join(root, file)

            # Only directories that actually contain files are mirrored.
            os.makedirs(dst_file_dir, exist_ok=True)

            dst_file_path = os.path.join(dst_file_dir, f"{file}.json")
            if os.path.exists(dst_file_path):
                print(f"*** {src_file_path}  já processado")
                continue

            try:
                print(f"{src_file_path}")
                json_objs = parser.get_json_result(src_file_path)

                # Validate BEFORE writing: a JSON file on disk is what marks
                # a source file as done, so a bad result must not create one.
                if not json_objs:
                    raise ValueError("o parser retornou um resultado vazio")
                num_pages = len(json_objs[0]["pages"])

                _write_json_atomically(json_objs, dst_file_path)

                # Append immediately so the log reflects progress in real time.
                with open(log_file, 'a', newline='', encoding='utf-8') as csvfile:
                    csv.writer(csvfile).writerow([src_file_path, num_pages])

            except Exception as e:
                print(f"Erro ao processar o arquivo {src_file_path}: {e}")

# Example locations inside the Colab checkout
root = "/content/ia024-projeto-rag"
log_file_path = f"{root}/log-content-extraction.csv"
src_directory = f"{root}/0-downloaded-files/2-boituva/portarias"
dst_directory = f"{root}/1-files-content/2-boituva/portarias"

parser = LlamaParse(verbose=False)

# Process one yearly folder at a time (2014 through 2024), mirroring the
# directory layout into the destination and pushing after every year.
for ano in range(2014, 2025):
    print(f"Processing: {ano}")
    src_directory_year = "/".join((src_directory, str(ano)))
    dst_directory_year = "/".join((dst_directory, str(ano)))

    print("extracting...")
    extract_and_save_content(
        parser=parser,
        src_dir=src_directory_year,
        dst_dir=dst_directory_year,
        log_file=log_file_path,
    )

    print("push to git...")
    msg = 'Adicionando JSON das Portarias de Boituva'
    save_on_git(msg)


In [None]:
# Standalone push of whatever is still pending in the checkout.
msg = 'Adicionando JSON das Portarias de Boituva'
print("push to git...")
save_on_git(msg)