In [16]:
# System prompt for the dataset-generation requests. It asks the model to turn
# the user-supplied text into a JSON array of records with the fields
# "instruction", "input" and "output", shows one worked example, and asks for
# the output to be written in the language of the input.
# NOTE(review): the example array below contains the literal line
# "... to be continued", so the example itself is not valid JSON.
prompt = '''You are a specialist in creating databases for LLMs in JSON format with the fields "instruction", "input", and "output".
Extract important information from large amounts of text to create the entries for "instruction", "input", and "output" for the Dataset.
Use the text provided in the "Text input" section.
Example provided for guidance.

Example:

Text input:
Magic in George R. R. Martin's world manifests in different ways. In Westeros, visible magic has largely waned, leaving behind traces such as the Children of the Forest and their bond with weirwood trees. Conversely, in Essos, magic is more widespread. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who wield shadow magic, and the glass candles of the Citadel with mysterious properties.
House Stark is a prominent noble house in Westeros, governing the North from Winterfell. Recognized by their direwolf sigil and their motto 'Winter is Coming,' the Starks are celebrated for their honor, resilience, and ties to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children are central figures in the War of the Five Kings and the looming threat beyond the Wall.

Text output:
[
    {
        "instruction": "Write a short biography of a famous house in Westeros.",
        "input": "House Stark",
        "output": "House Stark is a noble house in Westeros, ruling the North from Winterfell. Known for their sigil of the direwolf and their motto 'Winter is Coming,' the Starks are renowned for their honor, resilience, and connection to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children play pivotal roles in the War of the Five Kings and the greater threat beyond the Wall."
    },
    {
        "instruction": "Explain the concept of magic in Westeros and Essos.",
        "input": "",
        "output": "Magic in George R. R. Martin's world exists in various forms. Westeros has largely seen a decline in overt magic, with remnants like the Children of the Forest and their connection to weirwood trees. In Essos, however, magic is more prevalent. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who practice shadow magic, and the glass candles of the Citadel with unknown properties."
    }
    ... to be continued
]

Additional Guidelines:
Describe only the data listed in the text, use the "Text output" format (JSON), and consider the examples only as a reference to extract relevant information about the subjects mentioned in the "text input" text.
You will receive user input data in diferent languages, outputs should be in input's language.
Use UTF-8 for generated text.
'''

In [17]:
# Imports the OpenAI library and sets the API key
from openai import AzureOpenAI
from unidecode import unidecode
import json_repair
import json
import fitz  # PyMuPDF
import tiktoken

# Azure OpenAI API configuration.
# The API key is deliberately not written in the notebook: when `api_key` is
# omitted, AzureOpenAI reads it from the AZURE_OPENAI_API_KEY environment
# variable and raises if no credential is available.
# NOTE(review): the key that used to be hardcoded in this cell must be treated
# as leaked and rotated.
client = AzureOpenAI(
    azure_endpoint="https://saulopenai.openai.azure.com/",
    api_version="2024-02-01"
)

# Extract the text of a range of pages from a PDF
def extract_text_from_pdf(pdf_path, start_page=5, end_page=None):
    """Return the concatenated text of pages ``start_page`` .. ``end_page - 1``.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        start_page: Zero-based index of the first page to read. The default
            of 5 skips the first five pages of the document.
        end_page: Zero-based index one past the last page to read. ``None``
            means "up to the last page"; larger values are clamped to the
            page count.

    Returns:
        One string holding the text of every selected page, in page order
        (empty when the range selects no page).
    """
    # The context manager closes the document even when a page fails to load,
    # which the previous explicit close() call did not guarantee.
    with fitz.open(pdf_path) as pdf_document:
        page_count = pdf_document.page_count
        stop_page = page_count if end_page is None else min(end_page, page_count)
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(start_page, stop_page)
        )

# Split the text into chunks of at most `chunk_size` tokens (500 by default)
def split_text_into_chunks(text, model_name="gpt-4o", chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` tokens.

    The text is tokenized with the tiktoken encoding registered for
    ``model_name``; every window of token ids is decoded back to a string.

    Returns:
        The list of decoded chunks, in order (empty for empty input).
    """
    encoder = tiktoken.encoding_for_model(model_name)
    token_ids = encoder.encode(text)

    # Cut every window first, then decode them in order.
    windows = []
    for start in range(0, len(token_ids), chunk_size):
        windows.append(token_ids[start:start + chunk_size])

    return list(map(encoder.decode, windows))
# System prompt for the dataset-generation requests: asks for a JSON array of
# "instruction" / "input" / "output" records built from the user-supplied text.
# It has to be assigned before process_chunks_with_api is defined, because that
# function uses it as the default value of `prompt_start`, which is evaluated
# once at definition time.
prompt = '''You are a specialist in creating databases for LLMs in JSON format with the fields "instruction", "input", and "output".
Extract important information from large amounts of text to create the entries for "instruction", "input", and "output" for the Dataset.
Use the text provided in the "Text input" section.
Example provided for guidance.

Example:

Text input:
Magic in George R. R. Martin's world manifests in different ways. In Westeros, visible magic has largely waned, leaving behind traces such as the Children of the Forest and their bond with weirwood trees. Conversely, in Essos, magic is more widespread. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who wield shadow magic, and the glass candles of the Citadel with mysterious properties.
House Stark is a prominent noble house in Westeros, governing the North from Winterfell. Recognized by their direwolf sigil and their motto 'Winter is Coming,' the Starks are celebrated for their honor, resilience, and ties to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children are central figures in the War of the Five Kings and the looming threat beyond the Wall.

Text output:
[
    {
        "instruction": "Write a short biography of a famous house in Westeros.",
        "input": "House Stark",
        "output": "House Stark is a noble house in Westeros, ruling the North from Winterfell. Known for their sigil of the direwolf and their motto 'Winter is Coming,' the Starks are renowned for their honor, resilience, and connection to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children play pivotal roles in the War of the Five Kings and the greater threat beyond the Wall."
    },
    {
        "instruction": "Explain the concept of magic in Westeros and Essos.",
        "input": "",
        "output": "Magic in George R. R. Martin's world exists in various forms. Westeros has largely seen a decline in overt magic, with remnants like the Children of the Forest and their connection to weirwood trees. In Essos, however, magic is more prevalent. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who practice shadow magic, and the glass candles of the Citadel with unknown properties."
    }
    ... to be continued
]

Additional Guidelines:
Describe only the data listed in the text, use the "Text output" format (JSON), and consider the examples only as a reference to extract relevant information about the subjects mentioned in the "text input" text.
You will receive user input data in diferent languages, outputs should be in input's language.
Use UTF-8 for generated text.
'''
# Send each text chunk to the API and collect the generated JSON records
def process_chunks_with_api(text_chunks, model_name="gpt-4o", prompt_start=prompt):
    """Request dataset records for every chunk and merge them into one list.

    Each chunk is sent as the user message of a chat completion whose system
    message is ``prompt_start``. The reply is parsed as JSON; replies that are
    not strictly valid JSON go through ``json_repair.loads``.

    Args:
        text_chunks: Iterable of strings, one request is made per item.
        model_name: Model / deployment name passed to the API.
        prompt_start: System prompt; defaults to the module-level ``prompt``
            as it was when this function was defined.

    Returns:
        A flat list with the record objects (dicts) gathered from all chunks.
        A chunk whose reply yields no record is reported on stdout and
        contributes nothing.
    """
    response_content = []

    for chunk in text_chunks:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": prompt_start},
                {"role": "user", "content": chunk}
            ]
        )
        response_text = response.choices[0].message.content

        response_json = None
        # The message content may be missing; json.loads(None) would raise a
        # TypeError that nothing here catches.
        if response_text:
            try:
                response_json = json.loads(response_text)
            except json.JSONDecodeError:
                # The previous fallback called json_repair.repair, which is not
                # part of the json_repair API; loads() repairs and parses.
                response_json = json_repair.loads(response_text)

        # A single object is one record; extend() on a dict would add its keys.
        if isinstance(response_json, dict):
            response_json = [response_json]

        # Keep only record objects so stray strings/numbers never reach the dataset.
        if isinstance(response_json, list):
            records = [item for item in response_json if isinstance(item, dict)]
        else:
            records = []

        if records:
            response_content.extend(records)
        else:
            print("Failed to parse response as JSON, skipping chunk.")

    return response_content

In [18]:
## Counting tokens
import tiktoken

def count_tokens(text, model_name="gpt-4o"):
    """Return the number of tokens ``text`` takes under ``model_name``'s tiktoken encoding."""
    # Look up the encoder for the model, tokenize, and report only the length.
    encoder = tiktoken.encoding_for_model(model_name)
    return len(encoder.encode(text))


In [19]:
# Token cost of the system prompt, which is sent again with every chunk
count_tokens(prompt)

621

In [20]:
# Sample passage (three English paragraphs about Tolkien's Middle-earth) used
# to try the prompt on a single request before processing a whole PDF.
text = '''
In the world of fantasy literature, few authors have created as intricate and immersive a universe as J.R.R. Tolkien. His works, most notably "The Hobbit" and "The Lord of the Rings," have become cornerstones of modern fantasy, setting the bar for world-building, character development, and epic storytelling. The richness of Middle-earth, with its diverse cultures, languages, and histories, continues to captivate readers and inspire writers decades after its creation.

Tolkien's Middle-earth is not just a backdrop for his stories; it is a living, breathing world with its own geography, politics, and social structures. The Shire, home to the hobbits, is a pastoral idyll, a symbol of peace and simplicity. In contrast, the dark lands of Mordor, with Mount Doom at its heart, represent the ultimate evil and corruption. Each location in Middle-earth is meticulously crafted, with its own unique atmosphere and significance.

The characters in Tolkien's works are equally memorable and well-developed. Frodo Baggins, the unlikely hero, embodies the themes of courage and sacrifice. His journey from the Shire to Mount Doom is a testament to the resilience of the human (or hobbit) spirit. Alongside Frodo, characters like Aragorn, Legolas, and Gimli add depth and variety to the narrative, each bringing their own strengths and perspectives to the quest to destroy the One Ring.
'''

In [21]:
# Token count of the sample passage used for the single test request
count_tokens(text)

284

In [22]:
# Single trial request: system prompt plus the sample passage as the user message
response = client.chat.completions.create(
    messages=[
        dict(role="system", content=prompt),
        dict(role="user", content=text),
    ],
    model="gpt-4o",  # Name of the Azure deployment to call
)
# Show the raw text of the first (only) choice
print(response.choices[0].message.content)

RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-02-01 have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}

In [None]:
# Example usage
# NOTE: this rebinds `text`, replacing the sample passage assigned earlier.
pdf_path = "gram_tupi.pdf"
text = extract_text_from_pdf(pdf_path)

# Split the text into chunks of 500 tokens
text_chunks = split_text_into_chunks(text)

# Process the chunks with the API. process_chunks_with_api already returns a
# list of parsed records, so there is no string left to strip or re-parse
# (calling .strip() on that list raised AttributeError).
responses = process_chunks_with_api(text_chunks, prompt_start=prompt)
json_data = responses

# Save the records to a JSON file
with open('dataset.json', 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, ensure_ascii=False, indent=4)

# The message now names the file that is actually written above
print("Processamento completo. Respostas salvas em 'dataset.json'.")


In [3]:
from openai import AzureOpenAI
from unidecode import unidecode
import json_repair
import json
import fitz  # PyMuPDF
import tiktoken

# Azure OpenAI API configuration.
# Endpoint and key are not written in the notebook. When `azure_endpoint` and
# `api_key` are omitted, AzureOpenAI reads them from the AZURE_OPENAI_ENDPOINT
# and AZURE_OPENAI_API_KEY environment variables and raises if they are unset.
# Passing empty strings, as before, overrides that fallback with unusable values.
client = AzureOpenAI(
    api_version="2024-02-01"
)

# Extract the text of a range of pages from a PDF
def extract_text_from_pdf(pdf_path, start_page=0, end_page=None):
    """Return the concatenated text of pages ``start_page`` .. ``end_page - 1``.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        start_page: Zero-based index of the first page to read (default: the
            first page of the document).
        end_page: Zero-based index one past the last page to read. ``None``
            means "up to the last page"; larger values are clamped to the
            page count.

    Returns:
        One string holding the text of every selected page, in page order
        (empty when the range selects no page).
    """
    # Using the document as a context manager guarantees it is closed even if
    # loading or reading a page raises.
    with fitz.open(pdf_path) as pdf_document:
        last_page = pdf_document.page_count
        if end_page is not None:
            last_page = min(end_page, last_page)
        # Collect per-page text and join once instead of growing a string.
        page_texts = [
            pdf_document.load_page(page_num).get_text()
            for page_num in range(start_page, last_page)
        ]
    return "".join(page_texts)

# Split the text into chunks of tokens
def split_text_into_chunks(text, model_name="gpt-4o", chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` tokens.

    Tokenization uses the tiktoken encoding registered for ``model_name``.
    The number of chunks, which is the number of API calls a later run will
    make, is printed before returning.

    Returns:
        The list of decoded chunks, in order (empty for empty input).
    """
    encoder = tiktoken.encoding_for_model(model_name)
    token_ids = encoder.encode(text)

    # Cut every window of token ids first, then decode them in order.
    starts = range(0, len(token_ids), chunk_size)
    windows = [token_ids[start:start + chunk_size] for start in starts]
    pieces = list(map(encoder.decode, windows))

    print("Quantidade de chamadas:", len(pieces))
    return pieces

# System prompt for the dataset-generation requests: asks for a JSON array of
# "instruction" / "input" / "output" records built from the user-supplied text,
# with one worked example. It is assigned before process_chunks_with_api is
# defined because that function captures it as the default of `prompt_start`.
prompt = '''You are a specialist in creating databases for LLMs in JSON format with the fields "instruction", "input", and "output".
Extract important information from large amounts of text to create the entries for "instruction", "input", and "output" for the Dataset.
Use the text provided in the "Text input" section.
Example provided for guidance.

Example:

Text input:
Magic in George R. R. Martin's world manifests in different ways. In Westeros, visible magic has largely waned, leaving behind traces such as the Children of the Forest and their bond with weirwood trees. Conversely, in Essos, magic is more widespread. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who wield shadow magic, and the glass candles of the Citadel with mysterious properties.
House Stark is a prominent noble house in Westeros, governing the North from Winterfell. Recognized by their direwolf sigil and their motto 'Winter is Coming,' the Starks are celebrated for their honor, resilience, and ties to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children are central figures in the War of the Five Kings and the looming threat beyond the Wall.

Text output:
[
    {
        "instruction": "Write a short biography of a famous house in Westeros.",
        "input": "House Stark",
        "output": "House Stark is a noble house in Westeros, ruling the North from Winterfell. Known for their sigil of the direwolf and their motto 'Winter is Coming,' the Starks are renowned for their honor, resilience, and connection to the Old Gods. Throughout A Song of Ice and Fire, Starks like Eddard 'Ned' Stark, his wife Catelyn Tully, and their children play pivotal roles in the War of the Five Kings and the greater threat beyond the Wall."
    },
    {
        "instruction": "Explain the concept of magic in Westeros and Essos.",
        "input": "",
        "output": "Magic in George R. R. Martin's world exists in various forms. Westeros has largely seen a decline in overt magic, with remnants like the Children of the Forest and their connection to weirwood trees. In Essos, however, magic is more prevalent. There are sorcerers in Qarth, the warlocks of Asshai, the Red Priests of R'hllor who practice shadow magic, and the glass candles of the Citadel with unknown properties."
    }
    ... to be continued
]

Additional Guidelines:
Describe only the data listed in the text, use the "Text output" format (JSON), and consider the examples only as a reference to extract relevant information about the subjects mentioned in the "text input" text.
You will receive user input data in diferent languages, outputs should be in input's language.
Use UTF-8 for generated text.
'''

# Send each text chunk to the API and collect the generated JSON records
def process_chunks_with_api(text_chunks, model_name="gpt4o-compass-dev", prompt_start=prompt):
    """Request dataset records for every chunk and merge them into one list.

    Each chunk is sent as the user message of a chat completion whose system
    message is ``prompt_start``. The reply is parsed with
    ``json_repair.loads``, and a progress line is printed after every chunk.

    Args:
        text_chunks: Iterable of strings, one request is made per item.
        model_name: Model / deployment name passed to the API.
        prompt_start: System prompt; defaults to the module-level ``prompt``
            as it was when this function was defined.

    Returns:
        A flat list with the record objects (dicts) gathered from all chunks.
        A chunk whose reply yields no record is reported on stdout and
        contributes nothing.
    """
    response_content = []

    # enumerate() keeps the running count of processed chunks
    for counter, chunk in enumerate(text_chunks, start=1):
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": prompt_start},
                {"role": "user", "content": chunk}
            ]
        )
        response_text = response.choices[0].message.content

        # json_repair.loads handles valid and malformed JSON alike, so the old
        # JSONDecodeError fallback (which called json_repair.repair, a name the
        # json_repair API does not provide) is gone. Its result is not
        # guaranteed to be a list, hence the type checks below.
        parsed = json_repair.loads(response_text) if response_text else None

        # A single object is one record; extend() on a dict would add its keys,
        # and extend() on a string would add single characters.
        if isinstance(parsed, dict):
            parsed = [parsed]
        if isinstance(parsed, list):
            records = [item for item in parsed if isinstance(item, dict)]
        else:
            records = []

        if records:
            response_content.extend(records)
        else:
            print("Failed to parse response as JSON, skipping chunk.")

        print(f"Chunks processados: {counter}")  # Progress counter

    return response_content

# Path to the PDF
pdf_path = "gloss.pdf"

# Extract the text from the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Split the text into chunks of tokens (default chunk size)
text_chunks = split_text_into_chunks(extracted_text)

# Process the text chunks with the API
response_content = process_chunks_with_api(text_chunks)

# Save the collected records to a JSON file
with open('dataset3.json', 'w', encoding='utf-8') as json_file:
    json.dump(response_content, json_file, ensure_ascii=False, indent=4)

# The message now names the file that is actually written above
print("Processamento completo. Respostas salvas em 'dataset3.json'.")


Quantidade de chamadas: 14
Chunks processados: 1
Chunks processados: 2
Chunks processados: 3
Chunks processados: 4
Chunks processados: 5
Chunks processados: 6
Chunks processados: 7
Chunks processados: 8
Chunks processados: 9
Chunks processados: 10
Chunks processados: 11
Chunks processados: 12
Chunks processados: 13
Chunks processados: 14
Processamento completo. Respostas salvas em 'responses.json'.
