In [1]:
import json
import markdown
import math
import os
import pandas as pd
import re
import requests
import tiktoken

from transformers import GPT2Tokenizer

In [2]:
# Identifiers for the paper being processed. `project_folder`, `sub_category`,
# `work_bucket` and `base_name` become directory components in `file_path`;
# `base_name` is also the filename prefix of the chunk files written later on.
base_name = "2016_Guo_AdS_Numerics"
project_folder = "diygenomics-projects"
sub_category = "math"
work_bucket = "AdS-CFT"
# Stem of the Markdown file that is loaded (`<external_id>.md`).
external_id = "2023_06_01_e4e21f7112268ec22c17g"

In [3]:
# Endpoint that `create_kb_document` POSTs each chunk to.
# NOTE(review): port 999 is below 1024 (privileged on most Unix systems) — confirm it is the intended port.
url = 'http://localhost:999/create'

# NOTE(review): `enc`, `max_chunk_size` and `overlap_size` are not referenced by
# any cell visible in this notebook; `split_conversation` loads its own
# GPT2Tokenizer and is called with a literal 16384 limit. Confirm whether these
# are still needed.
enc = tiktoken.get_encoding("gpt2")
max_chunk_size = 16000
overlap_size = 10
# headers = {'Content-Type': 'application/json'}

# Root directory of the data tree, taken from the environment (None when unset).
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path inside this paper's ``mathpix`` folder.

    The result is
    ``<DATA_PATH>/<project_folder>/<sub_category>/<work_bucket>/<base_name>/mathpix/<*args>``.

    Args:
        *args: Extra path components appended after ``mathpix``.

    Returns:
        The joined path as a string.

    Raises:
        RuntimeError: If the ``DATA_PATH`` environment variable was not set
            when ``data_path`` was read.
    """
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    if data_path is None:
        raise RuntimeError("DATA_PATH environment variable is not set")
    return os.path.join(
        data_path, project_folder, sub_category, work_bucket, base_name, 'mathpix', *args
    )

In [4]:
# Location of the Markdown source for this paper: `<external_id>.md` inside the
# `mathpix` folder that `file_path` resolves.
document_path = file_path(f'{external_id}.md')

In [5]:
# Read the whole Markdown document into memory. The encoding is pinned to UTF-8
# so decoding does not depend on the platform's locale default.
with open(document_path, 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:
# Load the system prompt from the working directory; its token count is
# subtracted from the chunk budget in `split_conversation`. UTF-8 is explicit so
# the read does not depend on the platform's locale default.
with open('system_create.txt', 'r', encoding='utf-8') as file:
    system_prompt = file.read()

In [73]:
# def split_conversation(conversation, system_prompt, token_limit):
#     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#     tokens = tokenizer.encode(conversation)
#     prompt_tokens = tokenizer.encode(system_prompt)
#     max_tokens = token_limit - len(prompt_tokens)

#     # Calculate the number of chunks we need to split the tokens into
#     num_chunks = math.ceil(len(tokens) / max_tokens)

#     # Split the tokens into chunks
#     token_chunks = [tokens[i * max_tokens:(i + 1) * max_tokens] for i in range(num_chunks)]

#     # Decode each chunk back into text
#     text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]

#     return text_chunks
def _balanced_latex_end(tokenizer, tokens, start, end):
    """Pick a cut point in ``(start, end]`` that keeps ``$$`` delimiters paired.

    Returns ``end`` unchanged when ``tokens[start:end]`` already decodes to text
    with an even number of ``$$``. Otherwise returns the largest verified cut
    before the dangling ``$$``, or ``end`` again when no such cut exists.
    """
    chunk_text = tokenizer.decode(tokens[start:end])
    if chunk_text.count('$$') % 2 == 0:
        return end

    # Estimate the token offset of the dangling '$$' by re-encoding the text
    # that precedes it.
    latex_start = chunk_text.rfind('$$')
    cut = min(end, start + len(tokenizer.encode(chunk_text[:latex_start])))

    # Re-encoding a prefix need not reproduce the original token boundaries, so
    # check against the real tokens and back off until the delimiters pair up.
    while cut > start and tokenizer.decode(tokens[start:cut]).count('$$') % 2 != 0:
        cut -= 1

    # cut == start means the block opens at the very start of the window (it is
    # longer than the window). Fall back to a hard split so the caller advances.
    return cut if cut > start else end


def split_conversation(conversation, system_prompt, token_limit, tokenizer=None):
    """Split ``conversation`` into text chunks that fit a token budget.

    The budget per chunk is ``token_limit`` minus the token length of
    ``system_prompt``. A chunk boundary that would fall inside a ``$$ ... $$``
    block is moved to just before that block. A block longer than the budget is
    split at the budget boundary, since it cannot be kept whole.

    Args:
        conversation: Text to split.
        system_prompt: Prompt whose token count is reserved out of the budget.
        token_limit: Total token budget for prompt plus chunk.
        tokenizer: Object with ``encode(str) -> list`` and ``decode(list) -> str``.
            Defaults to the pretrained GPT-2 tokenizer.

    Returns:
        List of chunk strings in document order (empty for empty input).

    Raises:
        ValueError: If ``system_prompt`` alone uses up ``token_limit``.
    """
    if tokenizer is None:
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokens = tokenizer.encode(conversation)
    prompt_tokens = tokenizer.encode(system_prompt)
    max_tokens = token_limit - len(prompt_tokens)
    if max_tokens <= 0:
        # A non-positive window would never advance through the tokens.
        raise ValueError(
            f"token_limit ({token_limit}) must exceed the system prompt length "
            f"({len(prompt_tokens)} tokens)"
        )

    # Split the tokens into chunks
    token_chunks = []
    i = 0
    while i < len(tokens):
        end = min(i + max_tokens, len(tokens))
        # The final window needs no adjustment: nothing follows it, so moving
        # the boundary could only create an extra chunk.
        if end < len(tokens):
            end = _balanced_latex_end(tokenizer, tokens, i, end)
        token_chunks.append(tokens[i:end])
        i = end

    # Decode each chunk back into text
    return [tokenizer.decode(chunk) for chunk in token_chunks]

In [74]:
# Split the document so that each chunk plus the system prompt fits within
# 16384 tokens.
text_chunks = split_conversation(text, system_prompt, 16384)

# Write each chunk to <base_name>_<index>.txt in the working directory. UTF-8 is
# explicit so non-ASCII characters cannot fail on a non-UTF-8 locale.
for index, text_chunk in enumerate(text_chunks):
    with open(f'{base_name}_{index}.txt', 'w', encoding='utf-8') as f:
        f.write(text_chunk)

Token indices sequence length is longer than the specified maximum sequence length for this model (18729 > 1024). Running this sequence through the model will result in indexing errors


In [50]:
def create_kb_document(text_chunk):
    """POST one text chunk to the ``url`` endpoint and print the outcome.

    Args:
        text_chunk: Text sent as the ``input`` field of the JSON payload, after
            control characters have been stripped.

    Prints the HTTP status code, then the decoded JSON body. If the body is not
    valid JSON, the raw response text is printed instead.
    """
    # Strip C0/C1 control characters before sending.
    # NOTE(review): the range includes \n and \t, so line breaks are removed and
    # adjacent lines are joined — confirm that is intended.
    text_chunk = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text_chunk)

    payload = {
        'input': text_chunk
    }

    response = requests.post(url, json=payload)
    print('Status code:', response.status_code)
    try:
        body = response.json()
    except ValueError:
        # requests raises a ValueError subclass for a non-JSON body; show the
        # raw text so the server's actual reply is not hidden by the traceback.
        body = response.text
    print('Response:', body)

In [51]:
# create_kb_document('estasfaf asdfasf test test')

In [62]:
# NOTE(review): this loop writes the same <base_name>_<index>.txt files as the
# loop that follows the `split_conversation` call; it looks like a leftover from
# when the upload below was enabled — confirm whether both are needed.
for index, text_chunk in enumerate(text_chunks):
    # UTF-8 is explicit so non-ASCII characters cannot fail on a non-UTF-8 locale.
    with open(f'{base_name}_{index}.txt', 'w', encoding='utf-8') as f:
        f.write(text_chunk)
    # create_kb_document(text_chunk)