# Indexer pipeline

## Import dependencies

In [None]:
import pandas as pd
from llama_index.core import Settings, VectorStoreIndex, Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

from tqdm import tqdm

## Load Data

In [None]:
# Relative data directory; not referenced by any other cell visible in this notebook.
dir_data_original = "../data/"

# Absolute, machine-specific locations of the validation set: the JSON source
# file and the CSV file that the loading cell writes and then reads back.
# NOTE(review): the f-prefix has no placeholders, and the file name is spelled
# "mqdquad" while the dataset is called MedQuAD — confirm it matches the file on disk.
dir_data_json = f"E:/GitHub repositories/data-indexer/data/validations/mqdquad.json"
dir_data_csv = f"E:/GitHub repositories/data-indexer/data/validations/mqdquad.csv"

### Variables

In [None]:
# Number of rows kept from the validation CSV for this test run.
test_file_size = 50

# Default number of tokens per fixed-length chunk.
fixedlength_chunk_size = 512
# Passed as max_length (together with truncation=True) to tokenizer.tokenize()
# by both token-based chunkers.
fixedlength_chunk_max_length = 1024

# token-based sliding window
# 128 - 64 - 32 - 16
# 50% overlap is OK, but it could be 25% as well
# 64 - 16, 32 - 8, 16 - 4, 8 - 2

# Window size and overlap in tokens; the chunker advances by size - overlap.
token_sliding_window_size = 128
token_sliding_window_overlap = 64

# Window size in sentences for the sentence-based sliding window.
sentence_sliding_window_size = 5
# NOTE(review): despite its name, this value is bound as the default `stride`
# of create_sliding_window_chunks, i.e. it is the step between windows, not the
# number of shared sentences — confirm which meaning is intended.
sentence_sliding_window_overlap = 3

### Load Validation Dataset (MedQuAD)

In [None]:
# Load the validation set from JSON.
medquad = pd.read_json(dir_data_json, orient="records")

# Export only the 'answer' column to CSV (no index column) ...
medquad['answer'].to_csv(dir_data_csv, index=False)

# ... and read that CSV back as the working DataFrame.
medquadcsv = pd.read_csv(dir_data_csv)

# Keep only the first `test_file_size` rows for this run.
medquadcsv = medquadcsv[:test_file_size]

medquadcsv.info()
#medquadcsv.head(10)

# Set LLM to None
Settings.llm = None

# Set Hugging Face embedding model for LlamaIndex
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Hugging Face tokenizer setup
# NOTE(review): chunk sizes are measured with this 'bert-base-uncased'
# tokenizer, which is a different model than the embedding model configured
# above — confirm that the token counts are meant to be comparable.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

### Load Publications

### Load Sensors

## Index Data

### Functions

In [None]:
def CreateQueries(query_engine):
    """Run a fixed set of sample questions through ``query_engine`` and print the answers.

    Args:
        query_engine: Any object exposing ``query(question)``; its return
            value is interpolated into the printed text.

    Returns:
        None. One "Query/Response" pair is printed per question.
    """
    # Sample questions, asked in exactly this order.
    sample_questions = (
        "What is (are) keratoderma with woolly hair?",
        "How many people are affected by keratoderma with woolly hair?",
        "What are the genetic changes related to keratoderma with woolly hair?",
        "Is keratoderma with woolly hair inherited?",
        "What are the treatments for keratoderma with woolly hair?",
        "What is (are) Knobloch syndrome?",
        "How many people are affected by Knobloch syndrome?",
        "What are the genetic changes related to Knobloch syndrome?",
    )

    for question in sample_questions:
        answer = query_engine.query(question)
        print(f"Query: {question}\nResponse: {answer}\n")

### Rögzített hosszúságú chunk-ok (Fixed-length chunking)

- **Hivatkozás:** Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (pp. 4171-4186). https://doi.org/10.18653/v1/N19-1423
- **Állítás:** A rögzített hosszúságú chunk-ok használata gyakori a BERT alapú modellekben, ahol a dokumentumokat meghatározott hosszúságú (512 tokenes) egységekre bontják.

In [None]:
# Fixed-length chunking driven by the Hugging Face tokenizer
def create_fixed_length_chunks_with_tokenizer(texts, chunk_size=fixedlength_chunk_size):
    """Cut every text into consecutive, non-overlapping token chunks.

    Each text is tokenized with the module-level ``tokenizer`` (called with
    ``max_length=fixedlength_chunk_max_length`` and ``truncation=True``), the
    token list is sliced every ``chunk_size`` tokens, and each slice is turned
    back into a string and wrapped in a ``Document``.

    Args:
        texts: Iterable of texts to chunk (presumably plain strings).
        chunk_size: Number of tokens per chunk; the last chunk of a text may
            be shorter.

    Returns:
        list of ``Document`` objects, in input order.
    """
    documents = []
    for raw_text in tqdm(texts, desc="Creating chunks"):
        token_list = tokenizer.tokenize(
            raw_text,
            max_length=fixedlength_chunk_max_length,
            truncation=True,
        )
        # One Document per slice of `chunk_size` tokens.
        documents.extend(
            Document(text=tokenizer.convert_tokens_to_string(token_list[start:start + chunk_size]))
            for start in range(0, len(token_list), chunk_size)
        )
    return documents

# Extract the 'answer' column from the loaded data
texts = medquadcsv['answer'].tolist()

# Create chunks from the 'answer' column text
document_chunks_fixed_length = create_fixed_length_chunks_with_tokenizer(texts)

# Create a VectorStoreIndex from the document chunks
index_fixed_length = VectorStoreIndex.from_documents(document_chunks_fixed_length, show_progress=True)

# Persist the index through its storage context.
# NOTE(review): persist() is called without arguments in every indexing cell of
# this notebook; if they all resolve to the same default location, each later
# cell overwrites this index — confirm, or pass a distinct persist_dir.
index_fixed_length.storage_context.persist()

# Set up the query engine
query_engine_fixed_length = index_fixed_length.as_query_engine()

# Run the sample questions against this index and print the answers.
CreateQueries(query_engine_fixed_length)

### Sliding Window (csúszóablak) módszer, token hossz átfedéssel

- **Hivatkozás:** Beltagy, I., Peters, M. E., & Cohan, A. (2020). Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150. https://arxiv.org/abs/2004.05150
- **Állítás:** Az idézet alátámasztja a csúszóablakos módszert, amely csökkenti a hosszú szövegek feldolgozásának költségeit, miközben megőrzi a szöveg összefüggését, és a dokumentum több ezer tokenes feldolgozását is lehetővé teszi.

In [None]:
# Function to create sliding window chunks based on tokens
def create_token_based_chunks_with_sliding_window(texts, chunk_size=token_sliding_window_size, overlap=token_sliding_window_overlap):
    """Split each text into overlapping, token-based windows.

    Each text is tokenized with the module-level ``tokenizer`` (called with
    ``max_length=fixedlength_chunk_max_length`` and ``truncation=True``). A
    window of ``chunk_size`` tokens is then moved over the token list in steps
    of ``chunk_size - overlap``; every window is converted back to a string
    and wrapped in a ``Document``.

    Args:
        texts: Iterable of texts to chunk (presumably plain strings).
        chunk_size: Number of tokens per window.
        overlap: Number of tokens shared by two consecutive windows; must be
            smaller than ``chunk_size``.

    Returns:
        list of ``Document`` objects, in input order.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``. The
            step would be zero or negative, which previously either failed
            inside ``range`` or silently produced no chunks at all.
    """
    stride = chunk_size - overlap  # Tokens the window advances per step
    if stride <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    for text in tqdm(texts, desc="Creating chunks"):
        tokens = tokenizer.tokenize(text, max_length=fixedlength_chunk_max_length, truncation=True)  # Tokenize the text using Hugging Face tokenizer
        for start in range(0, len(tokens), stride):
            window_tokens = tokens[start:start + chunk_size]  # Select tokens within the window size
            chunk = tokenizer.convert_tokens_to_string(window_tokens)  # Convert tokens back to string
            chunks.append(Document(text=chunk))  # Store the chunk as a Document
            # Once a window reaches the end of the text, any further window
            # would be fully contained in this one and would only add a
            # duplicate tail fragment to the index.
            if start + chunk_size >= len(tokens):
                break
    return chunks

# Extract the 'answer' column from the loaded data
texts = medquadcsv['answer'].tolist()

# Create token-based chunks using sliding window
document_chunks_sliding_window_chunk_based = create_token_based_chunks_with_sliding_window(texts)

# Create a VectorStoreIndex from the document chunks
index_sliding_window_chunk_based = VectorStoreIndex.from_documents(document_chunks_sliding_window_chunk_based, show_progress=True)

# Persist the index through its storage context.
# NOTE(review): persist() is called without arguments in every indexing cell of
# this notebook; if they all resolve to the same default location, the indexes
# overwrite one another — confirm, or pass a distinct persist_dir.
index_sliding_window_chunk_based.storage_context.persist()

# Set up the query engine
query_engine_sliding_window_chunk_based = index_sliding_window_chunk_based.as_query_engine()

# Run the sample questions against this index and print the answers.
CreateQueries(query_engine_sliding_window_chunk_based)

### Mondat- vagy bekezdés-alapú darabolás (Sentence/Paragraph-based chunking)

- **Hivatkozás:** Kwiatkowski, T., Palomaki, J., Redfield, O., Collins, M., Parikh, A., Alberti, C., ... & Petrov, S. (2019). Natural questions: A benchmark for question answering research. Transactions of the Association for Computational Linguistics, 7, 453-466. https://doi.org/10.1162/tacl_a_00276
- **Állítás:** Itt a mondat- és bekezdés-alapú chunking módszer a kérdés-válasz rendszerek fejlesztésében alkalmazott módszert illusztrálja, amely függetlenül kezeli a hosszú és rövid válaszokat a dokumentumból kinyert mondat- vagy bekezdésszintű egységek alapján.

In [None]:
# Sentence-level chunking: one Document per sentence
def create_sentence_based_chunks(texts):
    """Split every text into sentences and wrap each sentence in a ``Document``.

    Sentences come from ``sent_tokenize``; each one is stripped of leading and
    trailing whitespace, and sentences that are empty after stripping are
    dropped.

    Args:
        texts: Iterable of texts to split (presumably plain strings).

    Returns:
        list of ``Document`` objects, one per non-empty sentence, in input order.
    """
    documents = []
    for raw_text in tqdm(texts, desc="Creating sentence chunks"):
        stripped_sentences = (piece.strip() for piece in sent_tokenize(raw_text))
        documents.extend(Document(text=piece) for piece in stripped_sentences if piece)
    return documents

# Extract the 'answer' column from the loaded data
texts = medquadcsv['answer'].tolist()

# Create chunks from the 'answer' column text
document_chunks_sentence_based = create_sentence_based_chunks(texts)

# Create a VectorStoreIndex from the document chunks
index_sentence_based = VectorStoreIndex.from_documents(document_chunks_sentence_based, show_progress=True)

# Persist the index through its storage context.
# NOTE(review): persist() is called without arguments in every indexing cell of
# this notebook; if they all resolve to the same default location, the indexes
# overwrite one another — confirm, or pass a distinct persist_dir.
index_sentence_based.storage_context.persist()

# Set up the query engine
query_engine_sentence_based = index_sentence_based.as_query_engine()

# Run the sample questions against this index and print the answers.
CreateQueries(query_engine_sentence_based)

### Sliding Window (csúszóablak) módszer, mondatonként

- **Hivatkozás:** Beltagy, I., Peters, M. E., & Cohan, A. (2020). Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150. https://arxiv.org/abs/2004.05150
- **Állítás:** Az idézet alátámasztja a csúszóablakos módszert, amely csökkenti a hosszú szövegek feldolgozásának költségeit, miközben megőrzi a szöveg összefüggését, és a dokumentum több ezer tokenes feldolgozását is lehetővé teszi.

In [None]:
# Function to create sentence-based chunks with sliding window
def create_sliding_window_chunks(texts, window_size=sentence_sliding_window_size, stride=sentence_sliding_window_overlap):
    """Group the sentences of each text into overlapping windows.

    Every text is split with ``sent_tokenize``; the sentences are stripped and
    empty ones are dropped. A window of ``window_size`` sentences is then
    moved over the list in steps of ``stride``; the sentences of each window
    are joined with a single space and wrapped in a ``Document``.

    Note that the default ``stride`` is taken from
    ``sentence_sliding_window_overlap``: the value is used as the step between
    windows, so two consecutive windows share ``window_size - stride``
    sentences.

    Args:
        texts: Iterable of texts to chunk (presumably plain strings).
        window_size: Number of sentences per window.
        stride: Number of sentences the window advances per step; must be
            positive.

    Returns:
        list of ``Document`` objects, in input order.

    Raises:
        ValueError: If ``stride`` is not positive. Such a step previously
            either failed inside ``range`` or silently produced no chunks.
    """
    if stride <= 0:
        raise ValueError(f"stride must be a positive number of sentences, got {stride}")

    chunks = []
    for text in tqdm(texts, desc="Creating sentence chunks"):
        # Strip first and filter afterwards, so that whitespace-only sentences
        # are really removed instead of surviving as empty strings.
        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        sentences = [sentence for sentence in stripped if sentence]

        # Apply sliding window
        for start in range(0, len(sentences), stride):
            window_sentences = sentences[start:start + window_size]  # Select a window of sentences
            if window_sentences:  # If the window has sentences
                chunk = ' '.join(window_sentences)  # Join the sentences to form a chunk
                chunks.append(Document(text=chunk))  # Store each chunk as a Document
            # Once a window reaches the last sentence, any further window
            # would be fully contained in this one and would only add a
            # duplicate tail fragment to the index.
            if start + window_size >= len(sentences):
                break
    return chunks

# Extract the 'answer' column from the loaded data
texts = medquadcsv['answer'].tolist()

# Create sliding window chunks from the 'answer' column text
document_chunks_sliding_window_sentence_based = create_sliding_window_chunks(texts)

# Create a VectorStoreIndex from the document chunks
index_sliding_window_sentence_based = VectorStoreIndex.from_documents(document_chunks_sliding_window_sentence_based, show_progress=True)

# Persist the index through its storage context.
# NOTE(review): persist() is called without arguments in every indexing cell of
# this notebook; if they all resolve to the same default location, the indexes
# overwrite one another — confirm, or pass a distinct persist_dir.
index_sliding_window_sentence_based.storage_context.persist()

# Set up the query engine
query_engine_sliding_window_sentence_based = index_sliding_window_sentence_based.as_query_engine()

# Run the sample questions against this index and print the answers.
CreateQueries(query_engine_sliding_window_sentence_based)

### Hierarchikus chunking

- **Hivatkozás:** Luong, T., Pham, H., & Manning, C. D. (2015). Effective approaches to attention-based neural machine translation. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (pp. 1412-1421). https://doi.org/10.18653/v1/D15-1166
- **Állítás:** A hierarchikus chunking megközelítés itt úgy jelenik meg, hogy a figyelmet több szinten alkalmazzák (szó- és mondatszinten), ami lehetővé teszi a modellek számára, hogy a különböző szintű kontextusokat figyelembe vegyék a szövegek feldolgozása során.

In [None]:
def create_hierarchical_chunks_2_levels(texts):
    """Chunk texts on two levels: fixed-length token chunks, then sentences.

    Level 1 cuts every text into token chunks with
    ``create_fixed_length_chunks_with_tokenizer``; level 2 splits the text of
    each of those chunks into sentences with ``create_sentence_based_chunks``.

    Args:
        texts: Iterable of texts to chunk (presumably plain strings).

    Returns:
        list of sentence-level ``Document`` objects, ordered by first-level
        chunk and then by sentence.
    """
    top_level_chunks = create_fixed_length_chunks_with_tokenizer(texts)  # First-level chunking based on tokens

    # Second-level chunking based on sentences. All first-level chunk texts are
    # handed over in one call: the sentence chunker processes its inputs in
    # order, so the result equals concatenating per-chunk calls, but only one
    # progress bar is shown instead of one per first-level chunk.
    return create_sentence_based_chunks([chunk.text for chunk in top_level_chunks])

# Extract the 'answer' column from the loaded data
texts = medquadcsv['answer'].tolist()

# Create hierarchical chunks from the 'answer' column text
document_chunks_hierarchical = create_hierarchical_chunks_2_levels(texts)

# Create a VectorStoreIndex from the hierarchical chunks
index_hierarchical_2_levels = VectorStoreIndex.from_documents(document_chunks_hierarchical, show_progress=True)

# Persist the index through its storage context.
# NOTE(review): persist() is called without arguments in every indexing cell of
# this notebook; if they all resolve to the same default location, the indexes
# overwrite one another — confirm, or pass a distinct persist_dir.
index_hierarchical_2_levels.storage_context.persist()

# Set up the query engine
query_engine_hierarchical_2_levels = index_hierarchical_2_levels.as_query_engine()

# Run the sample questions against this index and print the answers.
CreateQueries(query_engine_hierarchical_2_levels)

In [None]:
def create_hierarchical_chunks_3_levels(texts):
    """Chunk texts on three levels: token chunks, sentences, sentence windows.

    Level 1 cuts every text into fixed-length token chunks with
    ``create_fixed_length_chunks_with_tokenizer``. Levels 2 and 3 are both
    performed by ``create_sliding_window_chunks``, which splits each
    first-level chunk into sentences and then slides a window over those
    sentences. Windows never cross a first-level chunk boundary because every
    first-level chunk is passed as a separate text.

    The earlier implementation split each first-level chunk into sentences
    first and then ran the sliding window on every sentence separately, so a
    window could never span neighbouring sentences and the third level did
    not combine anything.

    Args:
        texts: Iterable of texts to chunk (presumably plain strings).

    Returns:
        list of ``Document`` objects, one per sentence window, ordered by
        first-level chunk and then by window position.
    """
    # First level: chunking based on tokens (fixed-length chunks)
    top_level_chunks = create_fixed_length_chunks_with_tokenizer(texts)

    # Second and third level: sentence splitting and the sliding window over
    # the sentences of each first-level chunk, done in a single batched call.
    return create_sliding_window_chunks([chunk.text for chunk in top_level_chunks])

# Extract the 'answer' column from the loaded data
texts = medquadcsv['answer'].tolist()

# Create three-level hierarchical chunks from the 'answer' column text
document_chunks_hierarchical_3_levels = create_hierarchical_chunks_3_levels(texts)

# Create a VectorStoreIndex from the three-level hierarchical chunks
index_hierarchical_3_levels = VectorStoreIndex.from_documents(document_chunks_hierarchical_3_levels, show_progress=True)

# Persist the index through its storage context.
# NOTE(review): persist() is called without arguments in every indexing cell of
# this notebook; if they all resolve to the same default location, the indexes
# overwrite one another — confirm, or pass a distinct persist_dir.
index_hierarchical_3_levels.storage_context.persist()

# Set up the query engine
query_engine_hierarchical_3_levels = index_hierarchical_3_levels.as_query_engine()

# Run the sample questions against this index and print the answers.
CreateQueries(query_engine_hierarchical_3_levels)

### Dinamikus chunking tokenlimittel

- **Hivatkozás:** Brown, T. B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., ... & Amodei, D. (2020). Language models are few-shot learners. Advances in Neural Information Processing Systems, 33, 1877-1901. https://arxiv.org/abs/2005.14165
- **Állítás:** A dinamikus chunking módszer ebben a kontextusban figyelmet fordít arra, hogy a természetes mondathatárok mentén darabolja fel a szöveget, így a GPT-3 modell optimálisan használja a 2048 tokenes kontextusablakát anélkül, hogy fontos információkat truncálna.

In [None]:
# Idea: keep adding sentences to a chunk while its embedding vector stays similar enough to the previous one; how similar? 0.75
# Dropped for now :(