In [12]:
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Access the API key
api_key = os.getenv("LLAMA_API_KEY")

if api_key:
    print("API key loaded successfully!")
else:
    print("API key not found. Check your .env file.")


# Load the CSV file
file_path = 'istanbul_places_wikipedia.csv'  
df = pd.read_csv(file_path)

# Create a single text file with only content
output_file = 'istanbul_places_content.txt'

with open(output_file, 'w', encoding='utf-8') as f:
    for content in df['Content']:
        f.write(f"{content}\n")
        

print(f"Content saved in: {output_file}")




API key loaded successfully!
Content saved in: istanbul_places_content.txt


In the initial step, we used LLama Parser to chunk the txt for benchmark creation

In [8]:
import nest_asyncio

nest_asyncio.apply()

from llama_parse import LlamaParse

parser = LlamaParse(
    api_key=api_key,  
    result_type="markdown",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",  
)

# sync
documents_2 = parser.load_data("istanbul_places_content.txt")









Started parsing the file under job_id 1d8132dd-f018-4988-abe3-286e9f751a42


Comment: Parsing and chunking the wikipedia informations by using Llama Parser in order to create our benchmark

In [9]:
import json

# Assuming 'documents' contains a list of Document objects
# Extract text content from each document
serializable_documents = [{"text": doc.text} for doc in documents_2]

# Save the processed data as JSON
output_file = 'istanbul_places_documents_2.json'

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(serializable_documents, f, ensure_ascii=False, indent=4)

print(f"Documents saved in: {output_file}")

Documents saved in: istanbul_places_documents_2.json


Printing the character count in istanbul_places_content.txt

In [13]:
# Specify the file path
file_path = 'istanbul_places_content.txt'  

# Read the file and count characters
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()
    char_count = len(content)  # Count characters

print(f"Number of characters in the file: {char_count}")


Number of characters in the file: 753364


Now, we add additional information from IBB Museums into our existing istanbul_places_content.txt

In [14]:
import json

# File paths
txt_file_path = 'istanbul_places_content.txt'
json_file_path = 'museum_data.json'

# Read the existing text file or initialize if not found
try:
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        existing_content = file.read()
except FileNotFoundError:
    existing_content = ""

# Read additional data from the JSON file
try:
    with open(json_file_path, 'r', encoding='utf-8') as file:
        additional_data = json.load(file)
except FileNotFoundError:
    print(f"Error: '{json_file_path}' not found!")
    additional_data = []

# Append descriptions to the existing content
for entry in additional_data:
    if 'description' in entry:
        existing_content += "\n\n" + entry['description']

# Write updated content back to the text file
with open(txt_file_path, 'w', encoding='utf-8') as file:
    file.write(existing_content)

print("Descriptions appended successfully!")


Descriptions appended successfully!


Now, number of characters increase

In [15]:
# Specify the file path
file_path = 'istanbul_places_content.txt'  

# Read the file and count characters
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()
    char_count = len(content)  # Count characters

print(f"Number of characters in the file: {char_count}")

Number of characters in the file: 818243


Recursive Character Splitter

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the text file
with open('istanbul_places_content.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Initialize Recursive Character Splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,       # Max size of each chunk
    chunk_overlap=50,     # Overlap between chunks for context
    separators=["\n\n", "\n", " ", ""],  # Split by paragraphs, lines, spaces, characters
)

# Split the text into chunks
chunks = text_splitter.split_text(text)

# Save chunks into a new text file
with open('chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for i, chunk in enumerate(chunks):
        chunk_file.write(f"Chunk {i+1}:\n{chunk}\n\n")

print(f"{len(chunks)} chunks saved successfully in 'chunks.txt'")


2408 chunks saved successfully in 'chunks.txt'


Fixed Size Chunking

In [4]:
# Read the text file
with open('istanbul_places_content.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Fixed-size chunking function
def fixed_size_chunking(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0

    while start < len(text):
        # Define the end position with overlap
        end = start + chunk_size

        # Add chunk to the list
        chunks.append(text[start:end])

        # Move the start forward considering overlap
        start += chunk_size - overlap
    
    return chunks

# Generate chunks
fixed_chunks = fixed_size_chunking(text, chunk_size=500, overlap=50)

# Save fixed-size chunks to a text file
with open('fixed_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for i, chunk in enumerate(fixed_chunks):
        chunk_file.write(f"Chunk {i+1}:\n{chunk}\n\n")

print(f"{len(fixed_chunks)} fixed-size chunks saved successfully in 'fixed_chunks.txt'")

1675 fixed-size chunks saved successfully in 'fixed_chunks.txt'


Semantic Chunking

In [5]:
import spacy

# Load the pre-trained NLP model
nlp = spacy.load("en_core_web_sm")  # Use 'en_core_web_lg' for better accuracy with larger models

# Read the text file
with open('istanbul_places_content.txt', 'r', encoding='utf-8') as file:
    text = file.read()

def semantic_chunking(text, max_length=500, overlap=50):
    # Process the text with spaCy
    doc = nlp(text)
    
    chunks = []
    current_chunk = ""
    for sent in doc.sents:  # Iterate through sentences
        # Add the sentence if it fits in the current chunk
        if len(current_chunk) + len(sent.text) <= max_length:
            current_chunk += sent.text + " "
        else:
            # Add the current chunk to the list and start a new chunk with overlap
            chunks.append(current_chunk.strip())
            current_chunk = " ".join(current_chunk.split()[-overlap:]) + " " + sent.text
    
    # Append the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Generate semantic chunks
semantic_chunks = semantic_chunking(text, max_length=500, overlap=50)

# Save the chunks to a text file
with open('semantic_chunks.txt', 'w', encoding='utf-8') as chunk_file:
    for i, chunk in enumerate(semantic_chunks):
        chunk_file.write(f"Chunk {i+1}:\n{chunk}\n\n")

print(f"{len(semantic_chunks)} semantic chunks saved successfully in 'semantic_chunks.txt'")


4582 semantic chunks saved successfully in 'semantic_chunks.txt'
