In [1]:
# Display the value of every expression statement in a cell, not just the last
# one, so intermediate results show up without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Suppress warning messages. NOTE: with no category or module filter this
# silences *every* warning for the rest of the session, including deprecation
# notices from the libraries imported below.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

---------

In [4]:
from nn_rag import Knowledge
from nn_rag.components.commons import Commons

In [12]:
def join_strings_to_limit(strings, limit):
    """Greedily pack strings into space-separated lines of at most ``limit`` characters.

    Strings are consumed in order. Each one is added to the current line,
    preceded by a single space when the line is not empty, as long as the line
    stays within ``limit``. Otherwise the current line is emitted and the
    string starts a new one.

    Args:
        strings: Iterable of strings to pack, in order.
        limit: Maximum length of a joined line, separators included.

    Returns:
        A list of joined lines. A single string longer than ``limit`` is never
        split; it is emitted on its own line and may therefore exceed ``limit``.
    """
    result = []
    current_string = ""

    for string in strings:
        # The separator only counts (and is only added) once the line has content.
        separator = " " if current_string else ""
        if len(current_string) + len(separator) + len(string) <= limit:
            current_string += separator + string
        else:
            # Guard against flushing an empty line: this happens when the very
            # first string is already longer than the limit.
            if current_string:
                result.append(current_string)
            current_string = string

    if current_string:
        result.append(current_string)

    return result

# Example usage: pack single words into space-separated lines no longer than `limit`.
strings = ["This", "is", "an", "example", "of", "joining", "strings", "to", "a", "character", "limit."]
limit = 20  # maximum length of each joined line, separating spaces included
result = join_strings_to_limit(strings, limit)
print(result)

['This is an example', 'of joining strings', 'to a character', 'limit.']


In [15]:
def join_strings_at_indices(strings, indices):
    """Merge each element whose index is listed with the element that follows it.

    Every index in ``indices`` marks an element that is joined, with a single
    space, to its successor. Consecutive marked indices chain together, so a
    run ``k, k+1, ..., m`` produces one string built from elements ``k``
    through ``m + 1``. A run that reaches the end of the list simply has no
    successor to absorb. Unmarked elements that are not absorbed by a run are
    copied through unchanged.

    Args:
        strings: Sequence of strings.
        indices: Collection of zero-based positions to merge with their
            successor. Order and duplicates are irrelevant; positions outside
            ``range(len(strings))`` never match and are ignored.

    Returns:
        A new list; the input sequence is never returned or modified.
    """
    if not strings or not indices:
        # Always hand back a fresh list so callers can mutate the result
        # without affecting their input.
        return list(strings or ())

    # Only membership matters, so a set gives O(1) lookups inside the loops.
    join_points = set(indices)
    total = len(strings)
    result = []
    i = 0

    while i < total:
        if i not in join_points:
            result.append(strings[i])
            i += 1
            continue

        # Extend the run across consecutive marked positions.
        run_end = i
        while run_end + 1 < total and run_end + 1 in join_points:
            run_end += 1

        # The merged piece also absorbs the element right after the run, if any.
        stop = min(run_end + 2, total)
        result.append(" ".join(strings[i:stop]))
        i = stop

    return result

# Example usage: each listed index is merged with the element after it, and
# consecutive indices (1, 2 and 4, 5) chain into a single joined string.
strings = ["This", "is", "an", "example", "of", "joining", "strings", "in", "a", "list."]
indices = [1, 2, 4, 5, 7]
result = join_strings_at_indices(strings, indices)
print(result)


['This', 'is an example', 'of joining strings', 'in a', 'list.']


In [16]:
def split_elements(strings, limit):
    """Cut every string longer than ``limit`` into consecutive pieces of at most ``limit`` characters.

    Strings that already fit (including empty strings) are passed through as a
    single piece. Pieces are cut at fixed character offsets, without regard to
    word boundaries.

    Args:
        strings: Iterable of strings to split.
        limit: Maximum length of each piece; must be positive.

    Returns:
        A ``(pieces, original_positions)`` tuple of two equally long lists,
        where ``original_positions[k]`` is the index in ``strings`` that
        ``pieces[k]`` was cut from.

    Raises:
        ValueError: If ``limit`` is not positive. A non-positive limit can
            never advance through a string, so it is rejected up front instead
            of looping forever.
    """
    if limit <= 0:
        raise ValueError("limit must be positive")

    result = []
    original_positions = []

    for i, string in enumerate(strings):
        # range() yields nothing for an empty string; `or [string]` keeps the
        # empty string as its own single piece so positions stay aligned.
        pieces = [
            string[start:start + limit]
            for start in range(0, len(string), limit)
        ] or [string]
        result.extend(pieces)
        original_positions.extend([i] * len(pieces))

    return result, original_positions

def rejoin_elements(split_strings, original_positions):
    """Concatenate consecutive pieces that share the same original position.

    This is the inverse of splitting strings into pieces tagged with their
    source index: adjacent pieces carrying the same tag are glued back together
    (with no separator), and a change of tag starts a new output element.

    Args:
        split_strings: Sequence of string pieces.
        original_positions: Sequence giving, for each piece, the index of the
            string it came from. Must be at least as long as ``split_strings``.

    Returns:
        A list with one string per run of equal positions. Groups whose text is
        empty are kept, so the output stays aligned with the original list.

    Raises:
        IndexError: If ``original_positions`` is shorter than ``split_strings``.
    """
    result = []
    current_index = None

    for i, piece in enumerate(split_strings):
        position = original_positions[i]
        # `result` being non-empty (rather than the text being non-empty) is
        # what marks an open group, so empty pieces are not silently dropped.
        if result and position == current_index:
            result[-1] += piece
        else:
            result.append(piece)
            current_index = position

    return result

# Example usage: split over-long strings into fixed-size pieces, then restore them.
strings = ["This is a long string that needs to be split.", "Short one.", "Another long string that will be split."]
limit = 20  # maximum number of characters per piece

# `original_positions[k]` records which input string `split_strings[k]` came from.
split_strings, original_positions = split_elements(strings, limit)
print("Split strings:", split_strings)
print("Original positions:", original_positions)

# Placeholder: any per-piece processing would go here, between splitting and rejoining.

# Pieces sharing a position are concatenated back into one string.
rejoined_strings = rejoin_elements(split_strings, original_positions)
print("Rejoined strings:", rejoined_strings)


Split strings: ['This is a long strin', 'g that needs to be s', 'plit.', 'Short one.', 'Another long string ', 'that will be split.']
Original positions: [0, 0, 0, 1, 2, 2]
Rejoined strings: ['This is a long string that needs to be split.', 'Short one.', 'Another long string that will be split.']


In [17]:
def split_with_overlap(long_string, chunk_length, overlap_length):
    """Split a string into fixed-stride chunks, each prefixed with the tail of the previous one.

    The string is walked in steps of ``chunk_length``. Every chunk covers its
    own ``chunk_length`` characters plus up to ``overlap_length`` characters
    immediately before it, so all chunks except the first can be as long as
    ``chunk_length + overlap_length``.

    Args:
        long_string: The text to split.
        chunk_length: Step size between chunk starts; must be positive.
        overlap_length: Number of preceding characters to prepend to each
            chunk; must be non-negative.

    Returns:
        A list of chunks in order; empty when ``long_string`` is empty.

    Raises:
        ValueError: If ``chunk_length`` is not positive or ``overlap_length``
            is negative.
    """
    # Reject arguments that would stall the walk or make the overlap meaningless.
    if chunk_length <= 0 or overlap_length < 0:
        raise ValueError("Chunk length must be positive and overlap length must be non-negative")

    pieces = []
    total = len(long_string)
    position = 0
    while position < total:
        # Reach back for the overlap, clamped at the beginning of the string.
        window_start = max(0, position - overlap_length)
        # After advancing, `position` doubles as the exclusive end of this chunk.
        position += chunk_length
        pieces.append(long_string[window_start:position])

    return pieces

# Example usage: chunk two sentences and print each chunk on its own line.
long_strings = [
    "This is a long string that we want to split into chunks of a specified length.",
    "Another long string that needs to be split and include overlapping characters."
]

chunk_length = 10  # step between chunk starts
overlap_length = 5  # characters repeated from the end of the previous chunk

for long_string in long_strings:
    chunks = split_with_overlap(long_string, chunk_length, overlap_length)
    for chunk in chunks:
        print(chunk)
    # Visual separator between the chunks of consecutive input strings.
    print("-" * 40)


This is a 
is a long strin
string that we 
t we want to sp
to split into c
nto chunks of a
 of a specified
ified length.
----------------------------------------
Another lo
er long string 
ring that needs
needs to be spl
e split and inc
d include overl
overlapping cha
g characters.
----------------------------------------
