In [1]:
import os
import sys
# Put the parent directory (resolved to an absolute path) at the front of the
# import search path. Presumably this is what lets the project-local
# `weaviate_module` package be imported from this notebook's folder -- verify
# against the repository layout.
sys.path.insert(0, os.path.abspath(".."))

In [2]:
from weaviate_module.weaviate_wrapper import WeaviateWrapper

In [3]:
# Project wrapper pointed at a Weaviate endpoint on localhost:8080.
# NOTE(review): the host is hard-coded; consider moving it to a config cell or
# an environment variable if this notebook is run against other deployments.
ww = WeaviateWrapper(host="http://localhost:8080")

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [4]:
# Schema definition for the parent/child chunk class: every stored object
# carries the child text, the parent text (`text`) and the parent's chunk number.
class_obj = {
    'class': 'Parent_child_chunks',
    'properties': [
        {'name': 'child_text', 'dataType': ['text']},
        {'name': 'text', 'dataType': ['text']},
        {'name': 'chunk_number', 'dataType': ['int']},
    ],
}

In [16]:
# Only add the class when the schema does not already report it, so this cell
# can be re-run without attempting to add 'Parent_child_chunks' a second time.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; confirm the notebook still works under Restart & Run All.
if not ww.client.schema.exists('Parent_child_chunks'):
    ww.add_class_to_schema(classconfiguration=class_obj)

In [5]:
import fitz
import pandas as pd
import ast
from langchain.text_splitter import SpacyTextSplitter

In [6]:
def text_to_chunks(texts: str,
                   chunk_length: int = 100,
                   chunk_overlap: int = 25) -> list:
    """
    Splits the text into consecutive word windows that overlap each other.
    Words are obtained by splitting on single spaces, so newlines and other
    whitespace stay attached to the surrounding words.
    Args:
        texts (str): Text to be converted into chunks.
        chunk_length (int): Maximum number of words in each chunk.
        chunk_overlap (int): Number of words to overlap between chunks.
    Returns:
        list: Dicts of the form {"text": str, "chunk_number": int}, numbered
            from 1 in reading order. Empty or whitespace-only input gives [].
    Raises:
        ValueError: If chunk_length is not positive, or chunk_overlap is
            negative or not smaller than chunk_length.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer")
    if not 0 <= chunk_overlap < chunk_length:
        # An overlap >= chunk_length would never advance the window (endless
        # loop); a negative overlap would silently skip words between chunks.
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_length")
    if not texts.strip():
        return []

    words = texts.split(' ')
    n = len(words)
    step = chunk_length - chunk_overlap
    chunks = []
    for chunk_number, start in enumerate(range(0, n, step), start=1):
        end = min(start + chunk_length, n)
        chunk = ' '.join(words[start:end]).strip()
        chunks.append({"text": chunk, "chunk_number": chunk_number})
        if end == n:
            # The window already reached the last word; another iteration
            # would only repeat words that are fully contained in this chunk.
            break
    return chunks

In [7]:
def parent_child_splitting(text: str, number_of_children: int, child_overlap: int = 10) -> list:
    """
    Splits the parent text into 'number_of_children' chunks, each chunk containing a portion of the full text.
    Consecutive chunks share 'child_overlap' words, and together the chunks cover every word of the parent.
    Args:
        text (str): Parent text to be split into chunks (split on single spaces).
        number_of_children (int): Number of chunks to split the text into.
        child_overlap (int): Number of words to overlap between consecutive chunks.
    Returns:
        list: List containing each chunk of text; empty when number_of_children < 1.
    """
    words = text.split(' ')
    total_words = len(words)
    # Calculate the length of each chunk (including the overlapping words)
    if number_of_children <= 1:
        chunk_length = total_words
    else:
        padded_total = total_words + child_overlap * (number_of_children - 1)
        # Round up: rounding down would leave the last few words of the parent
        # outside every child chunk.
        chunk_length = -(-padded_total // number_of_children)
    chunks = []
    start_index = 0
    for _ in range(number_of_children):
        end_index = min(start_index + chunk_length, total_words)
        chunks.append(' '.join(words[start_index:end_index]).strip())
        # Step back by the overlap for the next child, never before the first word.
        start_index = max(0, end_index - child_overlap)
    return chunks

In [8]:
def parent_to_child(parent_chunk:str)->list:
    """
    Splits a parent chunk into child texts using spaCy sentence segmentation.
    Args:
        parent_chunk (str): Parent text to be split.
    Returns:
        list: Child texts in reading order; empty if the splitter yields nothing.
    """
    # Build the splitter once and keep it on the function object: creating it
    # loads the spaCy pipeline, which is wasteful to repeat per parent chunk.
    text_splitter = getattr(parent_to_child, "_splitter", None)
    if text_splitter is None:
        text_splitter = SpacyTextSplitter(pipeline="en_core_web_sm",separator='##')
        parent_to_child._splitter = text_splitter
    # Walk over every piece returned by the splitter rather than only the
    # first: indexing [0] raised IndexError when nothing was returned and
    # ignored any further pieces.
    # NOTE(review): if the splitter applies its own chunk overlap, very long
    # parents may repeat sentences across pieces -- confirm against its settings.
    return [sent
            for piece in text_splitter.split_text(parent_chunk)
            for sent in piece.split('##')]

In [10]:
# Load the 200-row MS MARCO sample. The `contexts` column is parsed further
# down with ast.literal_eval, i.e. it holds stringified Python lists.
df = pd.read_csv("../data/ms-marco-200-rows.csv")

In [9]:
# df = pd.read_csv("../data/baseline.csv")

In [10]:
# Holds the concatenated text of all context passages; starts out empty.
texxt =""

In [11]:
df.head()

Unnamed: 0,question,ground_truths,answer,contexts
0,How to deposit a cheque issued to an associate...,['Have the check reissued to the proper payee....,\nThe best way to deposit a cheque issued to a...,"[""Just have the associate sign the back and th..."
1,Can I send a money order from USPS as a business?,"[""Sure you can. You can fill in whatever you ...","\nYes, you can send a money order from USPS as...","[""Sure you can. You can fill in whatever you ..."
2,1 EIN doing business under multiple business n...,"[""You're confusing a lot of things here. Compa...","\nYes, it is possible to have one EIN doing bu...","[""You're confusing a lot of things here. Compa..."
3,Applying for and receiving business credit,"['""I\'m afraid the great myth of limited liabi...",\nApplying for and receiving business credit c...,"[""Set up a meeting with the bank that handles ..."
4,401k Transfer After Business Closure,"[""You should probably consult an attorney. How...",\nIf your employer has closed and you need to ...,"[""The time horizon for your 401K/IRA is essent..."


In [12]:
# Rebuild the corpus from scratch on every run. Appending to a `texxt`
# initialised in another cell made this cell non-idempotent: running it twice
# duplicated the whole corpus.
context_blocks = []
for context in df["contexts"]:
    # Each cell is a stringified Python list of passages.
    context = ast.literal_eval(context)
    text = '\n'.join(context)
    context_blocks.append(text + "\n")
# Single join instead of repeated string concatenation.
texxt = "".join(context_blocks)

In [13]:
len(texxt)

88434

In [14]:
# Split the corpus into parent chunks using text_to_chunks' defaults
# (chunk_length=100 words, chunk_overlap=25 words).
chunks = text_to_chunks(texxt)

In [15]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [17]:
# For every parent chunk, derive its child texts and call
# add_parent_child_object_to_schema once per child for class "Parent_child_chunks".
# NOTE(review): re-running this cell repeats every call; presumably that
# creates duplicate objects in the store -- verify against the wrapper.
for chunk in chunks:
    chunk_number = chunk["chunk_number"]
    chunk_text = chunk["text"]
    print("-"*100)
    # Disabled alternative: fixed word-window children (6 per parent with a
    # 5-word overlap) instead of the spaCy-based split used below.
    #child_texts = parent_child_splitting(text=chunk_text,number_of_children=6,child_overlap=5)
    child_texts = parent_to_child(chunk_text)
    for child_text in child_texts:
        # The parent text and chunk number are passed again for every child.
        ww.add_parent_child_object_to_schema(classname="Parent_child_chunks",
                                             parent_text=chunk_text,
                                             chunk_number=chunk_number,
                                             child_text=child_text)
    print(f"added {chunk_number} of rag_dataset")

----------------------------------------------------------------------------------------------------




added 1 of rag_dataset
----------------------------------------------------------------------------------------------------




added 2 of rag_dataset
----------------------------------------------------------------------------------------------------
added 3 of rag_dataset
----------------------------------------------------------------------------------------------------
added 4 of rag_dataset
----------------------------------------------------------------------------------------------------
added 5 of rag_dataset
----------------------------------------------------------------------------------------------------
added 6 of rag_dataset
----------------------------------------------------------------------------------------------------
added 7 of rag_dataset
----------------------------------------------------------------------------------------------------
added 8 of rag_dataset
----------------------------------------------------------------------------------------------------
added 9 of rag_dataset
----------------------------------------------------------------------------------------------------
added 10