In [None]:
from IPython.display import display, Markdown
from nltk.tokenize import sent_tokenize

def display_splitter_results(chunks_list, splitter_names, sentences=3):
    """Render a Markdown table showing the text on either side of chunk boundaries.

    For each selected splitter run, the first two chunk boundaries are shown as
    a pair of rows: the tail of chunk ``i`` followed by the head of chunk ``i+1``.

    Args:
        chunks_list: List of dicts, one per splitter run. Each dict is read for
            the keys ``'type'``, ``'chunk_size'``, ``'overlap_size'`` and
            ``'chunks'`` (a sequence of chunk texts, assumed to be strings).
        splitter_names: ``'all'`` (case-insensitive) to show every entry, a
            single splitter class name, or an iterable of splitter class names.
        sentences: For the sentence-based splitters (NLTK / spaCy), how many
            sentences to show on each side of a boundary.

    Raises:
        AssertionError: If ``chunks_list`` is None or not a list, or if any
            requested splitter name is not one of the known splitters.
    """
    assert chunks_list is not None, "The 'chunks_list' parameter cannot be None. Please provide a valid chunk_list."
    assert isinstance(chunks_list, list), "The 'chunks_list' parameter must be a list of chunked Langchain Documents"
    # Abbreviate type names
    abbreviations = {
        'CharacterTextSplitter': 'C',
        'RecursiveCharacterTextSplitter': 'R',
        'NLTKTextSplitter': 'N',
        'SpacyTextSplitter': 'S'
    }

    def _as_cell(value):
        # A newline ends a Markdown table row and a bare pipe starts a new
        # column, so collapse all whitespace runs and escape pipes.
        return ' '.join(str(value).split()).replace('|', '\\|')

    if isinstance(splitter_names, str) and splitter_names.lower() == 'all':
        filtered_chunks_list = chunks_list
    else:
        # A bare string would otherwise be validated character by character.
        selected_names = [splitter_names] if isinstance(splitter_names, str) else splitter_names
        assert all(name in abbreviations for name in selected_names), "Check the splitter names.  A splitter name should be either `CharacterTextSplitter`, `RecursiveCharacterTextSplitter`, `NLTKTextSplitter`, or `SpacyTextSplitter`."
        filtered_chunks_list = [chunked_doc for chunked_doc in chunks_list if chunked_doc['type'] in selected_names]

    # Create a markdown string for the table
    table = "| Label | Chunk Size | Overlap Size | Total Chunks | Chunk # | Position in Chunk | Text |\n| --- | --- | --- | --- | --- | --- | --- |\n"

    for chunked_doc in filtered_chunks_list:
        doc_type = chunked_doc['type']
        # With 'all' an entry may carry a type that has no abbreviation; show
        # the full type name instead of failing with a KeyError.
        splitter_type = _as_cell(abbreviations.get(doc_type, doc_type))
        chunk_size = chunked_doc['chunk_size']
        overlap_size = chunked_doc['overlap_size']
        chunks = chunked_doc['chunks']
        total_chunks = len(chunks)
        row_prefix = f"| {splitter_type} | {chunk_size} | {overlap_size} | {total_chunks} |"

        # Walk the first two chunk boundaries (chunk 1 -> 2 and chunk 2 -> 3).
        # Fewer than two chunks means there is no boundary, so nothing is shown.
        for i in range(min(2, total_chunks - 1)):
            chunk = chunks[i]
            next_chunk = chunks[i + 1]

            # NOTE: when overlap_size is 0 the slice chunk[-0:] is the whole
            # chunk, not an empty string. The sentence-based branch depends on
            # this to reach the chunk's last sentences when no overlap is set.
            if doc_type in ['NLTKTextSplitter', 'SpacyTextSplitter']:
                # Show whole sentences on each side of the boundary
                tail_sentences_list = sent_tokenize(chunk[-overlap_size:])
                head_sentences_list = sent_tokenize(next_chunk)
                tail = ' '.join(tail_sentences_list[-sentences:]) if tail_sentences_list else ''
                head = ' '.join(head_sentences_list[:sentences]) if head_sentences_list else ''
            else:
                # Character-based splitters: show the overlap region, plus 50
                # extra characters of the next chunk for context
                tail = chunk[-overlap_size:]
                head = next_chunk[:overlap_size + 50]

            # One row for the tail of the current chunk, one for the head of the next
            table += f"{row_prefix} {i+1} | Tail | {_as_cell(tail)} |\n"
            table += f"{row_prefix} {i+2} | Head | {_as_cell(head)} |\n"

    # Display the table
    display(Markdown(table))


In [None]:
from IPython.display import display, Markdown
###  Transcript chunks
def display_transcript_chunks(chunks_list):
    """Show a Markdown summary table with one row per chunked transcript.

    Args:
        chunks_list: List of dicts; each is read for the keys ``'type'``,
            ``'chunk_size'``, ``'overlap_size'`` and ``'chunks'`` (only the
            length of ``'chunks'`` is reported).

    Raises:
        AssertionError: If ``chunks_list`` is None or is not a list.
    """
    assert chunks_list is not None, "The 'chunks_list' parameter cannot be None. Please provide a valid chunk_list."
    assert isinstance(chunks_list, list), "The 'chunks_list' parameter must be a list of chunked Langchain Documents"

    # Header row plus the Markdown column separator row
    rows = ["| Type | Chunk Size | Overlap | Num Chunks |\n| --- | --- | --- | --- |\n"]

    # One table row per splitter run
    row_template = "| {} | {} | {} | {} |\n"
    for entry in chunks_list:
        rows.append(
            row_template.format(
                entry['type'],
                entry['chunk_size'],
                entry['overlap_size'],
                len(entry['chunks']),
            )
        )

    # Render the assembled table in the notebook
    display(Markdown("".join(rows)))


In [3]:
from langchain.schema.document import Document
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter 
def get_split_sentences(doc, splitter_instance):
    """Split a document with a sentence-based splitter and summarize the result.

    Args:
        doc: Langchain ``Document`` whose ``page_content`` is split.
        splitter_instance: An ``NLTKTextSplitter`` or ``SpacyTextSplitter``
            instance; its ``split_text`` method produces the chunks.

    Returns:
        dict with keys ``'type'`` (splitter class name), ``'chunk_size'``
        (rounded average chunk length in characters, 0 when there are no
        chunks), ``'overlap_size'`` (always 0), ``'num_chunks'`` and
        ``'chunks'``.

    Raises:
        AssertionError: If ``doc`` is None or not a ``Document``, or if
            ``splitter_instance`` is not one of the two supported splitters.
    """
    assert doc is not None, "The 'doc' parameter cannot be None. Please provide a valid Langchain document."
    assert isinstance(doc, Document), "The 'doc' parameter must be an instance of a Langchain Document."
    assert isinstance(splitter_instance, (NLTKTextSplitter, SpacyTextSplitter)), "The 'splitter_instance' parameter must be an instance of either NLTKTextSplitter or SpacyTextSplitter."
    class_name = splitter_instance.__class__.__name__
    print(class_name)
    chunks = splitter_instance.split_text(doc.page_content)
    # A document that yields no chunks (e.g. empty page content) would
    # otherwise divide by zero; report an average size of 0 instead.
    if chunks:
        average_chunk_size = round(sum(len(chunk) for chunk in chunks) / len(chunks))
    else:
        average_chunk_size = 0
    # No chunk or overlap size is passed in here, so the reported chunk size is
    # the measured average and the overlap is recorded as 0.
    return {
        'type': class_name,
        'chunk_size': average_chunk_size,
        'overlap_size': 0,
        'num_chunks': len(chunks),
        'chunks': chunks
    }