In [None]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge

### Instantiate capability

In [None]:
kn = Knowledge.from_memory()

In [None]:
uri = "source/Gen AI Best Practices.pdf"
tbl = kn.set_source_uri(uri).load_source_canonical()

In [None]:
tbl.shape
tbl.column_names
tbl.column('text').to_pylist()[0][:300]

### Tidy the text

In [None]:
tbl = kn.tools.correlate_replace(tbl, 'text', '\n', ' ', intent_order=-1)
tbl = kn.tools.correlate_replace(tbl, 'text', '  ', ' ', intent_order=-1)

### Profling

In [None]:
profile = kn.tools.text_profiler(tbl)
profile.shape
kn.table_report(profile, head=4)

In [None]:
profile = kn.tools.text_profiler(tbl, to_drop=[0, (2,5)])
profile.shape
kn.table_report(profile, head=4)

In [12]:
clean_text = kn.tools.text_profiler(tbl, to_drop=[0, (2,5)], as_text=True)
clean_text.shape
clean_text.column('text').to_pylist()[0][:700]

(1, 1)

'With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative. Applying Responsible AI (RAI) frameworks to generative and other forms of AI across the organization can mitigate pressing risks and threats, allowing the technology’s potential to be maximized. The RAI Institute offers the following set of best practices for responsible generative AI use to guide AI practitioners, executive, and other professionals. These guidelines include recommendations related to gathering the right teams and tools, trac'

### Chunking

In [16]:
chunks = kn.tools.text_chunk(clean_text, text_name='GenAIBestPractice', num_sentence_chunk_size=10)
chunks.shape
chunks.column_names

(39, 7)

['text_name',
 'chunk_number',
 'sentence_chunks',
 'chunk_sentence_count',
 'chunk_char_count',
 'chunk_word_count',
 'chunk_token_count']

### Embedding

In [17]:
embedding = kn.tools.chunk_embedding(chunks, batch_size=32, embedding_name='all-mpnet-base-v2', device='cpu')

KeyError: 'sentence_chunk'

In [18]:
type(embedding)
embedding.shape

NameError: name 'embedding' is not defined

In [None]:
kn.run_component_pipeline()

### Reports

In [None]:
kn.report_intent()

In [None]:
kn.report_connectors()

In [None]:
kn.report_task()