In [42]:
# Display the value of every top-level expression in a cell, not only the last
# one, so cells can show several results without explicit print() calls.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [43]:
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge

### Instantiate capability

In [44]:
# Create the Knowledge component that every later cell uses as `kn`.
# NOTE(review): presumably `from_memory` keeps the component's configuration in memory only — confirm.
kn = Knowledge.from_memory()

In [45]:
# Point the component at the source PDF and load it as a table; the recorded
# output shows a single row with one 'text' column.
tbl = kn.set_source_uri("source/Gen AI Best Practices.pdf").load_source_canonical()
# Set the parquet persist location (presumably the target of the later
# save_persist_canonical call — TODO confirm).
# NOTE(review): both paths are relative to the working directory — confirm
# where the notebook is expected to be launched from.
kn.set_persist_uri('./hadron/data/gen_ai_best_practice.parquet')

In [46]:
# Quick look at what was loaded: table dimensions, column names, and the
# first 300 characters of the first row's text.
tbl.shape
tbl.column_names
tbl.column('text').to_pylist()[0][:300]

(1, 1)

['text']

'Best Practices in Generative AI\nResponsible use and development \nin the modern workplace\n© Responsible AI Institute 2024  \nAll Rights Reserved | Do Not Use Without Permission\n\x0cExecutive Summary\nGenerative AI, a technology capable of producing realistic content in the form of text, images,\nsound, and'

### Tidy the text

In [47]:
# Tidy the raw text in three ordered passes; each pass receives the table
# produced by the previous one. Every entry is (pattern, replacement, options).
tidy_steps = (
    # newlines become spaces
    ('\n', ' ', {'intent_order': -1}),
    # double spaces become single spaces
    ('  ', ' ', {'intent_order': -1}),
    # insert a space after a full stop that runs straight into a capital letter
    (r'\.([A-Z])', r'. \1', {'is_regex': True, 'intent_order': -1}),
)
for pattern, replacement, options in tidy_steps:
    tbl = kn.tools.correlate_replace(tbl, 'text', pattern, replacement, **options)

### Profiling
#### discovery

In [7]:
# Profile the tidied text; the recorded output has one row per sentence with
# its sentence number and character, word and token counts.
profile = kn.tools.text_profiler(tbl)
profile.shape
# Show only the first five rows of the report.
kn.table_report(profile, head=5)

(392, 5)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,"Best Practices in Generative AI Responsible use and development in the modern workplace © Responsible AI Institute 2024 All Rights Reserved | Do Not Use Without Permission Executive Summary Generative AI, a technology capable of producing realistic content in the form of text, images, sound, and more, presents signiﬁcant opportunities and challenges for businesses today.",0,375,56,94
1,"With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative.",1,253,34,63
2,"Potential risks and harms from generative AI impact human rights, privacy, security, labor, fairness, sustainability, and more.",2,127,17,32
3,"Without investing effort to comprehensively address these issues across the enterprise, businesses are exposed to the risks of compliance penalties, consumer harm, loss of trust, damages, and more.",3,197,28,49
4,"To position themselves to responsibly capitalize on this potential, organizations must implement governance to pave the way for trustworthy AI deployment, procurement, sale, and use, as applicable.",4,197,27,49


In [39]:
# Summarise the per-sentence word counts: smallest, largest and average.
word_counts = profile['word_count']
print(f"min word count {pc.min(word_counts).as_py()}")
print(f"max word count {pc.max(word_counts).as_py()}")
print(f"mean word count {pc.mean(word_counts).as_py()}")

min word count 1


In [41]:
# Count the sentences that have three words or fewer.
word_counts = profile['word_count']
is_short = pc.less_equal(word_counts, 3)
short_total = pc.count(pc.filter(word_counts, is_short)).as_py()
print(f"sentence less than four words {short_total}")

sentence less than four words 17


#### clean

In [None]:
# NOTE(review): these two calls repeat the newline and double-space
# replacements already applied under "Tidy the text", and this cell has no
# execution count; it looks redundant — confirm it can be removed.
tbl = kn.tools.correlate_replace(tbl, 'text', '\n', ' ', intent_order=-1)
tbl = kn.tools.correlate_replace(tbl, 'text', '  ', ' ', intent_order=-1)

In [9]:
# Re-profile while dropping unwanted sentences. In the recorded output,
# sentence_num 0, 2 and 3 are gone (392 -> 389 rows), so the (2,4) tuple
# appears to act as a range that excludes its end value.
profile = kn.tools.text_profiler(tbl, to_drop=[0, (2,4)])
profile.shape
# Show the first four remaining rows to check the drop.
kn.table_report(profile, head=4)

(389, 5)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,"With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative.",1,253,34,63
1,"To position themselves to responsibly capitalize on this potential, organizations must implement governance to pave the way for trustworthy AI deployment, procurement, sale, and use, as applicable.",4,197,27,49
2,"Applying Responsible AI (RAI) frameworks to generative and other forms of AI across the organization can mitigate pressing risks and threats, allowing the technology’s potential to be maximized.",5,194,28,48
3,"The RAI Institute offers the following set of best practices for responsible generative AI use to guide AI practitioners, executive, and other professionals.",6,157,23,39


#### rebuild

In [10]:
# Rebuild the cleaned document as text: with as_text=True the recorded output
# is a single-row table with a 'text' column.
# NOTE(review): to_drop uses (2,5) here but (2,4) in the preceding profiling
# cell, and the recorded text below omits sentence 4 — confirm which range is
# intended.
clean_text = kn.tools.text_profiler(tbl, to_drop=[0, (2,5)], as_text=True)
clean_text.shape
# Preview the first 700 characters of the rebuilt text.
clean_text.column('text').to_pylist()[0][:700]

(1, 1)

'With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative. Applying Responsible AI (RAI) frameworks to generative and other forms of AI across the organization can mitigate pressing risks and threats, allowing the technology’s potential to be maximized. The RAI Institute offers the following set of best practices for responsible generative AI use to guide AI practitioners, executive, and other professionals. These guidelines include recommendations related to gathering the right teams and tools, trac'

### Chunking

In [11]:
# Split the cleaned text into chunks labelled 'GenAIBestPractice'; the recorded
# output has 39 chunks with seven columns.
# presumably num_sentence_chunk_size=10 means ten sentences per chunk — verify against text_chunk.
chunks = kn.tools.text_chunk(clean_text, text_name='GenAIBestPractice', num_sentence_chunk_size=10)
chunks.shape
chunks.column_names

(39, 7)

['text_name',
 'chunk_number',
 'sentence_chunk',
 'chunk_sentence_count',
 'chunk_char_count',
 'chunk_word_count',
 'chunk_token_count']

### Embedding

In [12]:
# Embed the chunks using the 'all-mpnet-base-v2' embedding name on the CPU.
# presumably batch_size=32 is the number of chunks encoded per batch — verify against chunk_embedding.
embedding = kn.tools.chunk_embedding(chunks, batch_size=32, embedding_name='all-mpnet-base-v2', device='cpu')

In [13]:
# Check the embedding's type and dimensions (recorded: a pyarrow Tensor of
# shape (39, 768), i.e. one row per chunk).
type(embedding)
embedding.shape

pyarrow.lib.Tensor

(39, 768)

In [14]:
# NOTE(review): the recorded run of this cell failed with
# "TypeError: Argument 'tensor' has incorrect type (expected pyarrow.lib.Tensor,
# got pyarrow.lib.Table)". Resolve the cause (or remove the cell) so the
# notebook survives Restart & Run All.
kn.run_component_pipeline()

TypeError: Argument 'tensor' has incorrect type (expected pyarrow.lib.Tensor, got pyarrow.lib.Table)

### Save Embedding

In [None]:
# Persist the embedding tensor.
# presumably written to the URI passed to set_persist_uri earlier — TODO confirm.
kn.save_persist_canonical(embedding)