In [1]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge

### Instantiate capability

In [3]:
kn = Knowledge.from_memory()

In [4]:
tbl = kn.set_source_uri("hadron/source/Gen AI Best Practices.pdf").load_source_canonical()
kn.set_persist_uri('./hadron/data/gen_ai_best_practice.parquet')
kn.add_connector_uri('chunks', './hadron/data/gen_ai_best_practice_chunks.parquet')

<nn_rag.components.knowledge.Knowledge at 0x7fb9ada967d0>

In [5]:
tbl.shape
tbl.column_names
tbl.column('text').to_pylist()[0][:300]

(1, 1)

['text']

'Best Practices in Generative AI\nResponsible use and development \nin the modern workplace\n© Responsible AI Institute 2024  \nAll Rights Reserved | Do Not Use Without Permission\n\x0cExecutive Summary\nGenerative AI, a technology capable of producing realistic content in the form of text, images,\nsound, and'

### Tidy the text

In [6]:
tbl = kn.tools.pattern_replace(tbl, 'text', '\n', ' ')
tbl = kn.tools.pattern_replace(tbl, 'text', '  ', ' ')

### Profling
#### discovery

In [7]:
sentences = kn.tools.text_profiler(tbl)
sentences.shape
kn.table_report(sentences, head=4)

(392, 5)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,"Best Practices in Generative AI Responsible use and development in the modern workplace © Responsible AI Institute 2024 All Rights Reserved | Do Not Use Without Permission Executive Summary Generative AI, a technology capable of producing realistic content in the form of text, images, sound, and more, presents signiﬁcant opportunities and challenges for businesses today.",0,375,56,94
1,"With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative.",1,253,34,63
2,"Potential risks and harms from generative AI impact human rights, privacy, security, labor, fairness, sustainability, and more.",2,127,17,32
3,"Without investing effort to comprehensively address these issues across the enterprise, businesses are exposed to the risks of compliance penalties, consumer harm, loss of trust, damages, and more.",3,197,28,49


In [8]:
print(f"min word count {pc.min(sentences['word_count']).as_py()}")
print(f"max word count {pc.max(sentences['word_count']).as_py()}")
print(f"mean word count {round(pc.mean(sentences['word_count']).as_py())}")

min word count 1
max word count 188
mean word count 25


In [9]:
print(f"sentence with 2 or less words {pc.count(pc.filter(sentences['word_count'], pc.less_equal(sentences['word_count'], 2))).as_py()}")

sentence with 2 or less words 16


#### check small sentences

In [10]:
short = kn.tools.filter_on_condition(sentences, header='word_count', condition=(2, 'less_equal', None))

In [11]:
kn.table_report(short, head=5)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,2.0,78,2,1,0
1,3.0,80,2,1,0
2,4.0,82,2,1,0
3,5.0,84,2,1,0
4,1.0,332,2,1,0


It looks like these are reference numbers that might compromise our paragraph chunks. At this point we could reverse the condition and remove the reference numbers.

In [12]:
sentences = kn.tools.filter_on_condition(sentences, header='word_count', condition=(1, 'greater', None))

Alternatively we could look at removing specific sentences or groups of sentences that are not relevant to all text.
An example of this might be a PDF with an introductory pre-face and indexing where the first 40 pages should be removed.

In [13]:
kn.table_report(sentences, head=6)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,"Best Practices in Generative AI Responsible use and development in the modern workplace © Responsible AI Institute 2024 All Rights Reserved | Do Not Use Without Permission Executive Summary Generative AI, a technology capable of producing realistic content in the form of text, images, sound, and more, presents signiﬁcant opportunities and challenges for businesses today.",0,375,56,94
1,"With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative.",1,253,34,63
2,"Potential risks and harms from generative AI impact human rights, privacy, security, labor, fairness, sustainability, and more.",2,127,17,32
3,"Without investing effort to comprehensively address these issues across the enterprise, businesses are exposed to the risks of compliance penalties, consumer harm, loss of trust, damages, and more.",3,197,28,49
4,"To position themselves to responsibly capitalize on this potential, organizations must implement governance to pave the way for trustworthy AI deployment, procurement, sale, and use, as applicable.",4,197,27,49
5,"Applying Responsible AI (RAI) frameworks to generative and other forms of AI across the organization can mitigate pressing risks and threats, allowing the technology’s potential to be maximized.",5,194,28,48


In [14]:
sentences = kn.tools.sentence_removal(sentences, indices=[(0,3), 4])

In [15]:
kn.table_report(sentences, head=4)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,"Without investing effort to comprehensively address these issues across the enterprise, businesses are exposed to the risks of compliance penalties, consumer harm, loss of trust, damages, and more.",3,197,28,49
1,"Applying Responsible AI (RAI) frameworks to generative and other forms of AI across the organization can mitigate pressing risks and threats, allowing the technology’s potential to be maximized.",5,194,28,48
2,"The RAI Institute offers the following set of best practices for responsible generative AI use to guide AI practitioners, executive, and other professionals.",6,157,23,39
3,"These guidelines include recommendations related to gathering the right teams and tools, tracking legal requirements, evolving the workforce, and implementing clear objectives and requirements for generative AI.",7,211,27,53


### Chunking

In [16]:
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size=5)
chunks.shape
kn.table_report(chunks.drop_columns('chunk_text'), head=4)

(75, 6)

Unnamed: 0,chunk_number,chunk_sentence_count,chunk_char_count,chunk_word_count,chunk_token_count
0,0,5,848,119,212
1,1,5,443,63,111
2,2,5,853,129,213
3,3,5,747,103,187


In [17]:
print(f"min chunk token count {pc.min(chunks['chunk_token_count']).as_py()}")
print(f"max chunk token count {pc.max(chunks['chunk_token_count']).as_py()}")
print(f"mean chunk token count {round(pc.mean(chunks['chunk_token_count']).as_py())}")
print(f"total tokens {pc.sum(chunks['chunk_token_count']).as_py()}")

min chunk token count 78
max chunk token count 468
mean chunk token count 228
total tokens 17110


#### ensure paragraphs are properly formed

In [18]:
chunks = kn.tools.pattern_replace(chunks, 'chunk_text', r'\.([A-Z])', r'. \1', is_regex=True)
chunks.shape

(75, 6)

#### save chunks

In [19]:
kn.save_canonical('chunks', chunks)

### Embedding

In [20]:
embedding = kn.tools.chunk_embedding(chunks, batch_size=32, embedding_name='all-mpnet-base-v2', device='cpu')

In [21]:
type(embedding)
embedding.shape

pyarrow.lib.Tensor

(75, 768)

#### save embedding

In [22]:
kn.save_persist_canonical(embedding)

### Query Score

In [23]:
query = 'what are the key best practices'

In [24]:
tensor = kn.load_persist_canonical()

In [25]:
scores, indices = kn.tools.score_embedding(tensor, query=query, topk=5,
                                          embedding_name='all-mpnet-base-v2', device='cpu')

In [26]:
import textwrap
import pyarrow as pa

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)
    
def print_top_results_and_scores(query: str, chunks: pa.Table):
    chunk_dict = chunks.to_pylist()
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        print_wrapped(chunk_dict[index]["chunk_text"])
        print("\n")

In [27]:
print_top_results_and_scores(query=query, chunks=chunks)

Query: what are the key best practices

Results:
Score: 0.4008
● Standards are a category of “soft law” mechanisms that are a useful complement
to the “hard law” of AI laws and regulation that can serve as guidance for an
organization’s practices, such as ISO/IEC 42001 AI Management Standard, the
world’s ﬁrst AI management system standard. Check out the RAI Institute AI
Regulatory Tracker on Airtable here. Clearly define objectives, use cases, and
requirements of GenAI solutions Articulating the goals and vision driving
investment in GenAI solutions helps the organization align efforts with business
goals and support eﬃcient resource allocation. Involving key stakeholders across
the company helps gather diverse perspectives on current and potential use
cases, prioritizing high-impact opportunities. ● Involve key stakeholders to
gather diverse perspectives and prioritize high-impact opportunities.


Score: 0.3988
● Output validation: Validate the generated outputs at each iteration to e