In [1]:
# Display the value of every top-level expression in a cell (not only the last one),
# so intermediate results can be shown without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge

### Instantiate capability

In [3]:
# Create the Knowledge instance (`kn`) that the later cells use for loading, tools and persistence.
# NOTE(review): presumably from_memory() keeps its configuration in memory only — confirm against nn_rag docs.
kn = Knowledge.from_memory()

In [4]:
# Point the component at the source PDF and load its content into `tbl`.
tbl = kn.set_source_uri("source/Gen AI Best Practices.pdf").load_source_canonical()
# Register the parquet location for persisted output.
# NOTE(review): presumably this is the target of the later save/load_persist_canonical() calls — confirm.
kn.set_persist_uri('./hadron/data/gen_ai_best_practice.parquet')

In [5]:
# Inspect what was loaded: table dimensions, column names, and the first 300 characters
# of the first row of the 'text' column.
tbl.shape
tbl.column_names
tbl.column('text').to_pylist()[0][:300]

(1, 1)

['text']

'Best Practices in Generative AI\nResponsible use and development \nin the modern workplace\n© Responsible AI Institute 2024  \nAll Rights Reserved | Do Not Use Without Permission\n\x0cExecutive Summary\nGenerative AI, a technology capable of producing realistic content in the form of text, images,\nsound, and'

### Tidy the text

In [6]:
# Flatten the text: replace newlines with spaces, then replace double spaces with a single space.
# NOTE(review): the form feed ('\x0c') visible in the raw text preview is not handled here, and a
# single pass over '  ' may leave runs of three or more spaces only partly collapsed — confirm
# whether a regex such as r'\s+' (with is_regex=True) would be more appropriate.
tbl = kn.tools.pattern_replace(tbl, 'text', '\n', ' ')
tbl = kn.tools.pattern_replace(tbl, 'text', '  ', ' ')

### Profiling
#### discovery

In [7]:
# Profile the text into a table of sentences with per-sentence character, word and token counts,
# then show its dimensions and the first four rows.
sentences = kn.tools.text_profiler(tbl)
sentences.shape
kn.table_report(sentences, head=4)

(392, 5)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,"Best Practices in Generative AI Responsible use and development in the modern workplace © Responsible AI Institute 2024 All Rights Reserved | Do Not Use Without Permission Executive Summary Generative AI, a technology capable of producing realistic content in the form of text, images, sound, and more, presents signiﬁcant opportunities and challenges for businesses today.",0,375,56,94
1,"With generative AI (GenAI) applications ranging from customer service automation to content creation, the recent explosive adoption of LLM technologies like ChatGPT underscores the potential transformative scale of AI impact, both positive and negative.",1,253,34,63
2,"Potential risks and harms from generative AI impact human rights, privacy, security, labor, fairness, sustainability, and more.",2,127,17,32
3,"Without investing effort to comprehensively address these issues across the enterprise, businesses are exposed to the risks of compliance penalties, consumer harm, loss of trust, damages, and more.",3,197,28,49


In [8]:
# Report the smallest, largest and (rounded) average sentence length in words,
# one statistic per output line.
print(
    f"min word count {pc.min(sentences['word_count']).as_py()}",
    f"max word count {pc.max(sentences['word_count']).as_py()}",
    f"mean word count {round(pc.mean(sentences['word_count']).as_py())}",
    sep="\n",
)

min word count 1
max word count 188
mean word count 25


In [9]:
# Count the sentences whose word_count is 2 or less; these are candidates for removal.
print(f"sentence with 2 or less words {pc.count(pc.filter(sentences['word_count'], pc.less_equal(sentences['word_count'], 2))).as_py()}")

sentence with 2 or less words 16


#### check small sentences

In [10]:
# Select the sentences with word_count <= 2 so they can be inspected before deciding what to drop.
# NOTE(review): the meaning of the third tuple element (None) is not visible here — confirm against nn_rag docs.
short = kn.tools.filter_on_condition(sentences, header='word_count', condition=(2, 'less_equal', None))

In [11]:
# Preview the first five short sentences.
kn.table_report(short, head=5)

Unnamed: 0,sentence,sentence_num,char_count,word_count,token_count
0,2.0,78,2,1,0
1,3.0,80,2,1,0
2,4.0,82,2,1,0
3,5.0,84,2,1,0
4,1.0,332,2,1,0


It looks like these are reference numbers that might compromise our paragraph chunks. At this point we can invert the filter and keep only the sentences with more than one word, which removes the reference numbers.

In [12]:
# Keep only sentences with more than one word, which drops the single-token reference numbers.
# NOTE(review): the inspection cell used word_count <= 2, whereas this keeps two-word sentences —
# confirm the threshold of 1 is intentional.
sentences = kn.tools.filter_on_condition(sentences, header='word_count', condition=(1, 'greater', None))

Alternatively, we could remove specific sentences or groups of sentences that are not relevant to the rest of the text.
An example of this might be a PDF with an introductory preface and index, where the first 40 pages should be removed.

### Chunking

In [13]:
# Group the remaining sentences into chunks of five sentences each, then show the table dimensions
# and the first four rows of chunk statistics (the report omits the chunk_text column).
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size=5)
chunks.shape
kn.table_report(chunks.drop_columns('chunk_text'), head=4)

(76, 6)

Unnamed: 0,chunk_number,chunk_sentence_count,chunk_char_count,chunk_word_count,chunk_token_count
0,0,5,1153,162,288
1,1,5,698,97,174
2,2,5,512,79,128
3,3,5,937,132,234


In [14]:
# Report the token-count range, (rounded) average and total across all chunks,
# one statistic per output line.
print(
    f"min chunk token count {pc.min(chunks['chunk_token_count']).as_py()}",
    f"max chunk token count {pc.max(chunks['chunk_token_count']).as_py()}",
    f"mean chunk token count {round(pc.mean(chunks['chunk_token_count']).as_py())}",
    f"total tokens {pc.sum(chunks['chunk_token_count']).as_py()}",
    sep="\n",
)

min chunk token count 28
max chunk token count 486
mean chunk token count 228
total tokens 17345


#### ensure paragraphs are properly formed

In [15]:
# Insert a space after any full stop that is immediately followed by a capital letter (".A" -> ". A").
# NOTE(review): presumably this repairs sentences that were joined without a space; it would also
# rewrite abbreviations such as "U.S." — confirm that is acceptable.
chunks = kn.tools.pattern_replace(chunks, 'chunk_text', r'\.([A-Z])', r'. \1', is_regex=True)
# The replacement should leave the table dimensions unchanged.
chunks.shape

(76, 6)

### Embedding

In [16]:
# Embed the chunks with the 'all-mpnet-base-v2' model on the CPU, using a batch size of 32.
# The same embedding_name and device are passed again when the query is scored.
embedding = kn.tools.chunk_embedding(chunks, batch_size=32, embedding_name='all-mpnet-base-v2', device='cpu')

In [17]:
# Check the type and dimensions of the embedding result.
type(embedding)
embedding.shape

pyarrow.lib.Tensor

(76, 768)

--------------------

### Save Embedding

In [18]:
# Persist the embedding.
# NOTE(review): presumably written to the location given to set_persist_uri() earlier — confirm.
kn.save_persist_canonical(embedding)

### Query Score

In [19]:
# The question to score against the chunk embeddings.
query = 'what are the key best practices'

In [20]:
# Load the persisted data back into `tensor`, which is what the query is scored against.
# NOTE(review): presumably this returns the embedding saved by save_persist_canonical() — confirm.
tensor = kn.load_persist_canonical()

In [21]:
# Score the query against the loaded embedding, requesting the top five matches.
# `scores` and `indices` are notebook-level variables that print_top_results_and_scores() reads.
# The embedding_name and device match those used when the chunks were embedded.
scores, indices = kn.tools.score_embedding(tensor, query=query, topk=5,
                                          embedding_name='all-mpnet-base-v2', device='cpu')

In [23]:
import textwrap
import pyarrow as pa

def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed into lines of at most ``wrap_length`` characters."""
    print("\n".join(textwrap.wrap(text, width=wrap_length)))
    
def print_top_results_and_scores(query: str, chunks: pa.Table, scores=None, indices=None):
    """Print the query, then the score, chunk index and wrapped text of each top result.

    Args:
        query: The query text that was scored against the embedding.
        chunks: Table whose ``chunk_text`` column holds the chunk text; each entry of
            ``indices`` is used as a row position into that column.
        scores: Similarity scores, one per result. When omitted, the notebook-level
            ``scores`` variable is used, which is what the original version always did.
        indices: Row positions of the matching chunks, parallel to ``scores``. When
            omitted, the notebook-level ``indices`` variable is used.
    """
    # Backward-compatible fallback: existing calls pass only query and chunks and rely on
    # `scores` / `indices` having been created by an earlier cell.
    if scores is None:
        scores = globals()["scores"]
    if indices is None:
        indices = globals()["indices"]

    # Only the text column is needed, so avoid converting every column to Python objects.
    chunk_texts = chunks.column("chunk_text").to_pylist()

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indices
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Show which chunk matched so a result can be traced back to its row in `chunks`.
        print(f"Index: {index}")
        print_wrapped(chunk_texts[index])
        print("\n")

In [24]:
# Show the top-scoring chunks for the query.
print_top_results_and_scores(query=query, chunks=chunks)

Query: what are the key best practices

Results:
Score: 0.4253
Index: 15
These best practices should be implemented in parallel where appropriate by
diverse, inclusive, and cross-functional teams. These best practices are grouped
into ﬁve categories of Responsible Generative AI: 1. Strategy: This encompasses
Planning, Policies, and Governance, ensuring that the organization's GenAI
initiatives are well-aligned with its overall goals and compliant with relevant
regulations. Workforce: Focuses on Training, Education, and Upskilling,
equipping employees with the necessary knowledge and skills to effectively
engage with GenAI technologies. Capacity: Relates to Resourcing and Tools,
addressing the need for adequate resources and tools to support GenAI
development and deployment.


Score: 0.3860
Index: 40
● Performance metrics: Deﬁne and track relevant performance metrics to
quantitatively assess the model's progress and compare different iterations. ●
Documentation: Thoroughly document the 