<a target="_parent" href="https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/docs/notebooks/demo/rag/gretel-webinar-navigator-rag.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
# Directory (relative to the notebook's working directory) that holds the sample repository checkout.
local_dir = "aistudio-copilot-sample"
# Clone the sample repository only when that directory does not exist yet, so re-running this cell does not clone again.
![ -d $local_dir ] || git clone https://github.com/Azure/aistudio-copilot-sample.git $local_dir

In [None]:
# Install or upgrade the packages this notebook imports, with pip output silenced (-qq).
# NOTE(review): versions are unpinned and -U pulls the newest releases; consider pinning them for reproducible runs.
%pip install -Uqq gretel_client langchain tiktoken

### 1. Read and chunk dataset

In [None]:
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration used by find_files_by_extension.
CHUNK_SIZE = 1500  # passed to the splitter as chunk_size; measured in tokens
CHUNK_OVERLAP = 0  # passed to the splitter as chunk_overlap
MIN_CHUNK_CHARS = 2000  # chunks must be longer than this many characters to be kept

def find_files_by_extension(directory, extension):
    """Read every matching file under ``directory`` and split it into text chunks.

    Args:
        directory: Root directory to walk recursively.
        extension: File-name suffix to match, e.g. ".md".

    Returns:
        A list of ``[file_path, chunk]`` pairs, one for each chunk that is
        longer than ``MIN_CHUNK_CHARS`` characters.
    """
    # The splitter depends only on module-level settings, so build it once
    # rather than once per file.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    texts = []
    for root, dirs, files in os.walk(directory):
        # Sort in place so the traversal (and therefore which chunks come
        # first) does not depend on the filesystem's listing order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(extension):
                continue
            file_path = os.path.join(root, file)
            # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            for chunk in text_splitter.split_text(text):
                if len(chunk) > MIN_CHUNK_CHARS:
                    texts.append([file_path, chunk])
    return texts

# Point at the product-information markdown files inside the cloned repository.
target_extension = ".md"
target_path = Path(local_dir) / 'data' / '3-product-info'
texts = find_files_by_extension(target_path, target_extension)

# Every entry of `texts` is [file_path, chunk]: report distinct files and total chunks.
print(f"Found {len(set(path for path, _ in texts))} files. Extracted {len(texts)} chunks with tiktoken")

### 2. Create synthetic question-truth (Q-T) pairs with Gretel Navigator

In [None]:
from gretel_client import Gretel

# Create the Gretel client session used by the rest of the notebook.
# NOTE(review): api_key="prompt" presumably asks for the key interactively instead of hard-coding it — confirm against the gretel_client docs.
gretel = Gretel(api_key="prompt", cache=True, validate=True)

# Navigator client for tabular generation, backed by the named model.
# NOTE(review): confirm this backend model identifier is still offered by the service.
navigator = gretel.factories.initialize_navigator_api("tabular", backend_model="gretelai-google/gemini-pro")

In [None]:
# Base instruction for dataset creation
INTRO_PROMPT = (
    "From the source text below, create a dataset with the following columns:\n"
)

# Column specifications for question, context, and truth.
# Each column is one "* `name`: description" line terminated by a newline.
COLUMN_DETAILS = (
    "* `question`: Ask a set of unique questions related to the topic that a customer might ask. "
    "Questions should be relatively complex and specific enough to be addressed in a short answer.\n"
    # The original text ended with a stray " T" fragment after "derived."; it is removed here so it does not reach the model.
    "* `context`: Copy the exact sentence(s) from the source text and surrounding details from where the answer can be derived.\n"
    "* `truth`: Respond to the question with a clear, textbook quality answer that provides relevant details to fully address the question.\n"
)

# Combining all parts into the final prompt
PROMPT = (
    INTRO_PROMPT +
    COLUMN_DETAILS
)

# Optionally, print the prompt to verify its format
print(PROMPT)

In [None]:
# Separate diversification prompts: each adds one seed column that steers
# the variety of the generated questions.
SEED_PROMPT_1 = (
    "* `topic`: Select topics from Basic Information, Pricing and Warranty, Usage, Technical Details, "
    "Sustainability, Security, and Future Updates.\n"
)

SEED_PROMPT_2 = (
    "* `user_profile`: The complexity level of the question and truth, categorized into beginner, intermediate, and expert.\n"
    "  - Beginner users are about building foundational knowledge about the product and ask about basic features, benefits, and uses of the product.\n"
    "  - Intermediate users have a deep understanding of the product and are focusing on practical applications, comparisons with other products, and intermediate-level features and benefits.\n"
    "  - Expert users demonstrate in-depth technical knowledge, strategic application, and advanced troubleshooting. This level is for those who need to know the product inside and out, possibly for roles in sales, technical support, or product development.\n"
)

SEED_PROMPT_3 = (
    # "Choose" (imperative) replaces the original typo "Chose".
    "* `language`: This is the language in which the question and truth columns should be phrased. Choose from English, Dutch, French, and Spanish.\n"
)

# Combining all parts into the final prompt: the seed columns are listed
# before the question/context/truth columns.
PROMPT = (
    INTRO_PROMPT +
    SEED_PROMPT_1 +
    SEED_PROMPT_2 +
    SEED_PROMPT_3 +
    COLUMN_DETAILS
)

# Optionally, print the prompt to verify its format
print(PROMPT)


In [None]:
import pandas as pd

# Create synthetic records
MAX_DOCS = 1          # number of source chunks (entries of `texts`) to generate from
SAMPLES_PER_DOC = 10  # records requested per source chunk

GENERATE_PARAMS = {
    "num_records": SAMPLES_PER_DOC,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 40
}

# Generate one frame per source chunk and tag it with the file it came from.
# Frames are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop (which re-copies all rows on every iteration and
# starts from an empty frame).
doc_frames = []
for text in texts[:MAX_DOCS]:
    df_doc = navigator.generate(f"{PROMPT}\n\n{text[1]}", **GENERATE_PARAMS)
    doc_frames.append(df_doc.assign(file_path=text[0]))

# pd.concat raises on an empty list, so fall back to an empty frame when no chunks were available.
df = pd.concat(doc_frames, ignore_index=True) if doc_frames else pd.DataFrame()

pd.set_option('display.max_colwidth', 200)
if df.empty:
    print("No records were generated.")
else:
    # Sampling more rows than exist raises ValueError, so cap the preview at the number of rows generated.
    display(df.sample(n=min(5, len(df))))

### 3. Human Evaluation

In [None]:
### helper function

from ipywidgets import widgets, Layout, VBox, HBox, HTML
from IPython.display import display, clear_output

# Initialize global variables shared by the labeling callbacks below.
# Positional index into df of the row currently under review.
index = 0
# Rows accepted / rejected so far; both start empty with the same columns as df.
keep_df = pd.DataFrame(columns=df.columns)
discard_df = pd.DataFrame(columns=df.columns)
# Set to True by the Finished and Keep All handlers to end labeling early.
finished_labeling = False

def display_title():
    """Render the styled heading shown above the labeling controls."""
    heading_markup = """
    <style>
        .title {
            font-family: Arial, sans-serif;
            color: #2E86C1;
            font-size: 18px;
        }
    </style>
    <div class="title">Human in the Loop Validation Stage</div>
    """
    heading = HTML(heading_markup)
    display(heading)

def update_progress():
    """Build a label summarising the current position and the keep/discard tallies.

    Returns:
        A ``widgets.Label`` reading "Progress: i/n | Kept: k | Discarded: d".
    """
    current, total = index + 1, len(df)
    kept, discarded = len(keep_df), len(discard_df)
    return widgets.Label(value=f"Progress: {current}/{total} | Kept: {kept} | Discarded: {discarded}")

def create_buttons():
    """Build the row of labeling buttons and attach their click handlers.

    Returns:
        An ``HBox`` holding the Keep, Discard, Finished and Keep All buttons,
        in that order.
    """
    # The three standard actions share the same plain 100px-wide look.
    keep_button, discard_button, finished_button = [
        widgets.Button(description=caption, layout=Layout(width='100px'))
        for caption in ("Keep", "Discard", "Finished")
    ]
    # "Keep All" is set apart with a left margin and a green fill.
    keep_all_button = widgets.Button(
        description="Keep All",
        layout=Layout(width='100px', margin='0 0 0 20px'),
        style={'button_color': 'lightgreen'},
    )

    # Pair each button with its handler, in display order.
    wiring = [
        (keep_button, on_keep_button_click),
        (discard_button, on_discard_button_click),
        (finished_button, on_finished_button_click),
        (keep_all_button, on_keep_all_button_click),
    ]
    for button, handler in wiring:
        button.on_click(handler)

    return HBox([button for button, _ in wiring])

def on_keep_button_click(b):
    """Record the current row as kept, advance to the next row and redraw the UI.

    Args:
        b: The clicked button, as passed by the widget's click callback (unused).
    """
    global keep_df, index
    # A click can still arrive from an already-replaced button once labeling
    # is over (e.g. a double-click on the last row). Ignore it instead of
    # indexing past the end of df or labeling after "Finished".
    if finished_labeling or index >= len(df):
        return
    keep_df = pd.concat([keep_df, df.iloc[[index]]], ignore_index=True)
    index += 1
    human_evaluate()

def on_discard_button_click(b):
    """Record the current row as discarded, advance to the next row and redraw the UI.

    Args:
        b: The clicked button, as passed by the widget's click callback (unused).
    """
    global discard_df, index
    # A click can still arrive from an already-replaced button once labeling
    # is over (e.g. a double-click on the last row). Ignore it instead of
    # indexing past the end of df or labeling after "Finished".
    if finished_labeling or index >= len(df):
        return
    discard_df = pd.concat([discard_df, df.iloc[[index]]], ignore_index=True)
    index += 1
    human_evaluate()

def on_finished_button_click(b):
    """End labeling early: set the finished flag and redraw the UI.

    Rows not yet reviewed are added to neither keep_df nor discard_df.

    Args:
        b: The clicked button, as passed by the widget's click callback (unused).
    """
    global finished_labeling
    finished_labeling = True
    human_evaluate()

def on_keep_all_button_click(b):
    """Accept every record at once and end labeling.

    keep_df is replaced by a copy of the whole of df; discard_df is left as
    it is.

    Args:
        b: The clicked button, as passed by the widget's click callback (unused).
    """
    global keep_df, finished_labeling
    finished_labeling = True
    keep_df = df.copy()
    # Redraw so the UI shows the finished state.
    human_evaluate()

def display_row_details(row):
    """Build a widget listing every column of the row under review.

    Args:
        row: Single-row DataFrame, as produced by ``df.iloc[[index]]``.

    Returns:
        A ``VBox`` with one HTML widget per column, each formatted as
        "<b>column:</b> value".
    """
    from html import escape  # local import keeps this helper self-contained

    record = row.iloc[0]
    # Cell values are generated text shown through an HTML widget. Escape them
    # (and the column names) so characters such as "<" or "&" are displayed
    # literally instead of being parsed as markup.
    text_widgets = [
        widgets.HTML(value=f"<b>{escape(str(col))}:</b> {escape(str(record[col]))}")
        for col in row.columns
    ]
    return VBox(text_widgets)

def human_evaluate():
    """Redraw the labeling UI: the current row with its controls, or the completion notice."""
    clear_output(wait=True)
    display_title()

    # Labeling continues only while rows remain and the reviewer has not ended it early.
    still_labeling = index < len(df) and not finished_labeling
    if not still_labeling:
        display(HTML("<h3>Labeling Complete!</h3>"))
        return

    current_row = df.iloc[[index]]
    panel = VBox([
        update_progress(),
        create_buttons(),
        display_row_details(current_row),
    ])
    display(panel)

# Initial call to start the human evaluation process: draws the first row with its
# controls, or the completion notice straight away when df has no rows.
human_evaluate()



### 4. Writing out Q-T pairs

In [None]:
import json

jsonl_file_path = 'gretel_samples.jsonl'

# Keep only the 'question' and 'truth' columns for export
keep_df_filtered = keep_df[['question', 'truth']]

# Export the filtered DataFrame to JSONL, one JSON object per line.
# The file is written as UTF-8 with ensure_ascii=False so that it matches how
# it is read back below and non-English text stays human-readable on disk.
with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
    for record in keep_df_filtered.to_dict(orient='records'):
        jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')

# Read and print the JSONL file contents with nice formatting
with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        json_obj = json.loads(line)
        # ensure_ascii=False prints Unicode characters as-is rather than as \u escapes
        print(json.dumps(json_obj, indent=4, ensure_ascii=False))