<a href="https://colab.research.google.com/github/spencer18001/llm_zoomcamp_project_2024/blob/main/ground_truth_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install semchunk tiktoken google-generativeai
!wget -O The_Adventure_of_the_Speckled_Band.txt https://github.com/spencer18001/llm_zoomcamp_project_2024/blob/main/The_Adventure_of_the_Speckled_Band.txt?raw=1

In [None]:
import json
from tqdm.auto import tqdm
import pandas as pd
import semchunk
import tiktoken
from google.colab import userdata
import google.generativeai as genai

In [None]:
# Read the story and split it into small chunks; each chunk becomes one
# "document" that questions are later generated for.
data_file_path = "The_Adventure_of_the_Speckled_Band.txt"
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding.
with open(data_file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Token-based chunker using the gpt-4o tokenizer; 100 is the chunk size handed
# to semchunk.
chunker = semchunk.chunkerify(tiktoken.encoding_for_model("gpt-4o"), 100)
chunks = chunker(content)

# One dict per chunk; the 'text' key matches the {text} placeholder that the
# prompt template is formatted with.
docs = [{'text': chunk} for chunk in tqdm(chunks)]

len(docs)

  0%|          | 0/162 [00:00<?, ?it/s]

162

In [None]:
# Instruction prompt sent to the model for every chunk. The `{text}`
# placeholder is filled in with str.format; the wording is sent verbatim.
prompt_template = "\n".join([
    "You emulate a reader who's taking this story.",
    "Formulate 5 questions this reader might ask based on a record. The record",
    "should contain the answer to the questions, and the questions should be complete and not too short.",
    "If possible, use as fewer words as possible from the record.",
    "",
    "The record:",
    "",
    "text: {text}",
    "",
    "Provide the output in parsable JSON without using code blocks:",
    "",
    '["question1", "question2", ..., "question5"]',
])

In [None]:
# Configure the Gemini client with the key returned by Colab's `userdata`
# store, then create the model handle used for all generation calls below.
genai.configure(api_key=userdata.get('GEMINI_API_KEY'))
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

In [None]:
# Display the first chunk as a quick sanity check of the chunking step.
docs[0]

{'text': 'N glancing over my notes of the seventy odd cases in which I have during the last eight years studied the methods of my friend Sherlock Holmes, I find many tragic, some comic, a large number merely strange, but none commonplace; for, working as he did rather for the love of his art than for the acquirement of wealth, he refused to associate himself with any investigation which did not tend towards the unusual, and even the fantastic. Of all these varied cases, however, I cannot recall'}

In [None]:
# Try the prompt on the first chunk before processing every document.
prompt = prompt_template.format(**(docs[0]))

# Each of the four listed harm categories gets the BLOCK_NONE threshold.
response = model.generate_content(
    prompt,
    safety_settings=[
        {"category": harm_category, "threshold": "BLOCK_NONE"}
        for harm_category in (
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_HARASSMENT",
        )
    ],
)
response.text

'[\n"How many cases has the narrator studied involving Sherlock Holmes?",\n"What is the primary reason Sherlock Holmes chooses to investigate a case?",\n"What kind of cases does Sherlock Holmes typically refuse to investigate?",\n"What is the range of cases that the narrator has studied?",\n"What is the narrator\'s opinion of the cases involving Sherlock Holmes?"\n]'

In [None]:
# Harm categories whose threshold is set to BLOCK_NONE on every request.
_HARM_CATEGORIES = (
    "HARM_CATEGORY_DANGEROUS_CONTENT",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_HARASSMENT",
)


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Text that is not wrapped in a matching pair of triple-backtick fences is
    returned unchanged.
    """
    cleaned = text.strip()
    if len(cleaned) < 6 or not (cleaned.startswith("```") and cleaned.endswith("```")):
        return text
    inner = cleaned[3:-3]
    # The opening fence may carry a language tag such as "json" on its own line.
    first_line, newline, remainder = inner.partition("\n")
    if newline and first_line.strip().isalnum():
        inner = remainder
    return inner.strip()


def generate_questions(doc, llm=None, template=None):
    """Ask the model for questions about one document and return its raw reply.

    Args:
        doc: Mapping whose keys fill the placeholders of the prompt template
            (the notebook's documents carry a single ``'text'`` key).
        llm: Object exposing ``generate_content(prompt, safety_settings=...)``.
            Defaults to the notebook-level ``model``.
        template: ``str.format`` template for the prompt. Defaults to the
            notebook-level ``prompt_template``.

    Returns:
        The response text as a string. The prompt asks for bare JSON, but if
        the reply is nevertheless wrapped in a Markdown code fence the fence is
        removed so the string can be passed straight to ``json.loads``.
    """
    # Fall back to the notebook globals only when no override is supplied.
    llm = model if llm is None else llm
    template = prompt_template if template is None else template

    safety_settings = [
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in _HARM_CATEGORIES
    ]
    prompt = template.format(**doc)
    response = llm.generate_content(prompt, safety_settings=safety_settings)
    return _strip_code_fence(response.text)

In [None]:
# Raw model responses keyed by document index. Because this lives in its own
# cell, re-running the generation loop does not reset what was already stored.
results = {}

In [None]:
# Query the model once per chunk. Indices already present in `results` are
# left untouched, so an interrupted run can be resumed by re-running this cell.
for i, doc in enumerate(tqdm(docs)):
    if i not in results:
        results[i] = generate_questions(doc)

  0%|          | 0/162 [00:00<?, ?it/s]

In [None]:
# Number of documents that have a stored response.
len(results)

162

In [None]:
# Decode each raw JSON reply into a Python list of questions.
# The variable name (sic) is kept as-is because the flattening cell reads it.
parsed_resulst = {
    doc_id: json.loads(raw_reply)
    for doc_id, raw_reply in results.items()
}

len(parsed_resulst)

162

In [None]:
# Flatten to one (question, document index) pair per generated question.
final_results = [
    (question, doc_id)
    for doc_id, doc_questions in parsed_resulst.items()
    for question in doc_questions
]
len(final_results)

810

In [None]:
# Tabulate the (question, document index) pairs and write them to disk
# without the DataFrame index column.
df = pd.DataFrame(data=final_results, columns=['question', 'document'])
df.to_csv(path_or_buf='ground-truth-data.csv', index=False)

In [None]:
# Preview the first lines of the exported CSV.
!head ground-truth-data.csv

question,document
What kind of cases did Sherlock Holmes typically refuse to work on?,0
What motivated Sherlock Holmes to choose the cases he did?,0
How long did the narrator study Sherlock Holmes' methods?,0
What is the narrator's perspective on the cases they studied?,0
What is the most common characteristic of the cases the narrator studied?,0
What unique characteristics did the Roylott family of Stoke Moran have?,1
When did the events in the story take place?,1
Where were Holmes and the narrator living at the time?,1
Has the narrator shared this story before?,1
