In [None]:
!pip install pypdf
!pip install openai==0.28
!pip install chromadb
!pip install typing
!pip install sentence-transformers
!pip install tiktoken

import os
import re
from typing import List, Tuple
from pypdf import PdfReader
import openai
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.api.types import Documents, Embeddings, EmbeddingFunction
import tiktoken
import csv
import time
import random

# Load the PDF file and extract text from each page
def load_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Pages whose extraction yields nothing are skipped. The remaining page
    texts are concatenated back to back, with no separator between pages.
    """
    extracted_pages = (page.extract_text() for page in PdfReader(file_path).pages)
    return "".join(page_text for page_text in extracted_pages if page_text)

# Split the text into chunks based on double newlines
def split_text(text, max_tokens_per_chunk=500, tokenizer=None):
    """Group the paragraphs of ``text`` into chunks bounded by a token budget.

    Paragraphs are the pieces of ``text`` separated by a blank line
    (``"\\n\\n"``). Consecutive paragraphs are accumulated into one chunk for
    as long as the accumulated text plus the next paragraph encodes to fewer
    than ``max_tokens_per_chunk`` tokens; otherwise the accumulated chunk is
    emitted and the paragraph starts a new one.

    Args:
        text: The full document text.
        max_tokens_per_chunk: Exclusive upper bound on the token count used
            when deciding whether another paragraph still fits.
        tokenizer: Optional object exposing ``encode(str)`` that returns a
            sized sequence of tokens. Defaults to tiktoken's encoding for
            "gpt-4".

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        A single paragraph that is over the budget on its own is not split
        further; it is returned as one oversized chunk.
    """
    if tokenizer is None:
        tokenizer = tiktoken.encoding_for_model("gpt-4")

    chunks = []
    current_chunk = ""

    for paragraph in re.split('\n\n', text):
        if not paragraph.strip():
            continue  # whitespace-only paragraphs carry no content

        if len(tokenizer.encode(current_chunk + paragraph)) < max_tokens_per_chunk:
            current_chunk += paragraph + "\n\n"
            continue

        # The paragraph does not fit. Flush what has been accumulated, but
        # only if there is something to flush: when the very first paragraph
        # is already over budget the accumulator is still empty and must not
        # produce an empty chunk.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = paragraph + "\n\n"

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

# Define a custom embedding function using SentenceTransformer
class CustomEmbeddingFunction(EmbeddingFunction):
    """Embedding function that encodes documents with the 'all-MiniLM-L6-v2' SentenceTransformer."""

    def __init__(self):
        # The model is loaded once here and reused by every call.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def __call__(self, input: Documents) -> Embeddings:
        """Encode ``input`` with the model and return the result of its ``tolist()``."""
        encoded = self.model.encode(input)
        return encoded.tolist()

# Create and populate Chroma database
def create_chroma_db(documents: List[str], name: str, db_folder: str = "/content/chroma_db"):
    """Create a fresh persistent Chroma collection and fill it with ``documents``.

    Any existing collection called ``name`` is deleted first, so the returned
    collection contains exactly the given documents. Document ``i`` is stored
    under the id ``str(i)``.

    Args:
        documents: The text chunks to store, in order.
        name: Name of the collection to (re)create.
        db_folder: Directory in which the persistent client keeps its data.
            It is created if it does not exist.

    Returns:
        The newly created collection.
    """
    os.makedirs(db_folder, exist_ok=True)
    chroma_client = chromadb.PersistentClient(path=db_folder)

    # Depending on the installed Chroma release, list_collections() yields
    # either collection objects (with a .name) or plain name strings.
    existing_names = {getattr(c, "name", c) for c in chroma_client.list_collections()}
    if name in existing_names:
        chroma_client.delete_collection(name)
        print(f"Deleted existing collection: {name}")

    embedding_function = CustomEmbeddingFunction()
    db = chroma_client.create_collection(name=name, embedding_function=embedding_function)

    # Add in batches rather than one document per call, so the embedding
    # model encodes many texts at once. Nothing is added for an empty list.
    batch_size = 100
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        batch_ids = [str(i) for i in range(start, start + len(batch))]
        db.add(documents=batch, ids=batch_ids)

    return db

# Retrieve the most relevant passages based on the query
def get_relevant_passage(query: str, db, n_results: int) -> List[str]:
    """Return up to ``n_results`` passages from ``db`` that best match ``query``.

    Args:
        query: The query text.
        db: A collection-like object whose ``query(query_texts=..., n_results=...)``
            returns a mapping with a ``'documents'`` entry holding one list of
            documents per query text.
        n_results: Maximum number of passages to retrieve.

    Returns:
        The matching passages, best match first. If nothing was found, a
        one-element list containing ``"No relevant passage found."``.
    """
    results = db.query(query_texts=[query], n_results=n_results)

    # 'documents' holds one inner list per query text; only one query is
    # sent, so the hits are the first (and only) inner list.
    per_query_documents = results.get('documents') or []
    passages = per_query_documents[0] if per_query_documents else []

    if not passages:
        return ["No relevant passage found."]
    return list(passages)

# Count tokens in a string
def count_tokens(text: str) -> int:
    """Return the number of tokens ``text`` encodes to under tiktoken's "gpt-4o" encoding."""
    encoding = tiktoken.encoding_for_model("gpt-4o")
    token_ids = encoding.encode(text)
    return len(token_ids)

# Generate a query using OpenAI GPT-4o API specific to a topic
def generate_specific_query(topic: str, context: str):
    """Ask the "gpt-4o" chat model for a query about ``topic`` grounded in ``context``.

    Args:
        topic: The topic the generated query must be about.
        context: Document text given to the model as background. It is cut
            to at most 1500 tokens before being placed in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    max_context_tokens = 1500

    # Truncate on token ids rather than on characters, so the context is
    # guaranteed to fit the budget whatever the text's characters-per-token
    # ratio is.
    encoding = tiktoken.encoding_for_model("gpt-4o")
    context_tokens = encoding.encode(context)
    if len(context_tokens) > max_context_tokens:
        context = encoding.decode(context_tokens[:max_context_tokens])

    prompt = f"""Based on the following context from a document about tobacco, generate a relevant query specifically about "{topic}":

Context: {context}

Query:"""

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that generates relevant queries based on the given context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=50
    )
    return response.choices[0].message['content'].strip()

# Generate an answer using OpenAI GPT-4o API
def generate_answer(query: str, context: str):
    """Ask the "gpt-4o" chat model to answer ``query`` using ``context`` as the reference passage.

    Args:
        query: The question to answer.
        context: The reference passage. It is cut to at most 2000 tokens
            before being placed in the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    max_context_tokens = 2000

    # Truncate on token ids rather than on characters, so the passage is
    # guaranteed to fit the budget whatever the text's characters-per-token
    # ratio is.
    encoding = tiktoken.encoding_for_model("gpt-4o")
    context_tokens = encoding.encode(context)
    if len(context_tokens) > max_context_tokens:
        context = encoding.decode(context_tokens[:max_context_tokens])

    prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and
strike a friendly and conversational tone.

QUESTION: '{query}'
PASSAGE: '{context}'

ANSWER:"""

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a helpful assistant that provides informative answers based on given context."},
                  {"role": "user", "content": prompt}],
        max_tokens=700
    )
    return response.choices[0].message['content'].strip()

# Topics for which query-response pairs are generated.
# Maps each topic name to a short description of the kind of queries that
# belong to that topic.
TOPICS = {
    "Industry strategies/practices": "Queries regarding what the company has been doing and what they plan to do.",
    "Business/market performance": "Queries regarding how the company or product market has performed, as well as how these are projected to perform (according to the company).",
    "Tobacco/nicotine products": "Queries regarding the company's products, including product descriptions, launches of new products, discontinued products, brand or technological innovations.",
    "Science/health effects": "Queries regarding the company's research and scientific activities, as well as findings from these.",
    "Policy/regulation": "Queries regarding policy and regulation of interest to the company, how they feel about said legislation and how they might be trying to influence it."
}

# Generate specific query-response pairs
def generate_query_response_pairs_by_topic(db, topics=TOPICS, queries_per_topic=4) -> List[Tuple[str, str, str]]:
    """Generate ``(topic, query, answer)`` triples for every topic.

    For each topic, a randomly chosen stored document seeds a generated
    query; the best-matching passage for that query is retrieved and used to
    generate the answer. A query is kept only if it has not already been
    produced for the same topic. Each topic gets at most 20 attempts.

    Args:
        db: Collection whose documents are stored under the ids ``"0"`` to
            ``str(db.count() - 1)``.
        topics: Iterable of topic names. When a mapping is given, its keys
            are the topic names.
        queries_per_topic: Number of distinct queries wanted per topic.

    Returns:
        The collected triples, grouped by topic in iteration order. A topic
        may contribute fewer than ``queries_per_topic`` triples if its
        attempts run out.

    Raises:
        ValueError: If pairs are requested but ``db`` holds no documents.
    """
    max_attempts = 20  # Maximum number of attempts per topic
    topic_names = list(topics)

    # The collection is not modified here, so its size is read once.
    doc_count = db.count()
    if doc_count == 0 and topic_names and queries_per_topic > 0:
        raise ValueError("Cannot generate query-response pairs: the collection contains no documents.")

    # Errors that are worth another attempt instead of aborting the whole
    # run and losing every pair collected so far.
    transient_errors = (
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    pairs = []
    for topic_index, topic in enumerate(topic_names):
        topic_pairs = []
        seen_queries = set()
        attempts = 0

        while len(topic_pairs) < queries_per_topic and attempts < max_attempts:
            attempts += 1
            try:
                doc_id = str(random.randint(0, doc_count - 1))
                random_doc = db.get(ids=[doc_id])['documents'][0]
                query = generate_specific_query(topic, random_doc)
                relevant_passage = get_relevant_passage(query, db, n_results=1)[0]
                answer = generate_answer(query, relevant_passage)
            except openai.error.RateLimitError:
                print("Rate limit exceeded. Waiting before retrying...")
                time.sleep(60)  # Wait for 60 seconds before retrying
                continue
            except transient_errors as error:
                print(f"Temporary OpenAI error ({error}). Waiting before retrying...")
                time.sleep(60)
                continue

            # Only add the pair if the query is unique for this topic
            if query not in seen_queries:
                seen_queries.add(query)
                topic_pairs.append((topic, query, answer))
                print(f"Generated pair {len(topic_pairs)}/{queries_per_topic} for topic: {topic}")

            # Pace the API calls, but do not wait when no further call
            # follows for this topic.
            if len(topic_pairs) < queries_per_topic and attempts < max_attempts:
                time.sleep(30)

        pairs.extend(topic_pairs)
        print(f"Completed topic: {topic} with {len(topic_pairs)} pairs")

        # Pause between topics; pointless after the final one.
        if topic_index < len(topic_names) - 1:
            time.sleep(60)

    return pairs

# Main execution
if __name__ == "__main__":
    # Pipeline: PDF -> text chunks -> Chroma collection -> generated
    # query/response pairs -> CSV file.

    # Read the key from the environment instead of hard-coding it, so no
    # secret ends up in the notebook and an already configured key is not
    # overwritten by a placeholder.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("Set the OPENAI_API_KEY environment variable before running this script.")
    openai.api_key = api_key

    pdf_path = "pmi_report_2023.pdf"
    pdf_text = load_pdf(pdf_path)

    chunked_text = split_text(pdf_text, max_tokens_per_chunk=500)

    db_name = "tobacco_report"
    db = create_chroma_db(chunked_text, db_name)

    query_response_pairs = generate_query_response_pairs_by_topic(db)

    # newline='' lets the csv module control line endings itself.
    with open('query_response_pairs.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Topic", "Query", "Response"])
        # Each pair is already a (topic, query, response) row.
        writer.writerows(query_response_pairs)

    print(f"Generated {len(query_response_pairs)} query-response pairs and saved them to query_response_pairs.csv")

Collecting pypdf
  Using cached pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Using cached pypdf-5.0.1-py3-none-any.whl (294 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.0.1
Collecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
Successfully installed openai-0.28.0
Collecting chromadb
  Using cached chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Using cached chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Using cached fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.31.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-3.6.6-py2.py3-none-any.wh

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated pair 1/4 for topic: Industry strategies/practices
Generated pair 2/4 for topic: Industry strategies/practices
Generated pair 3/4 for topic: Industry strategies/practices
Generated pair 4/4 for topic: Industry strategies/practices
Completed topic: Industry strategies/practices with 4 pairs
Generated pair 1/4 for topic: Business/market performance
Generated pair 2/4 for topic: Business/market performance
Generated pair 3/4 for topic: Business/market performance
Generated pair 4/4 for topic: Business/market performance
Completed topic: Business/market performance with 4 pairs
Generated pair 1/4 for topic: Tobacco/nicotine products
Generated pair 2/4 for topic: Tobacco/nicotine products
Generated pair 3/4 for topic: Tobacco/nicotine products
Generated pair 4/4 for topic: Tobacco/nicotine products
Completed topic: Tobacco/nicotine products with 4 pairs
Generated pair 1/4 for topic: Science/health effects
Generated pair 2/4 for topic: Science/health effects
Generated pair 3/4 for t