# **Setting up Python Environment**

In [None]:
# %pip install -r requirements.txt
# %pip install ipykernel langchain_experimental llama-index-vector-stores-pinecone ipykernel PyMuPDF pinecone-client pypdf faiss-cpu langchain_community transformers sentence_transformers

In [None]:
import io
import json
import math
import os
import re
import sqlite3
import time

import datasets
import dotenv
import faiss
import fitz
import huggingface_hub
import langchain
import langchain_community
import nltk
import numpy as np
import openai
import pandas as pd
import pinecone
import pypdf
import requests
import torch
import transformers

In [None]:
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from llama_index.core import (SimpleDirectoryReader, StorageContext,
                              VectorStoreIndex)
from llama_index.core.extractors import (QuestionsAnsweredExtractor,
                                         TitleExtractor)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# from dotenv import load_dotenv
# import os
# import pinecone

# load_dotenv()

# # OpenAI API Key:
# openai = os.getenv('OPENAI_API_KEY')

# # Pinecone API Key:
# pinecone_api_key =os.getenv('PINECONE_API_KEY')
# environment =os.getenv('PINECONE_ENV')

# # Hugging Face Token:
# HF_TOKEN = os.getenv('HF_TOKEN')

# # configure Pinecone client
# pc = Pinecone(api_key=pinecone_api_key)


In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Read every credential from the environment so no secret lives in the notebook.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
hf_token = os.getenv("HF_TOKEN")

# Report what is actually present instead of announcing success unconditionally.
# Only variable NAMES are printed, never their values.
missing_keys = [
    name
    for name, value in {
        "OPENAI_API_KEY": openai_api_key,
        "PINECONE_API_KEY": pinecone_api_key,
        "PINECONE_ENV": pinecone_env,
        "HF_TOKEN": hf_token,
    }.items()
    if not value
]
if missing_keys:
    print(f"Missing environment variables: {', '.join(missing_keys)}")
else:
    print("API keys loaded securely.")


# **Project 2 Implementation**

## **Section A. Experimenting with Vector Store Query Design (50 points)**

In [None]:
#!pip install llama_index.embeddings.huggingface

from openai import OpenAI

from sentence_transformers import SentenceTransformer

import json, os, io, re, requests, fitz, dotenv, transformers, pinecone, pypdf, faiss, sqlite3, langchain_community, langchain, openai, math, time, nltk, torch, huggingface_hub, datasets
import requests
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.schema import TextNode
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI

# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
from pinecone import Pinecone, ServerlessSpec

from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Read credentials from the environment; no secret is written in the notebook.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
hf_token = os.getenv("HF_TOKEN")

# Fail early with a clear message rather than deep inside the Pinecone client.
if not pinecone_api_key:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file.")

# Configure the Pinecone client with the key taken from the environment.
# NOTE: the key that used to be pasted into this cell is exposed and must be
# revoked in the Pinecone console.
pc = Pinecone(api_key=pinecone_api_key)

print("API keys loaded securely.")

In [None]:
# Open the source PDF with PyMuPDF (imported as `fitz`). The path has no
# directory component, so the file must sit in the current working directory.
doc = fitz.open("the-word-2023-24-12.11.23.pdf")

### **Choose a method to chunk the text data:**

- [Semantic chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker)

- [Recursive chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

- [Character chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/character_text_splitter)

- [Token chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/split_by_token)

##### Choose a type of chunker (From langchain):

In [None]:
# Download the NLTK 'punkt' resource.
nltk.download('punkt')

In [None]:
# Download the NLTK 'punkt_tab' resource.
# `nltk` is re-imported here although the first import cell already imports it.
import nltk
nltk.download('punkt_tab')

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
# Import HuggingFaceEmbeddings from langchain_community: the old
# `langchain.embeddings` path is a deprecated re-export.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model handed to the semantic chunker.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Chunker used below to split each PDF page's text.
semantic_chunker = SemanticChunker(embedding_model)

In [None]:
# Chunk the PDF page by page. `doc_idxs[i]` records the page index that
# `text_chunks[i]` was taken from.
text_chunks, doc_idxs = [], []

for page_number, pdf_page in enumerate(doc):
    page_chunks = semantic_chunker.split_text(pdf_page.get_text("text"))
    text_chunks += page_chunks
    doc_idxs += [page_number] * len(page_chunks)

In [None]:
# Report how many chunks the chunker produced across all pages.
print(f'{len(text_chunks)} chunks parsed')

In [None]:
# Wrap every chunk in a TextNode that carries only the text (no metadata).
# The per-chunk page lookup that used to live in this loop was dropped: its
# result was never read.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

In [None]:
# Print the chunk at index 6 as a spot check; this raises IndexError when
# fewer than 7 chunks were produced.
print(text_chunks[6])

#### **Chunker Choices**

In [None]:
# Chunker choice #1:

In [None]:
# Chunker choice #2:

### **Create text nodes from chunks**

In [None]:
# Build one TextNode per chunk and copy the chunk text into the node metadata
# under the 'text' key (the retrieval cell later reads `metadata['text']`).
# The unused per-chunk page lookup from the original loop has been removed.
nodes = [
    TextNode(text=text_chunk, metadata={'text': text_chunk})  # type: ignore
    for text_chunk in text_chunks
]

print(f'{len(nodes)} nodes created')

In [None]:
# Display the text of the first node.
nodes[0].text

In [None]:
# Display the metadata dict of the first node.
nodes[0].metadata

In [None]:
llm = OpenAI(model="gpt-3.5-turbo",  # This is llama_index.llms.openai.OpenAI, not openai.AI
             api_key="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")


extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

### **Create the vector store using chosen similarity metrics:**

In [None]:
import pinecone
from pinecone import Pinecone

load_dotenv()

# OpenAI API key. Stored as `openai_api_key` so the imported `openai` module
# is not overwritten with a string.
openai_api_key = os.getenv('OPENAI_API_KEY')

# Pinecone API key and environment:
pinecone_api_key = os.getenv('PINECONE_API_KEY')
environment = os.getenv('PINECONE_ENV')

# Hugging Face token:
HF_TOKEN = os.getenv('HF_TOKEN')

# Fail early with a clear message rather than deep inside the Pinecone client.
if not pinecone_api_key:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file.")

# Configure the Pinecone client with the key taken from the environment.
# NOTE: the key that used to be pasted into this cell is exposed and must be
# revoked in the Pinecone console.
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# True only when the USE_SERVERLESS environment variable equals "true"
# (case-insensitive); defaults to False when the variable is unset.
# NOTE(review): no active code in the visible cells reads this flag; the spec
# in the next cell is hard-coded to serverless.
use_serverless = os.environ.get("USE_SERVERLESS", "False").lower() == "true"

In [None]:
# Serverless index specification: AWS, region us-east-1.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Alternative kept for reference: choose the spec from `use_serverless`.
# if use_serverless:
#     spec = pinecone.ServerlessSpec(cloud='aws', region="us-east-1")
# else:
#     spec = pinecone.PodSpec(environment=environment)

In [None]:
# Pinecone index used by the rest of the notebook.
index_name = "hw02"

# Start from a clean slate: drop any existing index that carries this name.
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
    pc.delete_index(index_name)

### **choose a similarity metric to use for the vector store:**

In [None]:

# Dimension of the index. It must equal the length of the embedding vectors
# stored in it. NOTE(review): 1536 is assumed to match the OpenAI embedding
# model used below; a later cell prints the actual node embedding length.
dimensions = 1536

# Upper bound on how long to wait for the new index to become ready.
READY_TIMEOUT_SECONDS = 300

# Similarity metric for the index. The options discussed in this notebook are
# "cosine", "dotproduct" and "euclidean"; the metric decides how stored
# embeddings are compared with the query embedding.
pc.create_index(
    name=index_name,
    dimension=dimensions,
    metric="cosine",
    spec=spec,
)

# Wait for the index to be ready before connecting, but never spin forever.
ready_deadline = time.monotonic() + READY_TIMEOUT_SECONDS
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after "
            f"{READY_TIMEOUT_SECONDS} seconds"
        )
    time.sleep(1)

for index in pc.list_indexes():
    print(index['name'])

# Use `index_name` rather than a second hard-coded "hw02", and print the
# description: a bare expression in the middle of a cell is not displayed.
print(pc.describe_index(index_name))

# Handle to the index, and the LlamaIndex vector store that wraps it.
pc_index = pc.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pc_index)

In [None]:
# Connect to the index and wrap it in a LlamaIndex vector store.
# NOTE(review): the preceding cell already performs these same two assignments.
pc_index = pc.Index(index_name)  # create an index to use in the vector store
vector_store = PineconeVectorStore(pinecone_index=pc_index)    # this function creates a vector store where we will add and store embeddings

In [None]:

# Display the index statistics (this cell runs before any vectors are added).
pc_index.describe_index_stats()


In [None]:
# # llm = OpenAI(model="gpt-3.5-turbo")

# # extractors = [
# #     TitleExtractor(nodes=5, llm=llm),
# #     QuestionsAnsweredExtractor(questions=3, llm=llm),
# # ]

# # pipeline = IngestionPipeline(
# #     transformations=extractors,
# # )
# # nodes = await pipeline.arun(nodes=nodes, in_place=False)

# llm = OpenAI(model="gpt-3.5-turbo")

# extractors = [
#     TitleExtractor(nodes=5, llm=llm),
#     QuestionsAnsweredExtractor(questions=3, llm=llm),
# ]

# pipeline = IngestionPipeline(
#     transformations=extractors,
# )
# nodes = await pipeline.arun(nodes=nodes, in_place=False)

### **Choose an embedding model to use for the vector store:**

#### **OpenAI Embeddings**

In [None]:
# Name of an OpenAI embedding model.
# NOTE(review): `model_ada` is not referenced by any other visible cell.
model_ada="text-embedding-ada-002"
# small_txt_embedmodel_="text-embedding-3-small"


In [None]:
# Embedding model for the nodes. The API key is read from the environment
# instead of being written into the notebook.
# NOTE: the key that used to be pasted here is exposed and must be revoked.
embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Embed each node's content together with its metadata and store the vector
# on the node itself.
for node in nodes:
    node.embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )

In [None]:
# Print the length of the first node's embedding vector.
print(f'Node embedding dimension is {len(nodes[0].embedding)}')  # type: ignore

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding

# Embed the nodes with the SAME model the retrieval cell uses for the queries
# ("text-embedding-3-small"). This cell previously switched to
# "text-embedding-ada-002", which put the stored vectors and the query vectors
# in different embedding spaces and made the similarity scores meaningless.
# The unused `ServiceContext` import was dropped as well.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Apply embeddings to nodes (node text only, without metadata).
for node in nodes:
    node.embedding = embed_model.get_text_embedding(node.get_text())

# Now, add nodes with embeddings to Pinecone
vector_store.add(nodes)


### **load the embeddings into the vector store (e.g. create a vector store):**

In [None]:
# Add the embedded nodes to the Pinecone-backed vector store.
# NOTE(review): the preceding code cell already ends with the same call.
vector_store.add(nodes)

In [None]:

# Display the index statistics again, now that the nodes have been added.
pc_index.describe_index_stats()


In [None]:
# Print the metadata of the first node.
print(nodes[0].metadata)

In [None]:
# Print the string form of the first node.
print(nodes[0])

### **Retrieve Content from the Vector Store**

In [None]:
# NOTE: this import rebinds the notebook-level name `OpenAI` to the openai SDK
# client class; earlier cells bound it to llama_index.llms.openai.OpenAI.
from openai import OpenAI

# Build the client with the key taken from the environment instead of writing
# it into the notebook.
# NOTE: the key that used to be pasted here is exposed and must be revoked.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [None]:
# # define the query:
# query = (
#     "Where are pets allowed on CMU?"
# )

# # choose one of these models:
# embed_model_ada = "text-embedding-ada-002"
# embed_model_3_small = "text-embedding-3-small"

# res = client.embeddings.create(
#     input=[query],
#     model= embed_model_ada 
# )

# # retrieve from Pinecone
# xq = res.data[0].embedding #res['data'][0]['embedding']

# # get relevant contexts (including the questions)
# res2 = pc_index.query(vector=xq, top_k=2, include_metadata=True)

# Number of matches requested from Pinecone for every query.
k = 5

# The five assignment queries.
queries = [
    "What is the policy statement for the academic integrity policy?",
    "What is the policy violation definition for cheating?",
    "What is the policy statement for improper or illegal communications?",
    "What are CMU’s quiet hours?",
    "Where are pets allowed on CMU?"
]

# Embedding model names available for the query side; the loop below uses
# `embed_model_3_small`.
embed_model_ada = "text-embedding-ada-002"
embed_model_3_small = "text-embedding-3-small"

# One Pinecone response per query, in the same order as `queries`.
responses = []

for query in tqdm(queries):
    # Embed the query text.
    embedding_response = client.embeddings.create(input=[query], model=embed_model_3_small)
    query_vector = embedding_response.data[0].embedding

    # Fetch the top-k matches together with their metadata.
    res2 = pc_index.query(vector=query_vector, top_k=k, include_metadata=True)
    responses.append(res2)
    

# # Choose an embedding model
# embed_model = "text-embedding-ada-002"  # or "text-embedding-3-small"

# # Generate embeddings for each query
# res = client.embeddings.create(
#     input=queries,
#     model=embed_model
# )

# # Retrieve relevant contexts from Pinecone
# query_embeddings = [item.embedding for item in res.data]  # Extract embeddings

# # Query Pinecone index for each query
# top_k = 5  # Number of relevant contexts to retrieve
# results = [
#     pc_index.query(vector=embedding, top_k=top_k, include_metadata=True)
#     for embedding in query_embeddings
# ]

# # Print or process results
# for i, query in enumerate(queries):
#     print(f"Query: {query}")
#     print(f"Top {top_k} results: {results[i]}\n")


In [None]:
# Flatten the Pinecone responses into one record per (query, match) pair.
# 'q' is the 1-based query number and 'k' the 1-based rank of the match.
# The loop variables are deliberately NOT named `k`: the original inner loop
# overwrote the notebook-level `k` (the top_k setting) with a match index.
response_results = []

for query_number, response in enumerate(responses, start=1):
    for rank, match in enumerate(response.matches, start=1):
        response_results.append({
            'q': query_number,
            'k': rank,
            'score': match.score,
            'text': match.metadata.get('text', ''),
            'document_title': match.metadata.get('document_title', ''),
        })

In [None]:
# Write the collected matches to 'parta_responses.csv' in the working directory.
# `index=False` is not passed, so the DataFrame index is written as the first column.
pd.DataFrame(response_results).to_csv('parta_responses.csv')

In [None]:
# Display the raw Pinecone response for the LAST query only: `res2` is
# reassigned on every iteration of the query loop. The full set of responses
# is in `responses`.
res2

#### **Query the vector store using these queries**

**Instruction: set the 'k' parameter to 5**

Query 1: What is the policy statement for the academic integrity policy?

Query 2: What is the policy violation definition for cheating?

Query 3: What is the policy statement for improper or illegal communications?

Query 4: What are CMU’s quiet hours?

Query 5: Where are pets allowed on CMU?

### ***query the vector store with the 5 queries above (don't forget to record the responses in your homework submission spreadsheet: see instructions for a link to the spreadsheet!):***

In [None]:
# query the vector store with the 5 queries above (don't forget to record the responses in your homework submission!):

### **Project Questions:**

**A.II.** Explain your rationale for choosing the similarity metric you decided to use in the vector store. What is one pro of using the metric, and what is one difference between using the metric you selected and the other two similarity metrics we discussed in the lab? (We discussed cosine, dot product, and euclidean similarity metrics.)

**A.III.** Copy and paste the results or information retrieved from the vector store in response to each of the queries you submitted to the vector store in the SPREADSHEET TEMPLATE (please see instructions for a link to the spreadsheet template you should copy and use).

**A.IV.** Qualitatively analyze the responses to your queries submitted to the vector store. Did the queries retrieve the information you were expecting to obtain? Why or why not? Why do you think the queries were successful / unsuccessful in retrieving the information you expected or needed?

## **Section B. Experimenting with Vector Store Embeddings & Query Parameters (50 points)**

1) Choose 1 of the 5 queries provided in A.1.6.A, above, and experiment with submitting the query to the vector store by changing the search parameters in the following manner:


*   A) Baseline query, e.g. query, k=1.

*   B) Query, parameter k = 3

*   C) Query, parameter k = 5

*   D) Query, parameter k = 10

**In your written homework submission, record the UNIQUE responses/results of each query submitted to the vector store.**

2. Return to step A.1.B., above, and select a different text chunking method (e.g. word, sentence, paragraph). 
- Chunk your text data using the method. Create embeddings for the text. 
- Load the embeddings into the vector store. 
- Submit the same query you selected in B.1, above, and submit it to the vector store 6 times (using the different ‘k’ parameter settings defined in B.1, above), and record the responses.

**In your written homework submission, record the responses/results of each query submitted to the vector store.**

### **Project Questions:**

**B.I.** Explain your rationale for selecting the query you choose in B.1. Why did you choose this query vs. the other 4 queries?

**B.II.** Copy and paste the responses to the queries you submitted to the vector store in the SPREADSHEET TEMPLATE.

**B.III.** Copy and paste the responses to the queries you submitted to the vector store in the SPREADSHEET TEMPLATE.

**B.IV.** In observing the responses from the vector store to the queries created in B.1., which ‘k’ parameter do you think retrieved the highest quality / most accurate result? Why do you think this parameter was the best to use with the query?

**B.V.** In observing the responses from the vector store to the queries created in B.2., which ‘k’ parameter do you think retrieved the highest quality / most accurate result? Why do you think this parameter was the best to use with the query?

# **BONUS TASKS / QUESTIONS: Define function to call LLM API**

## Please email Sara for the Bonus Task Python Notebook once you've completed your homework assignment