In [66]:
import fitz  # PyMuPDF
import json 
import os 
import io
import re
#import requests 
import dotenv 
import transformers
import pypdf
import faiss
#import sqlite3

from dotenv import load_dotenv

from openai import OpenAI
from sentence_transformers import util, SentenceTransformer
from transformers import pipeline, BertTokenizer, BertModel

import pandas as pd
import numpy as np
from io import StringIO
from dotenv import load_dotenv
#from operator import itemgetter

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')
nltk.download('punkt')

from llama_index.core.node_parser import SentenceSplitter

# **Setting up Python Environment**

## Instructions for setting up your .env file:

1. Create a .env file in the same directory as this notebook

2. Add the following lines to the .env file:

    OPENAI_API_KEY=<your_openai_api_key>

    HF_TOKEN=<your_huggingface_token>

3. Replace the placeholders with your actual keys.

4. Save the file.

5. Restart the kernel to ensure the keys are loaded correctly.

# Load API Keys into the Notebook Environment:

In [None]:
# Read the KEY=value pairs from the local .env file into the process environment.
load_dotenv()

# Look up both credentials in one pass; os.getenv yields None for a variable that is not set.
# NOTE(review): the name `openai` holds the API key string here, not the openai package.
openai, HF_TOKEN = (os.getenv(env_name) for env_name in ('OPENAI_API_KEY', 'HF_TOKEN'))

# Custom Functions for Chunking the CMU Student Handbook & Measuring Computational Cost
(Optional: you are not required to use these functions for the homework.)

In [68]:
# Splitting Text into Sentences
def split_text_into_sentences_v1(text):
    """Return the sentences of `text` as produced by `nltk.sent_tokenize` with its defaults."""
    return nltk.sent_tokenize(text)

def split_text_into_sentences_v2(text):
    """Return the sentences of `text`, calling `sent_tokenize` with the language named explicitly."""
    # 'english' is spelled out here instead of relying on the tokenizer's default.
    return sent_tokenize(text, language='english')

In [69]:
# define a function to split the resumes into sentences and assign unique identifiers:

def split_resumes_to_sentences(df, text_column):
    """
    Split every text in one DataFrame column into sentences, one output row per sentence.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the texts.
        text_column (str | int): The label of the column containing the texts.

    Returns:
        pd.DataFrame: Columns 'unique_identifier' (the index label of the row in `df`
        that the sentence came from) and 'sentence'. The frame is empty, but still has
        both columns, when no sentences are produced.

    Notes:
        Rows whose text is missing (None / NaN) are skipped instead of raising, and
        non-string values are converted with str() before tokenising.
    """
    sentence_rows = []

    # Walk only the column we need; iterrows() would build a full Series for every row.
    for row_label, raw_text in df[text_column].items():
        # A missing value is not a string and would make sent_tokenize raise.
        if pd.isna(raw_text):
            continue
        sentence_rows.extend(
            (row_label, sentence) for sentence in sent_tokenize(str(raw_text))
        )

    return pd.DataFrame(sentence_rows, columns=['unique_identifier', 'sentence'])

In [70]:

def compute_embedding_costs(text, model_name='all-MiniLM-L6-v2', eps=0.6, min_samples=2):
    """
    Measure the wall-clock time of sentence embedding and of paraphrase-level embedding.

    The sentence stage tokenises `text` into sentences and encodes them. The paraphrase
    stage clusters the sentence embeddings with DBSCAN (cosine metric), joins the
    sentences of each cluster into one "paraphrase" string, and encodes those strings.
    Loading the model is not included in either timing.

    Parameters:
    - text (str): The input text to be processed.
    - model_name (str): The name of the model to use for embedding.
    - eps (float): The epsilon value for DBSCAN clustering.
    - min_samples (int): The minimum sample count for DBSCAN clustering.

    Returns:
    - A tuple (sentence_embedding_time, paraphrase_embedding_time) in seconds.
    """
    # Imported here because the notebook's import cell provides neither name.
    import time
    from sklearn.cluster import DBSCAN

    model = SentenceTransformer(model_name)

    # Sentence Embedding Timing
    start_time = time.perf_counter()
    sentences = sent_tokenize(text)
    sentence_embeddings = model.encode(sentences)
    sentence_embedding_time = time.perf_counter() - start_time

    # Paraphrase Embedding (Clustering) Timing
    start_clustering_time = time.perf_counter()
    paraphrases = []
    if len(sentences) > 0:  # DBSCAN cannot be fitted on an empty sample matrix
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(sentence_embeddings)
        cluster_labels = clustering.labels_
        sentence_array = np.array(sentences)
        for cluster_id in sorted(set(cluster_labels)):
            if cluster_id == -1:  # -1 is DBSCAN's label for noise points (no cluster)
                continue
            paraphrases.append(' '.join(sentence_array[cluster_labels == cluster_id]))

    # Encode the merged cluster texts so this timing covers embedding, not only clustering.
    if paraphrases:
        model.encode(paraphrases)
    paraphrase_embedding_time = time.perf_counter() - start_clustering_time

    return sentence_embedding_time, paraphrase_embedding_time

# Example usage
if __name__ == "__main__":
    # Short multi-sentence sample used only to demonstrate the timing helper.
    text = ("This is a sample text. It has several sentences, meant to showcase "
            "how embeddings are computed. Some of these sentences may be clustered "
            "together, representing paraphrases or semantically similar groups.")

    # compute_embedding_costs returns (sentence_embedding_time, paraphrase_embedding_time).
    sent_time, para_time = compute_embedding_costs(text)
    print(f"Sentence Embedding Time: {sent_time:.4f} seconds")
    print(f"Paraphrase Embedding Time: {para_time:.4f} seconds")

# This function was created with GenerativeAI Assistance. 

In [None]:
def estimate_model_flops(model_name, text):
    """
    Produce a rough operation-count proxy for running `text` through a BERT model.

    This is NOT a true FLOP count. During one forward pass, every leaf layer that has a
    `weight` attribute adds the number of elements of its first input tensor (when that
    tensor has at least two dimensions) to a per-layer counter; the counters are summed.

    Parameters:
    - model_name (str): Model identifier from Hugging Face Transformers.
    - text (str): Text to process.

    Returns:
    - FLOPs (int): The summed per-layer counters, as a plain Python int.
    """
    # Imported here because the notebook's import cell does not import torch.
    import torch

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    inputs = tokenizer(text, return_tensors="pt")

    def hook_fn_forward(module, layer_inputs, output):
        """Forward hook: add the element count of the layer's first input to its counter."""
        first_input = layer_inputs[0] if layer_inputs else None

        if torch.is_tensor(first_input) and first_input.dim() >= 2:
            # numel() equals the product of all dimensions and is already a Python int.
            flops = first_input.numel()
        else:
            # Missing, non-tensor, or fewer-than-2-D inputs contribute nothing.
            flops = 0

        module.__flops__ = getattr(module, '__flops__', 0) + flops

    def add_hooks_to_model(model, hook_fn):
        """
        Recursively register hook_fn on every leaf layer that has a `weight` attribute.

        Returns the list of hook handles so the caller can remove the hooks afterwards.
        """
        handles = []
        for layer in model.children():
            if list(layer.children()):  # container layer: descend into its children
                handles.extend(add_hooks_to_model(layer, hook_fn))
            elif hasattr(layer, 'weight'):
                layer.__flops__ = 0
                handles.append(layer.register_forward_hook(hook_fn))
        return handles

    hook_handles = add_hooks_to_model(model, hook_fn_forward)

    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        # Always detach the hooks, even if the forward pass raised.
        for handle in hook_handles:
            handle.remove()

    total_flops = sum(getattr(mod, '__flops__', 0) for mod in model.modules())

    return int(total_flops)

# Example usage
if __name__ == "__main__":
    # Model identifier and sample sentence handed to estimate_model_flops below.
    model_name = "bert-base-uncased"
    text = "This is an example sentence"
    # The returned value is whatever estimate_model_flops computes for this one sentence.
    flops = estimate_model_flops(model_name, text)
    print(f"Estimated FLOPs: {flops}")

# This function was created with GenerativeAI Assistance. 

# Load Data into the Notebook Environment:

In [7]:
# Open the PDF with PyMuPDF; the path is relative, so the file is looked up from the current working directory.
doc = fitz.open("the-word-2023-24-12.11.23.pdf")
doc  # show the document object as the cell output

# **Homework 2 Assignment**

## **Section A. Experimenting with Vector Store Query Design (50 points)**

### **Choose a method to chunk the text data:**

- [Semantic chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker)

- [Recursive chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

- [Character chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/character_text_splitter)

- [Token chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/split_by_token)

# Sentence Chunking

### Choose a type of chunker:

In [72]:
# this is an example chunker. You don't have to use it. Email Sara with questions.

# sentence-aware splitter used below to chunk the text of each PDF page
# (chunk_size is presumably counted in tokens — confirm in the llama_index docs):
text_parser = SentenceSplitter(
    chunk_size=1024
)

#### **Chunker Choices**

In [None]:
# Chunker choice #1:

In [None]:
# Chunker choice #2:

In [73]:
# example code, feel free to use in homework. 

# Parallel accumulators: text_chunks[i] is a chunk of text and doc_idxs[i] is the
# zero-based number of the page that chunk was taken from.
text_chunks, doc_idxs = [], []

# Chunk the document page by page so every chunk can be traced back to its page.
for doc_idx, page in enumerate(doc):
    cur_text_chunks = text_parser.split_text(page.get_text("text"))
    text_chunks += cur_text_chunks
    doc_idxs += [doc_idx] * len(cur_text_chunks)

In [None]:
text_chunks[:5]  # glance at the first few text chunks (a preview, not the whole list) to observe how the chunks look.

In [18]:
text_chunk_df = pd.DataFrame(text_chunks) # put the chunks into a pandas dataframe; the single unnamed column gets the integer label 0.
text_chunk_df

In [20]:
# Split each text chunk (column 0) into sentences; 'unique_identifier' holds the index of the chunk each sentence came from:
sentences_df = split_resumes_to_sentences(text_chunk_df, 0) 
sentences_df

In [30]:
# get the number of rows in the sentences dataframe (one row per sentence):
len(sentences_df)

### **Choose an embedding model to use for creating embeddings of the text chunks and create the Embeddings**

In [74]:
# load the embedding model used to create embeddings for the sentences (this cell loads a model; it does not define a function):

model = SentenceTransformer('bert-base-nli-mean-tokens') # here we are selecting to use a Bert model on HuggingFace to create the embeddings.

In [23]:
# create sentence embeddings (pass a plain list of strings rather than a pandas Series,
# so the encoder never depends on the Series' index labels):
sentence_embeddings = model.encode(sentences_df['sentence'].tolist())

# check the shape of the sentence embeddings:
sentence_embeddings.shape

## **Create a FAISS Vector Store**

In [103]:
# specify the dimensions of the sentence embeddings (number of columns in the embedding matrix):
dimension = sentence_embeddings.shape[1]
d = dimension  # short alias; `d` is the name the statements below refer to

# specify the number of sentences (rows of sentences_df, not its column names):
nb = len(sentences_df)

# specify the number of queries:
nq = 10000 
np.random.seed(1234)             # set a random number to make the process reproducible
# random float32 matrix with one row per sentence; not used by the other visible cells
xb = np.random.random((nb, d)).astype('float32')

# NOTE(review): nlist is not used by the other visible cells — confirm whether it is still needed.
nlist = 100

In [104]:
# glance at the dimensionality of the sentence embeddings (the vector size for the vector store):
dimension

In [105]:
# create a flat L2 (Euclidean distance) index for the vector store, sized to the embedding dimension:
index = faiss.IndexFlatL2(dimension)

In [106]:
# add the sentence embeddings to the index (they were encoded from sentences_df['sentence'], so they follow its row order):
index.add(sentence_embeddings)

In [107]:
# check the number of vectors currently stored in the index:
index.ntotal

In [108]:
# train the index:
# NOTE(review): the embeddings were already added in an earlier cell, and a flat L2 index
# presumably needs no training step — confirm whether this call is required.
index.train(sentence_embeddings)

index.is_trained  # check if index is now trained

### **Construct Query and Perform Search of the Vector Store**

In [109]:
# define a query to submit to the vector store (replace the placeholder text with a real query before searching):
question = "<INSERT QUERY FROM HOMEWORK ASSIGNMENT INSTRUCTIONS HERE>"

In [110]:
# define the number of documents to retrieve from the vector store in response to the query:
# (the name is spelled `retrival_number`; other cells read it under this exact spelling)
retrival_number=10

# create an embedding for the query (the query is wrapped in a list, so a single row is encoded):
query_embedding = model.encode([question])

In [112]:
%%time 
 # measure the time it takes to search the index
D, I = index.search(query_embedding, retrival_number)  # search the index for the query; retrival_number is how many nearest vectors to return, D receives the distances and I the matching indices
print(I) # print the indices of the documents that are most similar to the query

In [113]:
# Retrieve and print the sentences ('sentence' column) whose row positions were returned in I

first_index = I[0] # Row positions of the nearest neighbours for the first (only) query

# FAISS pads the result with -1 when fewer than k neighbours exist; drop those so
# iloc does not silently pick the last row.
valid_positions = first_index[first_index >= 0]

# Join with a space so neighbouring sentences do not run into each other.
first_row_string = ' '.join(sentences_df['sentence'].iloc[valid_positions])

print(first_row_string) # Print the string data

### **Define System Prompt (e.g. context message) to send to LLM**

In [114]:
# define a function to retrieve the results from the vector store:
def get_sys_message(user_query: str, retrieval_number: int):
    """
    Return the text of the sentences most similar to `user_query`.

    Parameters:
        user_query (str): The query to embed and search for.
        retrieval_number (int): How many nearest sentences to request from the index.

    Returns:
        str: The retrieved sentences joined by single spaces, nearest first
        ('' when nothing is retrieved).

    Relies on the notebook-level `model`, `index` and `sentences_df`.
    """
    query_embedding = model.encode([user_query])
    # Use the caller's retrieval_number; the notebook-level `retrival_number` must not override it.
    _, neighbor_ids = index.search(query_embedding, retrieval_number)
    hit_positions = neighbor_ids[0]  # results for the first (only) query
    # FAISS pads with -1 when fewer than retrieval_number vectors are available.
    hit_positions = hit_positions[hit_positions >= 0]
    return ' '.join(sentences_df['sentence'].iloc[hit_positions])

In [115]:
# use the custom function to retrieve the results from the vector store:
# NOTE(review): this demo query asks about resumes, while the document loaded in this notebook is the handbook PDF — confirm the intended query.
get_sys_message(user_query="Which resume has the most software skills listed?", retrieval_number=100)

In [116]:
# custom function for using an LLM with a RAG retriever:
def rag_openAI_gpt(
    model: str, 
    query: str, 
    retrieval_number: int, 
    llm_prompt: str):
    """
    Answer `llm_prompt` with an OpenAI chat model, using context retrieved for `query`.

    Parameters:
        model (str): OpenAI chat model identifier.
        query (str): Text used only to retrieve context from the vector store.
        retrieval_number (int): Number of sentences to retrieve as context.
        llm_prompt (str): Instruction sent to the chat model as the first user message.

    Returns:
        The content of the first choice in the chat completion response.

    The retrieved context is sent twice: embedded in the system message and again as an
    assistant message, followed by a fixed "What is the answer?" user message.
    """
    # Function-scope import keeps the cell self-contained; the previously unused
    # `import openai` was dropped.
    from openai import OpenAI

    # No arguments are passed, so the client must find its credentials on its own
    # (presumably the OPENAI_API_KEY environment variable — confirm).
    client = OpenAI()

    retrieved_context = get_sys_message(query, retrieval_number)

    messages = [
        {"role": "system", "content": f"Instruction: use the information in {retrieved_context} to answer the user's question."},
        {"role": "user", "content": f"{llm_prompt}"},
        {"role": "assistant", "content": f"{retrieved_context}"},
        {"role": "user", "content": "What is the answer?"}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    return response.choices[0].message.content

In [117]:
# OpenAI chat model identifiers, passed as the `model` argument of rag_openAI_gpt:
gpt_3_5_turbo = "gpt-3.5-turbo"
gpt_4 = "gpt-4"
gpt_4_turbo = "gpt-4-0125-preview"
gpt_4o = "gpt-4o"

## Examples for demonstration only:

In [118]:
# demo: `query` selects the context retrieved from the vector store; `llm_prompt` is the instruction sent to the chat model.
rag_openAI_gpt(model=gpt_3_5_turbo, query="Which resume has the most software skills listed?", retrieval_number=20, llm_prompt="Classify the document and return a label based on the document type or class. Make the label specify which occupation the document pertains to")

In [119]:
# demo: same retrieval query as the other demo calls, sent to the gpt_4 model with a summarisation prompt.
rag_openAI_gpt(model=gpt_4, query="Which resume has the most software skills listed?", retrieval_number=20, llm_prompt="summarize the resume")

In [120]:
# demo: same retrieval query and prompt as the gpt_4 demo call, sent to the gpt_4_turbo model instead.
rag_openAI_gpt(model=gpt_4_turbo, query="Which resume has the most software skills listed?", retrieval_number=20, llm_prompt="summarize the resume")

# Homework requirement:

# Section A

## **Query the vector store using these queries**

**Instruction: set the 'k' parameter to 5**

Query 1: What is the policy statement for the academic integrity policy?

Query 2: What is the policy violation definition for cheating?

Query 3: What is the policy statement for improper or illegal communications?

Query 4: What are CMU’s quiet hours?

Query 5: Where are pets allowed on CMU?


### ***query the vector store with the 5 queries above (don't forget to record the responses in your homework submission spreadsheet: see instructions for a link to the spreadsheet!):***

In [121]:
# query the vector store with the 5 queries above (don't forget to record the responses in your homework submission!):

# **Homework Questions:**

**A.I.** 

(i) Describe these distance metrics: Cosine similarity; Euclidean Distance; Dot Product.

(ii) For each of the metrics you defined in (i), describe how the metric is different from the other metrics.

(iii) For each of the metrics you defined in (i), describe one advantage and one disadvantage of using the metric.

 

**A.II.** Copy and paste the results or information retrieved from the vector store in response to each of the queries you submitted to the vector store in the SPREADSHEET TEMPLATE (please see instructions for a link to the spreadsheet template you should copy and use).  


**A.III.** Qualitatively analyze the responses to the queries you submitted to the vector store. Did the queries retrieve the information you were expecting to obtain? Why or why not? Why do you think the queries were successful / unsuccessful in retrieving the information you expected or needed? 

# **Section B. Experimenting with Vector Store Embeddings & Query Parameters (50 points)**

1) Choose 1 of the 5 queries provided in Section A, above, and experiment with submitting the query to the vector store by changing the QUERY and RETRIEVAL_NUMBER parameters in the following manner:


*   A) Baseline query (e.g. query), retrieval_number parameter=1.

*   B) Query, retrieval_number parameter  = 3

*   C) Query, retrieval_number parameter  = 5

*   D) Query, retrieval_number parameter  = 10

**In your written homework submission, record the UNIQUE responses/results of each query submitted to the vector store.**

2. Select a different text chunking method (e.g. word, sentence, paragraph) and:
   
- Chunk your text data using the method.
- Create embeddings for the text. 
- Load the embeddings into the vector store. 
- Submit the same query you selected in B.1, above, to the vector store once for each of the ‘retrieval_number’ parameter settings defined in B.1, above, and record the responses.

**In your written homework submission, record the responses/results of each query submitted to the vector store.**

### **Homework Questions:**

**B.I.** Explain your rationale for selecting the query you chose in B.1. Why did you choose this query vs. the other 4 queries? 

**B.II.** Copy and paste the responses to the queries you submitted to the vector store in B.1 into the SPREADSHEET TEMPLATE.

**B.III.** Copy and paste the responses to the queries you submitted to the vector store in B.2 into the SPREADSHEET TEMPLATE. 

**B.IV.** In observing the responses from the vector store to the queries created in B.1., which ‘k’ parameter do you think retrieved the highest quality / most accurate result? Why do you think this parameter was the best to use with the query?

**B.V.** In observing the responses from the vector store to the queries created in B.2., which ‘k’ parameter do you think retrieved the highest quality / most accurate result? Why do you think this parameter was the best to use with the query?

# **BONUS TASKS / QUESTIONS: Define function to call LLM API**

## Please email Sara for the Bonus Task Python Notebook once you've completed your homework assignment