# import Python packages

In [1]:
import huggingface_hub, datasets, transformers, os, sys, json, random, re, nltk, time, requests, faiss

import pandas as pd
import numpy as np

from transformers import pipeline
from sentence_transformers import SentenceTransformer

from nltk.tokenize import sent_tokenize

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from dotenv import load_dotenv

nltk.download('punkt_tab')
nltk.download('')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rzrizaldy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Error loading : Package '' not found in index


False

In [3]:
import llm_functions, rag_functions, evaluation_metrics, data_functions
# FYI: It's necessary to have these .PY files in the same directory as this Python Notebook and an __init__.py file needs to be in the same directory to import the functions from these Python files. 
# Email Sara Kingsley or the Teaching Assistants with any issues importing these.

from llm_functions import LLM_requesters
from rag_functions import ragLLM
from  evaluation_metrics import RagMetrics
from data_functions import DataFunctions

llm = LLM_requesters()
ragllm = ragLLM()
metrics = RagMetrics()
datafunc = DataFunctions()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rzrizaldy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# load in the required API keys from your .env file:
load_dotenv()

openai = os.getenv('OPENAI_API_KEY')       # OpenAI API Key. 
hf_token = os.getenv('HF_TOKEN')    # HuggingFaceHub Authentication Token. 
HF_INF_TOKEN = os.getenv("HF_INF_TOKEN")   # HuggingFace Inference Endpoint Token. 

gemma_endpoint = os.getenv("gemma_endpoint_url")

In [5]:
model_01 = "gpt-3.5-turbo"
model_02 = "gpt-4o"
model_03="meta-llama/Llama-3.2-1B"

In [None]:
input_message = {
	"inputs": "Can you please let us know more details about your ",
	"parameters": {
		"max_new_tokens": 100}
}

output = llm.query_llm_model(input_message, HF_INF_TOKEN, model_endpoint=gemma_endpoint) 

print(output)

ValueError: The HuggingFace Inference endpoint URL (gemma_endpoint_url) was not found in your .env file.

# Coding Tasks Required to Answer Bonus Questions

In [13]:
# import the Resume dataset into the Python environment:

df_path = "Resume.csv"   # Note you will need to change this path to the location of the Resume.csv file on your local machine

df=pd.read_csv(df_path)

In [14]:
# change the column name to 'text'; this will format the dataset in the fashion required by some of the functions in this notebook
df.rename(columns = {'Resume_str':'text'}, inplace = True)

In [15]:
''' Create a sample dataset from the Resume dataset: 
Since the Resume dataset contains many rows, we will create a sample dataset with a smaller number of rows to work with.
'''

# Function to select 10 random rows from each group, where group is defined by the column named 'Category', and the row selection is from the column named 'text
def select_random_rows(group, n=10):
    return group.sample(n=min(n, len(group)))

# Applying the function to each group in the column named 'Category':
sample_resume_df = df.groupby('Category').apply(lambda x: select_random_rows(x['text'], 10)).reset_index(drop=False)

sample_resume_df.tail(100)

  sample_resume_df = df.groupby('Category').apply(lambda x: select_random_rows(x['text'], 10)).reset_index(drop=False)


Unnamed: 0,Category,level_1,text
140,DIGITAL-MEDIA,1257,"SENIOR DIRECTOR, PRODUCT MANAGEMENT ..."
141,DIGITAL-MEDIA,1254,DIRECTOR OF NEW BUSINESS DEVELOPMENT ...
142,DIGITAL-MEDIA,1252,SENIOR MARKETING MANAGER Execut...
143,DIGITAL-MEDIA,1312,DIGITAL MARKETING MANAGER Summa...
144,DIGITAL-MEDIA,1311,DIGITAL MARKETING DIRECTOR ...
...,...,...,...
235,TEACHER,419,Marilyn Hunter Summary Focus...
236,TEACHER,410,LEAD TEACHER Summary Solid ...
237,TEACHER,416,Kimberly Fisheli Summary Ded...
238,TEACHER,394,SUBSTITUTE TEACHER Skills ...


In [16]:
# # Split resumes into sentences
sentences_df = datafunc.split_resumes_to_sentences(sample_resume_df, 'text')

In [17]:
# create a list to store the chunks of text that we will create next:
resume_sentences = []

# split the text in each row of the 'text' column into sentences and store the sentences in the list:
for row in sample_resume_df['text']:
    sentences = datafunc.split_text_into_sentences(row)
    resume_sentences.extend(sentences)

In [18]:
# define the model for creating embeddings:
model = SentenceTransformer('bert-base-nli-mean-tokens')

# create embeddings for the sentences:
sentence_embeddings = model.encode(sentences_df['sentence'])
sentence_embeddings.shape

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(9151, 768)

In [19]:
len(set(sentences))

17

### Create a FAISS Index, Add the sentence embeddings to the index and create a Vector store:

In [20]:

d = sentence_embeddings.shape[1]

nb = len(set(sentences))

nq = 10000 
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
nlist = 100

import faiss


index = faiss.IndexFlatL2(d)
index

index.add(sentence_embeddings)

index.ntotal

index.train(sentence_embeddings)

index.is_trained  # check if index is now trained


True

## Define the models we will use in the experiments:

In [21]:
model_01 = "gpt-3.5-turbo"
model_02 = "gpt-4o"
model_03="meta-llama/Llama-3.2-1B"

In [22]:
sentence_embeddings

array([[-0.8429529 ,  0.64041024,  0.6867857 , ...,  0.4879449 ,
         0.10311098,  0.3043144 ],
       [-0.53987557,  0.7626106 ,  1.1137615 , ..., -0.32825473,
         0.23717196,  0.6216686 ],
       [ 0.12165318, -0.31398743,  1.1303501 , ..., -0.12323594,
        -0.6754674 ,  0.07177585],
       ...,
       [ 0.20645209,  0.708416  ,  0.600371  , ...,  0.3412706 ,
        -0.46779758, -0.32732195],
       [ 0.3091744 ,  0.32063502,  1.7302022 , ..., -0.97805065,
        -1.4428306 , -0.15120117],
       [-0.453167  ,  1.0767179 ,  0.82169944, ..., -0.402537  ,
        -0.20317224, -0.11159677]], dtype=float32)

## Define the Queries for the Experiments:

In [23]:
d = sentence_embeddings.shape[1]
nb = len(set(sentences))
nq = 10000 
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
nlist = 100
index = faiss.IndexFlatL2(d)
index
index.add(sentence_embeddings)
index.train(sentence_embeddings)
print(index.is_trained)

def get_sys_message(user_prompt, k):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    k=k
    xq = model.encode([user_prompt])
    D, I = index.search(xq, k)  # search
    first_index = I[0]  # Get the first index from I
    rag_search_results = sentences_df['sentence'].iloc[first_index].sum()
    return rag_search_results

True


# Create 3 system prompts to use in a call to different LLMs by querying the vector store with the queries given to you in the Bonus Task Instructions:
(the questions below are examples; don't use these to complete the bonus tasks)

In [29]:
sys_messageA = get_sys_message(user_prompt="Which resume has the most software skills listed?", k=5)
sys_messageA

'Expert computing and technology skills including competence in multiple software applications.Good understanding of Software Development Life Cycles and its different phases.Performed complexity installation and maintenance of software.Incluya conocimientos de paquetes de software y sistemas informáticos pertinentes e indique su nivel de conocimiento básico, intermedio, experto).Software management for Dept.'

In [30]:
sys_messageB = get_sys_message(user_prompt="Which resume has the most accounting skills listed?", k=5)
sys_messageB

'Leadership and business accounting skills.Established process for professional services project accounts receivable, collections, and accounts payable bookkeeping systems which created a monthly cash flow that was not previously available.Developed expertise navigating financial management pipelines like accounting, reporting, internal controls, budgeting, analysis and performance management.Bookkeeping of small and midsized companies Worked on bank reconciliation, cash management and financial statements analysis.Financial Analyst  ,     07/2013   to   07/2014     Company Name          Selected to examine accounting records to compile financial information and reconcile reports.'

In [31]:
sys_messageC = get_sys_message(user_prompt="Which resume has the most biology skills listed?", k=5)
sys_messageC

'Designed and built a novel spotting-device for producing protein microarrays compatible with a 96-well microplate format for high throughput applications.Experience      Company Name    City  ,   State    GRADUATE RESEARCH ASSISTANT   01/2013   to   04/2016       Assist with various research project in the Center of Biological control lab.Perform specialized tests such as aflatoxin and protein analysis; calibrates equipment, grind samples, read results and enters reading on official certificates.Total genomic DNA isolation & PCR & Cloning of polyene CYP gene \n     (cytochrome P-450 hydroxlase).Programmed Labview spotting routine for the protein microarray spotting-device.'

# Request responses from 4 different LLMs, following the Bonus Task Instructions:
(the questions below are examples; don't use these to complete the bonus tasks)

In [25]:

textA=ragllm.rag_llm_openai(model=model_01, user_prompt="Which resume has the most software skills listed?", k=10, task="summarize", system_message=sys_message)

In [26]:
textA

'The candidate possesses expert computing and technology skills, competence in multiple software applications, a good understanding of Software Development Life Cycles, and has performed complex installations and maintenance of software. They have knowledge of relevant software packages and computer systems at an intermediate to expert level and have experience in software management for departmental tasks.'

In [27]:

textB= ragllm.rag_llm_openai(model=model_02, user_prompt="Which resume has the most software skills listed?", k=10, task="summarize", system_message=sys_message)

In [28]:
textB

'The resume highlights strong computing and technological expertise, including proficiency in multiple software applications. It demonstrates a good understanding of Software Development Life Cycles and its various phases. The candidate has experience in the complex installation and maintenance of software and possesses knowledge of relevant software packages and computer systems, specifying their proficiency levels as basic, intermediate, or expert. Additionally, they manage software for their department to accomplish tasks.'

In [35]:

textC=ragllm.rag_llm(model=model_03, user_prompt="Which resume has the most software skills listed?", k=10, task="summarize", system_message=sys_message)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [32]:
textD = "<insert call to LLM of your chocie>"

## Evaluate the Information Retrieved and LLM responses by comparing the similarity of the responses across models

In [33]:
def compare_text_similarity(text_a, text_b, text_c):
    """
    Compares the similarity between three texts using TF-IDF vectors and cosine similarity.
    
    Parameters:
    - text_a (str): Text A
    - text_b (str): Text B
    - text_c (str): Text C
    
    Returns:
    - A dictionary with similarity scores between Text A & Text B, A & C, and B & C.
    """
    # Initialize the vectorizer and transform texts into TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text_a, text_b, text_c])
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Similarity between Text A and B, A and C, and then B and C
    similarity_scores = {
        "A_B": similarity_matrix[0, 1],
        "A_C": similarity_matrix[0, 2],
        "B_C": similarity_matrix[1, 2]
    }

    return similarity_scores

def compare_text_similarity_response2context(om, model_response):
    """
    Compares the similarity between three texts using TF-IDF vectors and cosine similarity.
    
    Parameters:
    - text_a (str): Text A
    - text_b (str): Text B
    - text_c (str): Text C
    
    Returns:
    - A dictionary with similarity scores between Text A & Text B, A & C, and B & C.
    """
    # Initialize the vectorizer and transform texts into TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([om, model_response])
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Similarity between Text A and B, A and C, and then B and C
    similarity_scores = {
        "A_B": similarity_matrix[0, 1],
        #"A_C": similarity_matrix[0, 2],
        #"B_C": similarity_matrix[1, 2]
    }

    return similarity_scores

## Compare the Similarity of LLM Response Messages A, B, and C:

In [40]:
similarities = compare_text_similarity(text_a=textA, text_b=textB, text_c=textC)

print("LLM Response Similarity Scores:")
for pair, score in similarities.items():
    print(f"{pair}: {score:.4f}")

LLM Response Similarity Scores:
A_B: 0.7036
A_C: 0.0000
B_C: 0.0000


## Compare the Similarity of System Messages A, B, and C:

In [42]:
similaritiesOM =compare_text_similarity(text_a=sys_messageA, text_b=sys_messageB, text_c=sys_messageC)

print("System Prompt Similarity -to-  System Prompt Similarity Scores:")
for pair, score in similaritiesOM.items():
    print(f"{pair}: {score:.4f}")

System Prompt Similarity -to-  System Prompt Similarity Scores:
A_B: 0.1049
A_C: 0.0769
B_C: 0.1255


## Compare the Similarity of System Messages A, B, and C to LLM Response Messages A, B and C:

In [44]:
similaritiesOM_LLM_A =compare_text_similarity_response2context(om=sys_messageA, model_response=textA)
similaritiesOM_LLM_B =compare_text_similarity_response2context(om=sys_messageB, model_response=textB)
similaritiesOM_LLM_C =compare_text_similarity_response2context(om=sys_messageC, model_response=textC)

print("System Prompt to LLM Response \n Similarity Scores \n Model 1:")
for pair, score in similaritiesOM_LLM_A.items():
    print(f"{pair}: {score:.4f}")
print("\n")
print("System Prompt to LLM Response\n Similarity Scores \n Model 2:")
for pair, score in similaritiesOM_LLM_B.items():
    print(f"{pair}: {score:.4f}")
print("\n")
print("System Prompt to LLM Response\n Similarity Scores \n Model 3:")
for pair, score in similaritiesOM_LLM_C.items():
    print(f"{pair}: {score:.4f}")

System Prompt to LLM Response 
 Similarity Scores 
 Model 1:
A_B: 0.5392


System Prompt to LLM Response
 Similarity Scores 
 Model 2:
A_B: 0.1686


System Prompt to LLM Response
 Similarity Scores 
 Model 3:
A_B: 0.0000
