In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 17 15:11:03 2025

@author: randy

this file performs inference on a semantic search model training with custom data

you much unzip the model from the tar file

"""

# %pip install accelerate==1.3.0 #0.26.0
# %pip install sentence-transformers==3.4.1
# %pip install datasets==3.3.1


import datetime as dt
import pandas as pd
import re
import numpy as np
import json
import random
import ast
import copy
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle

#for vector embeddings
from sentence_transformers import SentenceTransformer, losses, InputExample, util

data_filename = "/home/randy/supportiv/mle_screening_dataset.csv"
embedding_model_filename = '/home/randy/supportiv/custome_model'
nn_model_filename = "/home/randy/supportiv/trained_nn_model.pkl"

In [None]:

#############################################################################################################
# Step one: Build the search model <-------------------- you only need to do this one
#   1. load the data
#   2. load the model from HuggingFace and save locallly
#   3. encode all the answers
#   4. compute the nearest neighbor search model using the encoded answers
#   5. save the nearest neigbor model (as pickle file)

# read the raw data
train_data = pd.read_csv(data_filename)
train_data.columns
docs = train_data['answer'].astype(str).tolist()

#Load the pre-trained (not fine-tuned) embedding model
model = SentenceTransformer(embedding_model_filename)

# encode the answeres with the embedding model
doc_emb = model.encode(docs)

# compute a nearest neighbor model with answer embeddings
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(doc_emb)

# same model to storage
pickle.dump(nbrs, open(nn_model_filename, 'wb'))

In [2]:

#############################################################################################################
# Step two: inference
#   1. load nearest neigbor model
#   2. load the HuggingFace model from local strage
#   3. load the orgnial answeres
#   4. ask a question
#   5. show the answers that are closet to the question

# load the nearest meighbor model
loaded_nn_model = pickle.load(open(nn_model_filename, 'rb'))

# load the embedding model
loaded_embedding_mode = SentenceTransformer(embedding_model_filename)

# load the orginal data set
train_data = pd.read_csv(data_filename)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:

def ask_question(the_question, num_results = 5):
    """Aks the semantic search a question and see the results"""
    new_embedding = loaded_embedding_mode.encode(the_question)
    distances, indices = loaded_nn_model.kneighbors([new_embedding], num_results)
    return_len = len(indices[0])

    return_results = []
    for i in range(return_len): #an_index in indices[0]:
        this_answer = train_data.loc[indices[0][i]]['answer']
        this_answers_distance = distances[0][i]
        return_results.append([i+1, the_question, this_answers_distance, this_answer])
        #print("******************************************************************************")
        #print(i+1, this_answers_distance, this_answer)
    
    return return_results


In [4]:
answers = ask_question("can someone be allergic to water?")

print(answers[0][1])

for i in range(len(answers)): #an_index in indices[0]:
    answer_rank = answers[i][0]
    this_answer = answers[i][3]
    answer_distance = answers[i][2]
    
    print("******************************************************************************")
    print(answer_rank, ":", answer_distance,":", this_answer)

can someone be allergic to water?
******************************************************************************
1 : 0.918213472942668 : Summary : We all need to drink water. How much you need depends on your size, activity level, and the weather where you live.    The water you drink is a combination of surface water and groundwater. Surface water includes rivers, lakes and reservoirs. Groundwater comes from underground. The United States has one of the safest water supplies in the world, but drinking water quality can vary from place to place. It depends on the condition of the source water and the treatment it receives. Treatment may include adding fluoride to prevent cavities and chlorine to kill germs.     Your water supplier must give you annual reports on drinking water. The reports include where your water came from and what contaminants are in it.    Centers for Disease Control and Prevention
******************************************************************************
2 : 0.

In [5]:
answers = ask_question("tell me about glaucoma research?")

print(answers[0][1])

for i in range(len(answers)): #an_index in indices[0]:
    answer_rank = answers[i][0]
    this_answer = answers[i][3]
    answer_distance = answers[i][2]
    
    print("******************************************************************************")
    print(answer_rank, ":", answer_distance,":", this_answer)

tell me about glaucoma research?
******************************************************************************
1 : 0.5150078020563468 : Through studies in the laboratory and with patients, the National Eye Institute is seeking better ways to detect, treat, and prevent vision loss in people with glaucoma. For example, researchers have discovered genes that could help explain how glaucoma damages the eye. NEI also is supporting studies to learn more about who is likely to get glaucoma, when to treat people who have increased eye pressure, and which treatment to use first.
******************************************************************************
2 : 0.6567313450002238 : Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. The most common form of the disease is open-angle glaucoma. With early treatment, you can often protect your eyes against serious vision loss. (Watch the video to learn more about glaucoma. To enlarge the vi

In [6]:
answers = ask_question("Are pets good to have around?")

print(answers[0][1])

for i in range(len(answers)): #an_index in indices[0]:
    answer_rank = answers[i][0]
    this_answer = answers[i][3]
    answer_distance = answers[i][2]
    
    print("******************************************************************************")
    print(answer_rank, ":", answer_distance,":", this_answer)

    

Are pets good to have around?
******************************************************************************
1 : 0.8237381398961968 : Summary : Pets can add fun, companionship and a feeling of safety to your life. Before getting a pet, think carefully about which animal is best for your family. What is each family member looking for in a pet? Who will take care of it? Does anyone have pet allergies? What type of animal suits your lifestyle and budget?    Once you own a pet, keep it healthy. Know the signs of medical problems. Take your pet to the veterinarian if you notice:       - Loss of appetite    - Drinking a lot of water    - Gaining or losing a lot of weight quickly    - Strange behavior    - Being sluggish and tired    - Trouble getting up or down    - Strange lumps
******************************************************************************
2 : 0.8884635748592732 : Summary : As parents, we want to keep our children safe from harm. Take steps to keep your children safe:      

In [7]:

# test the model accuracy

num_correct = 0
for i,r in train_data.iterrows():
    the_question = r['question']
    the_expected_answer = r['answer']
    
    answers = ask_question(the_question)
    
    for i in range(len(answers)): #an_index in indices[0]:
        answer_rank = answers[i][0]
        this_answer = answers[i][3]
        answer_distance = answers[i][2]
        
        if this_answer==the_expected_answer:
            #print("got it!!")
            num_correct+=1

print("training set accuracy: ", num_correct/train_data.shape[0], "  number correct:", num_correct)     


# training set accuracy:  0.941   number correct: 15449
# training set accuracy:  1.000   number correct: 16407 # possibly over trained <------ final model    

training set accuracy:  1.0000609533097646   number correct: 16407
