In [176]:
import os
import pickle as pkl
import textwrap
from typing import List, Optional, Union

import google.generativeai as genai
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [178]:
# Shared sentence-embedding model: used to encode both the stored questions and incoming queries.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [181]:
# create question embedding
def embed_questions(questions: List[str], location: str) -> Optional[str]:
    """Encode ``questions`` with the module-level ``model`` and save the vectors to disk.

    Args:
        questions: Question strings to embed.
        location: Target path handed to ``np.save``. NumPy appends ``.npy``
            when the name does not already end with it.

    Returns:
        ``None`` on success (a confirmation line is printed). If encoding or
        saving fails, the exception is not raised; an
        ``"Error encountered ..."`` message is returned instead.
    """
    try:
        ques_embeddings = model.encode(questions)
        # allow_pickle=False guarantees a plain numeric .npy that can later be
        # loaded without enabling pickle. The former fix_imports=True argument
        # is dropped: NumPy has ignored it since 1.17 and newer releases warn
        # that it is deprecated.
        np.save(location, ques_embeddings, allow_pickle=False)
    except Exception as e:
        return f"Error encountered {e}"
    print('Questions successfully embedded')
    return None

In [189]:
# Q&A table; its 'Questions' column is the text that gets embedded.
df= pd.read_csv('qna.csv')

In [188]:
# Embed every question and persist the vectors; np.save adds the '.npy' suffix to the given name.
embed_questions(df['Questions'].to_list(),'ques_embedding')

Questions successfully embedded


In [195]:
# load embeddings
# allow_pickle is left at its default (False): enabling it lets np.load
# unpickle object arrays, which can execute arbitrary code from the file.
# Assumes the saved embeddings are a plain numeric array - TODO confirm.
ques_embeddings = np.load('ques_embedding.npy')

# load qna data
df = pd.read_csv('qna.csv')

# Similarity scores are matched to rows of df by position, so both must have
# the same length; fail here rather than on every query.
assert len(ques_embeddings) == len(df), (
    f"{len(ques_embeddings)} embeddings for {len(df)} Q&A rows; "
    "re-run embed_questions on the current qna.csv"
)

# retrieve the stored Q&A rows whose question is similar to a query
def get_similar_queries(query: str, ques_df: pd.DataFrame,
                        threshold: float = 0.5) -> Union[List[dict], str]:
    """Return the rows of ``ques_df`` whose question embedding is close to ``query``.

    The query is encoded with the module-level ``model`` and compared, by
    cosine similarity, against the module-level ``ques_embeddings``. Row ``i``
    of ``ques_embeddings`` is taken to describe row ``i`` of ``ques_df``.

    Args:
        query: Free-text query.
        ques_df: Q&A table, row-aligned with ``ques_embeddings``.
        threshold: Rows are kept when their similarity is strictly greater
            than this value.

    Returns:
        A list with one ``{column: value}`` dict per matching row (empty when
        nothing matches). If anything fails, the exception is not raised; an
        ``"Error encountered : ..."`` message is returned instead.
    """
    try:
        emb_query = model.encode(query)
        # cosine_similarity yields shape (1, n_rows) for a single query; take
        # that one row so the mask below is a 1-D boolean row selector.
        scores = cosine_similarity([emb_query], ques_embeddings)[0]
        mask = scores > threshold
        return ques_df[mask].to_dict(orient='records')
    except Exception as err:
        return f"Error encountered : {err}"

In [208]:
# Off-topic query: no stored question scores above the 0.5 threshold, so the result is empty.
query = '''Blueberry Pudding Recepie'''
response = get_similar_queries(query,df,0.5)
response

[]

In [205]:
# On-topic query: returns the matching question/answer rows as a list of dicts.
query = '''Data Lake'''
response = get_similar_queries(query,df, 0.5)
response

[{'Questions': 'What is a cloud-based data lake?',
  'Answers': 'A cloud-based data lake is a centralized repository that allows you to store all your structured and unstructured data at any scale.'}]

### Using an LLM to improve the answers

In [209]:
# Read the key from the environment so the secret is never stored in the notebook.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before starting the notebook kernel."
    )
genai.configure(api_key=GOOGLE_API_KEY)

In [234]:
def generate_llm_answers(query: str, threshold: float = 0.5) -> str:
    """Answer ``query`` with the LLM, using only the retrieved Q&A rows as context.

    Rows are retrieved from the module-level ``df`` with
    ``get_similar_queries`` and inserted into a prompt that tells the model to
    answer from that context alone.

    Args:
        query: Free-text user query.
        threshold: Similarity cut-off forwarded to ``get_similar_queries``.

    Returns:
        The content of the last message in the chat response. When retrieval
        finds nothing, the fixed "I do not have the required details..."
        sentence is returned without calling the LLM. When retrieval reports
        an error (a string instead of a list), that error message is returned
        as-is.
    """
    fallback = "I do not have the required details to answer the query."

    prompt = '''
                I will provide you with a context consisting of questions and answers.
                You should use only the provided context to answer the query.
                Do not use any outside knowledge or make up your own answers.

                Context:
                {context}

                Query:
                {query}

                Instructions:
                - If the query can be answered using the context, provide the answer.
                - If the query cannot be answered using the context, respond with "{fallback}"
                '''
    response = get_similar_queries(query, df, threshold)

    # get_similar_queries signals failure by returning a message string;
    # never pass that to the model as if it were context.
    if isinstance(response, str):
        return response

    # With no context the only answer the prompt allows is the fallback, so
    # skip the API call (and the chance of an answer from outside knowledge).
    if not response:
        return fallback

    prompt_ = prompt.format(context=response, query=query, fallback=fallback)
    llm_response = genai.chat(prompt=prompt_, temperature=0)
    return llm_response.messages[-1]['content']


In [235]:
# NOTE(review): the recorded output names cloud vendors and lists advantages, and it ends mid-sentence.
# If retrieval returned only the single data-lake Q&A row, the model is adding outside knowledge
# despite the prompt - verify what context was actually passed.
print(generate_llm_answers('Data Lakes'))

A data lake is a centralized repository that allows you to store all your structured and unstructured data at any scale. It is a cloud-based data storage solution that can be used to store any type of data, including text, images, videos, and audio. Data lakes are often used by businesses to store and analyze large amounts of data.

Data lakes are a relatively new technology, and there are a number of different vendors that offer data lake solutions. Some of the most popular data lake vendors include Amazon Web Services (AWS), Microsoft Azure, and Google Cloud Platform.

Data lakes offer a number of advantages over traditional data warehouses. First, data lakes are designed to store any type of data, including unstructured data. This makes them ideal for storing data from a variety of sources, such as social media, sensors, and IoT devices. Second, data lakes are designed to be scalable. This means that they can be easily expanded to store more data as your business grows. Third, data 

In [238]:
# Off-topic query: the recorded answer is the fallback sentence spelled out in the prompt.
print(generate_llm_answers('Blueberry Pie Recepie'))

I do not have the required details to answer the query.
