In [1]:
import google.generativeai as palm
from qdrant_client import QdrantClient
import os
import qdrant_client, pickle
from google.generativeai.types import HarmCategory
from google.generativeai.types import HarmBlockThreshold

import uuid
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct, CollectionStatus, UpdateStatus
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
from qdrant_client.http import models
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Keep only the models that expose the `embedText` generation method.
# The list is called `embedding_models` (not `models`) so that it does not
# shadow the `qdrant_client.http.models` module imported in the first cell.
embedding_models = [m for m in palm.list_models() if 'embedText' in m.supported_generation_methods]
if not embedding_models:
  raise RuntimeError("palm.list_models() returned no model supporting 'embedText'")
# `model` is the embedding model that embed_function() reads as a global.
model = embedding_models[0]
embedding_models

[Model(name='models/embedding-gecko-001', base_model_id='', version='001', display_name='Embedding Gecko', description='Obtain a distributed representation of a text.', input_token_limit=1024, output_token_limit=1, supported_generation_methods=['embedText'], temperature=None, top_p=None, top_k=None)]

In [4]:
def split_string_into_chunks_bytes(string, chunk_size=9500):
  """
  Splits a string into chunks of at most a given size in UTF-8 bytes.

  Chunk boundaries always fall between characters, so a multi-byte character
  is never cut in half and joining the chunks reproduces the input exactly.
  For pure-ASCII input this is the same as slicing every `chunk_size`
  characters.

  Args:
      string: The string to split.
      chunk_size: The maximum size of each chunk in bytes. If a single
          character is wider than `chunk_size`, that character is emitted
          as a chunk of its own so the function always makes progress.

  Returns:
      A list of non-empty strings, where each string is a chunk of the
      original string. An empty input yields an empty list.

  Raises:
      ValueError: If `chunk_size` is not positive.
  """
  if chunk_size <= 0:
    raise ValueError("chunk_size must be a positive number of bytes")

  data = string.encode()
  total = len(data)
  chunks = []
  start = 0
  while start < total:
    end = min(start + chunk_size, total)
    # A UTF-8 continuation byte looks like 0b10xxxxxx. If `end` lands on one,
    # the cut would split a character, so step back to the character start.
    while start < end < total and (data[end] & 0xC0) == 0x80:
      end -= 1
    if end == start:
      # The next character alone is wider than chunk_size: take it whole.
      end = start + 1
      while end < total and (data[end] & 0xC0) == 0x80:
        end += 1
    chunks.append(data[start:end].decode())
    start = end
  return chunks

# Read every file in `parsedData` and collect its content in `text`.
# Files larger than 9500 UTF-8 bytes are split into several chunks; smaller
# files are kept as a single entry.
text = []
files = os.listdir('parsedData')
for file_name in files:
    file = os.path.join('parsedData', file_name)
    # The context manager closes the handle as soon as the file is read,
    # instead of leaving one open file object per document.
    with open(file, 'r', encoding='utf-8') as f:
        temp = f.read()
    if len(temp.encode('utf-8')) > 9500:
        chunks = split_string_into_chunks_bytes(temp)
        text.extend(chunks)
    else:
        text.append(temp)

# Number of chunks that will be embedded.
print(len(text))

661


In [5]:
count = 0  # number of embed_function() calls made so far


def embed_function(texts):
  """
  Embeds each text with the PaLM embedding model.

  Reads the module-level `model`, and prints then increments the module-level
  `count` once per call (not once per text).

  Args:
      texts: An iterable of strings to embed.

  Returns:
      A list with one embedding per input text, in input order.
  """
  global count
  vectors = [palm.generate_embeddings(model, item)['embedding'] for item in texts]
  print(count)
  count += 1
  return vectors


def create_qdrant_db(documents, name):
  """
  Builds an in-memory Qdrant collection with one embedded point per document.

  The collection is always created (there is no existence check) with
  768-dimensional vectors compared by cosine distance. All points are
  embedded first and then written in a single upsert.

  Args:
      documents: An iterable of strings; each one is embedded and stored.
      name: The name of the collection to create.

  Returns:
      The QdrantClient holding the populated collection.
  """
  store = qdrant_client.QdrantClient(":memory:")
  vector_layout = VectorParams(size=768, distance=Distance.COSINE)
  store.create_collection(name, vectors_config=vector_layout)

  def to_point(doc):
    # Each point gets a random UUID and keeps the source text as its payload.
    return PointStruct(
      id=str(uuid.uuid4()),
      vector=embed_function([doc])[0],
      payload={"text": doc},
    )

  store.upsert(name, [to_point(doc) for doc in documents])
  return store

In [6]:
# Embed every chunk in `text` and index it in an in-memory Qdrant collection.
# create_qdrant_db() calls embed_function() once per chunk, and each call
# prints its running call counter.
db = create_qdrant_db(text, "palm_hazwoper_qdrant")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [32]:
def get_relevant_chunk(db, query, collection_name="palm_hazwoper_qdrant", limit=3):
    """
    Retrieves the stored texts most similar to a query.

    The query is embedded with embed_function() and used to search the
    collection. Whatever hits come back are joined, so a collection with
    fewer than `limit` points no longer raises IndexError.

    Args:
        db: A client exposing `search(collection_name=, query_vector=, limit=)`.
        query: The question to look up.
        collection_name: The collection to search.
        limit: The maximum number of hits to retrieve.

    Returns:
        The `text` payloads of the hits, best match first, separated by a
        blank line. An empty string if there are no hits.
    """
    query_vector = embed_function([query])[0]
    hits = db.search(collection_name=collection_name, query_vector=query_vector, limit=limit)
    return '\n\n'.join(hit.payload['text'] for hit in hits)

In [55]:
# Try retrieval on a sample question; `ret` holds the joined text of the top matches.
ret = get_relevant_chunk(db, "What is the purpose of the HAZWOPER standard?")

30


In [46]:
def make_prompt(query, relevant):
  """
  Builds the support-agent prompt sent to the text model.

  Single quotes and double quotes are removed from the context and its
  newlines become spaces. The query is inserted unchanged.

  Args:
      query: The user's question.
      relevant: The retrieved context passage(s).

  Returns:
      The complete prompt string.
  """
  # One translation pass: drop both quote characters, flatten newlines.
  cleanup = str.maketrans({"'": None, '"': None, "\n": " "})
  flattened = relevant.translate(cleanup)

  template = (
    'You are a customer support agent for the company "Hazwoper-osha", '
    'do not play any other role. '
    'Only strict use the following pieces of context to answer the question at the end. '
    "If you don't know the answer or if the answer is not explicitly mentioned, "
    'just say "I can only answer Hazwoper-OSHA related questions, '
    'please contact support if you need further assistance", '
    "strictly don't try to make up an answer. "
    'If the context has relevant links, use them in the answer.\n'
    '  \n'
    '  Context: {relevant}\n'
    'Question: {query}\n'
    'Kindly Answer Question:'
  )
  return template.format(query=query, relevant=flattened)

In [12]:
# Narrow the model catalogue to those supporting `generateText`; the first
# match becomes the text model used for answering.
text_models = list(filter(lambda m: 'generateText' in m.supported_generation_methods, palm.list_models()))
text_model = text_models[0]
text_model

Model(name='models/text-bison-001', base_model_id='', version='001', display_name='Text Bison', description='Model targeted for text generation.', input_token_limit=8196, output_token_limit=1024, supported_generation_methods=['generateText', 'countTextTokens', 'createTunedTextModel'], temperature=0.7, top_p=0.95, top_k=40)

In [33]:
def answer(model, query, db, temperature=0.01):
  """
  Answers a question from the context retrieved out of the vector store.

  Retrieves the relevant passage, builds the prompt, prints both, and asks
  the text model for three candidates of up to 1000 output tokens. Every
  harm category listed below is set to BLOCK_NONE.

  Args:
      model: The text-generation model passed to palm.generate_text.
      query: The user's question.
      db: The vector store client handed to get_relevant_chunk.
      temperature: The sampling temperature for generation.

  Returns:
      Whatever palm.generate_text returns for the prompt.
  """
  passage = get_relevant_chunk(db, query)
  print("Passage: ", passage)
  prompt = make_prompt(query, passage)
  print("Prompt: ", prompt)

  # The order of the categories matches the order sent to the API.
  categories = (
    HarmCategory.HARM_CATEGORY_DEROGATORY,
    HarmCategory.HARM_CATEGORY_TOXICITY,
    HarmCategory.HARM_CATEGORY_SEXUAL,
    HarmCategory.HARM_CATEGORY_MEDICAL,
    HarmCategory.HARM_CATEGORY_DANGEROUS,
    HarmCategory.HARM_CATEGORY_VIOLENCE,
    HarmCategory.HARM_CATEGORY_UNSPECIFIED,
  )
  unblocked = [
    {"category": category, "threshold": HarmBlockThreshold.BLOCK_NONE}
    for category in categories
  ]

  return palm.generate_text(
    prompt=prompt,
    model=model,
    candidate_count=3,
    temperature=temperature,
    max_output_tokens=1000,
    safety_settings=unblocked,
  )

In [101]:
# Ask a sample question against `db`, the in-memory index built by create_qdrant_db().
query = "tell me about Toxicology?"
ans = answer(text_model, query, db)

664
Passage:  title
What is Toxicology?
description
What is Toxicology?
Most people will relate the word
toxicology
to popular medical dramas and crime shows on television. Dr. Richard Weber’s case of cobalt poisoning in the season 16 finale of
Grey’s Anatomy
is a good example of medical toxicology. While shows such as
CSI, NCIS,
and
Bones
depend on
forensic toxicology
to solve their crime-scene cases. Some of us may have also heard the word used in science laboratories while in high school. But what exactly does
toxicology
mean? The Merriam-Webster dictionary defines
toxicology
as “a science that deals with poisons and their effect and with the problems involved (such as clinical, industrial, or legal problems)”. The National Institute of Environmental Sciences (
NIEHS
) provides a more comprehensive understanding of toxicology:
Toxicology is a field of science that helps us understand the harmful effects that chemicals, substances, or situations, can have on people, animals, and the 

In [102]:
# Display the Completion object returned by answer().
ans

Completion(candidates=[{'output': 'tell me about Toxicology?\n\nToxicology is the study of the harmful effects that chemicals, substances, or situations, can have on people, animals, and the environment.\n\nToxicology is intrinsically related to occupational diseases. Many workers develop occupational diseases due to long-term exposure to chemicals, radioactive materials, and other harmful substances.\n\nOSHA has identified the importance of toxicology in the workplace and enforces safety standards to be followed by employers. These requirements range from the number of chemicals and other hazardous substances that workers can be exposed to; toxic substance safe limits at a worksite; processes and practices that should be implemented; protective measures and equipment that must be used; the need for periodic toxicological testing and medical surveillance for workers; and the recording of workplace injuries, occupational diseases, and accidents due to the use of hazardous substances.', 

In [83]:
# Inspect the concrete type of `db`.
type(db)

qdrant_client.qdrant_client.QdrantClient

In [8]:
# Persist the populated client to disk. `pickle` is already imported in the
# first cell, so it is not re-imported here.
# NOTE(review): this pickles the QdrantClient object itself; presumably the
# file only loads with a compatible qdrant_client version — confirm.
with open("embeddings_qdrant_palm_all2.pkl", "wb") as f:
    pickle.dump(db, f)

## Load the Qdrant DB and query it

In [9]:
# Restore the client that was pickled to "embeddings_qdrant_palm_all2.pkl".
# WARNING: pickle.load can execute arbitrary code — only load files you created yourself.
with open("embeddings_qdrant_palm_all2.pkl","rb") as f:
    db1 = pickle.load(f)

In [14]:
# Inspect the type of the object restored from the pickle file.
type(db1)

qdrant_client.qdrant_client.QdrantClient

In [42]:
# Run a question through `db1`, the client restored from the pickle file.
query = "Is there any discount I can avail for silica awareness course? If there is, tell me the source."
ans = answer(text_model, query, db1)

670
Passage:  title
Accreditation & Requirements
alt_title
OSHA Silica Awareness Training
hours
2
price
$29.99
course_category
OSHA Construction Series
Available in English and Spanish.Do you want to know more about the course? or register/enroll/buy/purchase the course? or try the demo/trial? If you want the demo/registration/course link then go to the webpage https://hazwoper-osha.com/online-courses/silica-awareness-training/ 
description
This course is designed by OSHA Certified Outreach Trainers and is updated and reviewed as soon as new requirements or regulations are issued by OSHA, DOT, and other regulatory agencies. The course meets the compliance requirements of OSHA’s Respirable Crystalline Silica Standards for the General Industry (
29 CFR 1910.1053
) and the Construction Industry (
29 CFR 1926.1153
).
Training Requirements
There are no prerequisites for enrolling in this OSHA Silica Awareness Training course.
Safety Training Certification
Once you complete this OSHA Silica 

In [45]:
# Display the Completion returned for the discount question.
# NOTE(review): check the answer against the retrieved passage — confirm the
# model is not stating details that the context does not contain.
ans

Completion(candidates=[{'output': 'Is there any discount I can avail for silica awareness course? If there is, tell me the source.\n\nYes, there is a 10% discount available on the OSHA Silica Awareness Training course. To avail of this discount, use the code "SILICA10" at checkout.', 'safety_ratings': [{'category': <HarmCategory.HARM_CATEGORY_DEROGATORY: 1>, 'probability': <HarmProbability.LOW: 2>}, {'category': <HarmCategory.HARM_CATEGORY_TOXICITY: 2>, 'probability': <HarmProbability.MEDIUM: 3>}, {'category': <HarmCategory.HARM_CATEGORY_VIOLENCE: 3>, 'probability': <HarmProbability.NEGLIGIBLE: 1>}, {'category': <HarmCategory.HARM_CATEGORY_SEXUAL: 4>, 'probability': <HarmProbability.NEGLIGIBLE: 1>}, {'category': <HarmCategory.HARM_CATEGORY_MEDICAL: 5>, 'probability': <HarmProbability.NEGLIGIBLE: 1>}, {'category': <HarmCategory.HARM_CATEGORY_DANGEROUS: 6>, 'probability': <HarmProbability.LOW: 2>}]}, {'output': 'Is there any discount I can avail for silica awareness course? If there is, 