In [15]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import chromadb
from chromadb.config import Settings

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Hugging Face credentials: a local `config.py` must expose `api_key`
from config import api_key
from torch import cuda, bfloat16
import transformers

# Both aliases refer to the same key
apikey = api_key
hf_auth = apikey

# Use the currently selected CUDA device when one is available, otherwise the CPU
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

In [22]:
# Load the email summaries from disk and shape them for the vector database

import json

summaries_path = r'/home/scotsditch/stuff/scotsditch_storage/LLM/RAG/data/summaries.json'
with open(summaries_path) as summaries_file:
    summaries_data = json.load(summaries_file)

# One DataFrame row per entry under the top-level 'summaries' key
summary_df = pd.DataFrame(summaries_data['summaries'])


def _row_to_meta(row):
    """Build the metadata dict (id + summary text) stored next to one document."""
    return {
        'id': row['id'],
        'summary': row['summary'],
    }


# Metadata column consumed later by the vector db inserts
summary_df['meta'] = summary_df.apply(_row_to_meta, axis=1)

summary_df.head()


Unnamed: 0,id,summary,meta
0,1,An email with title: Long term contract- Data ...,"{'id': 1, 'summary': 'An email with title: Lon..."
1,2,An email with title: W2 Contract //Data Analys...,"{'id': 2, 'summary': 'An email with title: W2 ..."
2,3,An email with title: HBITS-05-12948 - Data War...,"{'id': 3, 'summary': 'An email with title: HBI..."
3,4,An email with title: I just left you a Voicem...,"{'id': 4, 'summary': 'An email with title: I ..."
4,5,An email with title: JD // Automation Engineer...,"{'id': 5, 'summary': 'An email with title: JD ..."


In [82]:
# creating custom embeddings with non-default embedding model

from chromadb import Documents, EmbeddingFunction, Embeddings

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by a non-default SentenceTransformer model.

    The model is loaded lazily on the first call and cached on the class, so
    repeated calls (every upsert and every query goes through ``__call__``)
    reuse the same weights instead of re-loading them each time.
    """

    MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
    CACHE_FOLDER = '/home/scotsditch/stuff/scotsditch_storage/LLM/weights/huggingface/hub/'

    # Populated by the first call to _get_model(); shared by all instances.
    _model = None

    @classmethod
    def _get_model(cls):
        """Return the cached SentenceTransformer, loading it on first use."""
        if cls._model is None:
            # Function-scope imports: the heavy dependencies are only needed
            # once, when the model is actually built.
            import torch
            from sentence_transformers import SentenceTransformer

            cls._model = SentenceTransformer(
                cls.MODEL_NAME,
                # Fall back to the CPU instead of failing on hosts without CUDA.
                device='cuda' if torch.cuda.is_available() else 'cpu',
                # NOTE(review): newer sentence-transformers releases appear to
                # prefer `token=` over `use_auth_token=` -- confirm against the
                # installed version before switching.
                use_auth_token=hf_auth,
                cache_folder=cls.CACHE_FOLDER,
            )
        return cls._model

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return one embedding (list of floats) per document.

        Args:
            input: The documents to embed. A single bare string is accepted
                and treated as a one-document batch.

        Returns:
            A list of embeddings, one list of floats per input document.
        """
        # Normalise to a list so encode() always returns a 2-D array and the
        # result is always a list of lists, even for a single string.
        sentences = [input] if isinstance(input, str) else list(input)

        embeddings = self._get_model().encode(sentences)

        # ndarray -> nested Python lists of floats
        return embeddings.tolist()

In [83]:
# Instantiate the custom embedding function used by the first collection
custom_embeddings = MyEmbeddingFunction()

# Smoke test: the embedding function is declared over `Documents` (a list of
# strings), so pass a one-element list rather than a bare string.
custom_embeddings(['this is a test.'])

[[-0.03300582990050316,
  0.13584616780281067,
  0.016182007268071175,
  -0.0022714489605277777,
  -0.02264948934316635,
  0.011405914090573788,
  -0.008481341414153576,
  -0.005180958658456802,
  -0.07104773074388504,
  -0.010902365669608116,
  0.04024652764201164,
  -0.06977316737174988,
  0.010477611795067787,
  -0.0295464638620615,
  -0.15845359861850739,
  -0.039165765047073364,
  0.04759972542524338,
  -0.12054993212223053,
  0.032062046229839325,
  0.03315838798880577,
  0.06646911054849625,
  -0.045023415237665176,
  0.07239864766597748,
  0.033379461616277695,
  -0.011803864501416683,
  -0.011123333126306534,
  0.014493635855615139,
  -0.02896473929286003,
  0.04176318645477295,
  -0.07552534341812134,
  0.02333850972354412,
  0.09401130676269531,
  0.09806432574987411,
  0.00938559789210558,
  -0.0054093520157039165,
  0.03693075478076935,
  0.006654756143689156,
  0.014179966412484646,
  0.038344115018844604,
  0.03971683606505394,
  0.03699067234992981,
  -0.079488009214401

In [84]:

# Open (or create) the persistent Chroma database at a relative path
chroma_client = chromadb.PersistentClient(path="RAG-Example-chroma-db")

# Trial collection whose documents are embedded with the custom embedding function
test_collection = chroma_client.get_or_create_collection(
    name="test_custom_embeddings",
    embedding_function=custom_embeddings,
)


In [93]:
# Insert every summary into the collection, keyed by its stringified id
doc_ids = [str(row_id) for row_id in summary_df['id'].tolist()]

test_collection.upsert(
    ids=doc_ids,
    documents=summary_df['summary'].tolist(),
    metadatas=summary_df['meta'].tolist(),
)

# Semantic search for the two closest summaries
qry_str = "Title contains Data Scientist"
db_query_results = test_collection.query(query_texts=qry_str, n_results=2)

# Take the metadata list for the first (and only) query text
top_hits = db_query_results['metadatas'][0]
result_summaries = [hit['summary'] for hit in top_hits]

result_summaries


["An email with title: Urgent || Data Scientist/Engineer || Location - Las Vegas, NV was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 06:54 AM PDT.  It was for the position of Data Scientist/Engineer.  It's location was Las Vegas, NV.  The employment type was contract.  It had the required skills: statistical programming languages, R, Python, sql, hive, pig, scala, java, C++, statistics, statistical tests, distributions, regression, maximum likelihood estimators, machine learning,k-Nearest Neighbors, Naive Bayes, SVM, Decision Forests, Data Wrangling, Data Visualization, matplotlib, ggplot, d3.js., Tableau, Communication Skills, Software Engineering, Problem-solving, analytical, degree.",
 "An email with title: Lead Data Scientist - O'Fallon, MO (Hybrid) was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 07:16 AM PDT.  It was for the position of Lead Data Scientist.  It's location was O'Fallon, MO (Hybrid).  The employment type was contract.  It had th

In [19]:
# Alternative: use Chroma's built-in SentenceTransformer embedding function
# with a non-default model whose weights are stored locally.

from chromadb.utils import embedding_functions

import os
import torch

MODEL_CACHE_DIR = '/home/scotsditch/stuff/scotsditch_storage/LLM/weights/huggingface/hub/'

# Kept from the original cell. On its own this has no effect here: transformers
# was already imported in the first cell, and sentence-transformers does not
# consult this variable when choosing its cache folder.
os.environ['TRANSFORMERS_CACHE'] = MODEL_CACHE_DIR

# sentence-transformers reads SENTENCE_TRANSFORMERS_HOME when a model is
# constructed without an explicit cache_folder, so this is what actually points
# the loader at the local weights. setdefault() respects a value already
# exported in the environment.
os.environ.setdefault('SENTENCE_TRANSFORMERS_HOME', MODEL_CACHE_DIR)

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L12-v2",
    # Fall back to the CPU instead of failing on hosts without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

  return self.fget.__get__(instance, owner)()


In [20]:
# Second trial collection, embedded with Chroma's built-in SentenceTransformer function
test_collection2 = chroma_client.get_or_create_collection(
    name="test2_custom_embeddings",
    embedding_function=sentence_transformer_ef,
)




In [23]:
# Insert the same summaries into the second collection
upsert_payload = {
    'ids': list(map(str, summary_df['id'].tolist())),
    'documents': summary_df['summary'].tolist(),
    'metadatas': summary_df['meta'].tolist(),
}
test_collection2.upsert(**upsert_payload)

# Run the same semantic search against the second collection
qry_str = "Title contains Data Scientist"
db_query_results = test_collection2.query(query_texts=qry_str, n_results=2)

# Collect the summary text from each hit of the first (and only) query text
result_summaries = []
for hit_meta in db_query_results['metadatas'][0]:
    result_summaries.append(hit_meta['summary'])

result_summaries

["An email with title: Urgent || Data Scientist/Engineer || Location - Las Vegas, NV was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 06:54 AM PDT.  It was for the position of Data Scientist/Engineer.  It's location was Las Vegas, NV.  The employment type was contract.  It had the required skills: statistical programming languages, R, Python, sql, hive, pig, scala, java, C++, statistics, statistical tests, distributions, regression, maximum likelihood estimators, machine learning,k-Nearest Neighbors, Naive Bayes, SVM, Decision Forests, Data Wrangling, Data Visualization, matplotlib, ggplot, d3.js., Tableau, Communication Skills, Software Engineering, Problem-solving, analytical, degree.",
 "An email with title: Lead Data Scientist - O'Fallon, MO (Hybrid) was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 07:16 AM PDT.  It was for the position of Lead Data Scientist.  It's location was O'Fallon, MO (Hybrid).  The employment type was contract.  It had th

In [24]:
# Add the semantic search results as context for the prompt.
#
# LLM_prompt_context is the fixed preamble: instructions, two worked example
# summaries, and the list of titles expected for those examples.

LLM_prompt_context="""You are helping review summaries of emails and answer questions related to the summaries.

Here are two example summaries of emails that were sent to job seeker Scot Shields:

An email with title: Long term contract- Data Engineer (Lambda and SQL). was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 11:46 AM PDT.  It was for the position of Data Engineer.  It's location was Remote.  The employment type was Long Term contract.  It had the required skills: AWS, Matillion, Snowflake, Lambda, SQL.
An email with title: W2 Contract //Data Analyst // Remote (Only PST Candidate ) was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 11:40 AM PDT.  It was for the position of Data Analyst.  It's location was Remote ( West Coast).  The employment type was Contract.  It had the required skills: SQL, Azure, Power BI, DataBricks, Elicit Requirements, Analytics, Reporting, healthcare, TSQL, Power BI, Data Visualization, Synapse, NLP, R, Python, AI.

And here is a comma separated list of the titles from the example summaries above:

['Long term contract- Data Engineer (Lambda and SQL).','W2 Contract //Data Analyst // Remote (Only PST Candidate )']

These are additional summaries of emails that were sent to job seeker Scot Shields: """

# Append every retrieved summary, each preceded by a blank line and a space.
# Building the string in one expression means additional_context is always
# (re)assigned, even when the search returned no summaries; the previous loop
# left it undefined (or stale from an earlier run) in that case.
additional_context = LLM_prompt_context + "".join(
    "\n\n " + summary for summary in result_summaries
)

print(additional_context)




You are helping review summaries of emails and answer questions related to the summaries.

Here are two example summaries of emails that were sent to job seeker Scot Shields:

An email with title: Long term contract- Data Engineer (Lambda and SQL). was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 11:46 AM PDT.  It was for the position of Data Engineer.  It's location was Remote.  The employment type was Long Term contract.  It had the required skills: AWS, Matillion, Snowflake, Lambda, SQL.
An email with title: W2 Contract //Data Analyst // Remote (Only PST Candidate ) was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 11:40 AM PDT.  It was for the position of Data Analyst.  It's location was Remote ( West Coast).  The employment type was Contract.  It had the required skills: SQL, Azure, Power BI, DataBricks, Elicit Requirements, Analytics, Reporting, healthcare, TSQL, Power BI, Data Visualization, Synapse, NLP, R, Python, AI.

And here is a comma sep

In [25]:
# Append the actual question after the few-shot preamble and retrieved summaries

LLM_query = "Please create a comma separated list of the titles of all emails who's titles contained the phrase 'Data Scientist' sent to job seeker Scot Shields."

# Context and question are separated by one blank line
LLM_prompt = "\n\n".join([additional_context, LLM_query])

print(LLM_prompt)

You are helping review summaries of emails and answer questions related to the summaries.

Here are two example summaries of emails that were sent to job seeker Scot Shields:

An email with title: Long term contract- Data Engineer (Lambda and SQL). was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 11:46 AM PDT.  It was for the position of Data Engineer.  It's location was Remote.  The employment type was Long Term contract.  It had the required skills: AWS, Matillion, Snowflake, Lambda, SQL.
An email with title: W2 Contract //Data Analyst // Remote (Only PST Candidate ) was sent to job seeker Scot Shields on Tuesday, August 22, 2023 at 11:40 AM PDT.  It was for the position of Data Analyst.  It's location was Remote ( West Coast).  The employment type was Contract.  It had the required skills: SQL, Azure, Power BI, DataBricks, Elicit Requirements, Analytics, Reporting, healthcare, TSQL, Power BI, Data Visualization, Synapse, NLP, R, Python, AI.

And here is a comma sep

In [27]:
# using openai LLM to generate response

from config import openai_apikey




def openai_response(prompt, apikey):
    """Send ``prompt`` to the OpenAI completions endpoint and return the text.

    Args:
        prompt: The full prompt string to complete.
        apikey: OpenAI API key used to authenticate this request.

    Returns:
        The text of the first completion choice, with surrounding whitespace
        stripped.
    """
    from openai import OpenAI

    # The key has to be handed to the client constructor. Assigning it to
    # `OpenAI.api_key` after the client was built (as before) never reached the
    # client, so the `apikey` argument was silently ignored.
    client = OpenAI(api_key=apikey)

    completion = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=1000,
        temperature=0,
    )

    return completion.choices[0].text.strip()



In [28]:
# Send the assembled RAG prompt to OpenAI; the returned text is the cell output
test_prompt = LLM_prompt

openai_response(test_prompt, openai_apikey)

"['Urgent || Data Scientist/Engineer || Location - Las Vegas, NV', 'Lead Data Scientist - O\\'Fallon, MO (Hybrid)']"