In [235]:
import dotenv
from transformers import LongformerTokenizer, LongformerModel
import torch
from langchain_pinecone import PineconeVectorStore
from langchain_anthropic import ChatAnthropic
from langchain import LLMChain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain

# Load environment variables from a local .env file (if present) into the process environment.
# NOTE(review): no API keys are passed explicitly to the Pinecone or Anthropic clients
# further down, so they presumably rely on variables loaded here — confirm the names required.
dotenv.load_dotenv()

True

In [156]:
# Pull the pretrained Longformer checkpoint: tokenizer first, then the encoder weights.
# Both come from the same checkpoint id, so it is named once.
LONGFORMER_CHECKPOINT = 'allenai/longformer-large-4096'

tokenizer = LongformerTokenizer.from_pretrained(LONGFORMER_CHECKPOINT)
model = LongformerModel.from_pretrained(LONGFORMER_CHECKPOINT)

In [158]:
# Define custom embedding class exposing both embed_query and embed_documents
class LongformerEmbedding:
    """Turn text into a fixed-size vector by mean-pooling Longformer hidden states.

    The class is duck-typed against the LangChain embeddings interface, which
    expects two methods: ``embed_query`` (one text -> one vector) and
    ``embed_documents`` (many texts -> many vectors). Vector stores call the
    former when searching and the latter when adding texts.

    Args:
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)``
            and expected to return a mapping of keyword arguments for ``model``.
        model: Callable encoder whose output exposes ``last_hidden_state``.
    """

    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def _embed(self, text):
        """Encode a single text and return its mean-pooled embedding as a list of floats."""
        # Inputs longer than 4096 tokens are truncated rather than rejected.
        inputs = self.tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean pooling over the token axis (dim=1), then drop only the batch axis.
        # squeeze(0) rather than squeeze() so the result is always a 1-D list,
        # even if the hidden size happened to be 1.
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        return pooled.tolist()

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text: The text to embed.

        Returns:
            list[float]: The mean-pooled embedding vector.
        """
        return self._embed(text)

    def embed_documents(self, texts):
        """Embed several texts, one vector per text, preserving input order.

        Each text is encoded on its own so that no padding tokens enter the
        mean pooling; an empty input yields an empty list.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            list[list[float]]: One embedding per input text.
        """
        return [self._embed(text) for text in texts]

# Wrap the loaded tokenizer/model pair in the embedding adapter
longformer_embedder = LongformerEmbedding(tokenizer=tokenizer, model=model)

In [210]:
# Name of the existing Pinecone index to attach to
index_name = "hubspot-crm-txts"

# Open the index as a LangChain vector store, embedding with the Longformer adapter
vector_store = PineconeVectorStore.from_existing_index(
    embedding=longformer_embedder,
    index_name=index_name,
)

In [293]:
# Chat model that will write the HubSpot code (low temperature for stable output)
llm = ChatAnthropic(temperature=0.1, model='claude-3-opus-20240229')

# Prompt text: {text} receives the retrieved context, {question} the user's request
template = """You are a helpful assistant that generates HubSpot API code based on the provided context.
Always prioritize information from the context when available.

Context: {text}

Generate the Python code using the HubSpot Client Library with no comments to answer the following question.
Use your general knowledge as a helpful assistant if no specific context is provided.
Only return code, no additional text. Use the HubSpot Python library where possible.
You will be provided with the access_token so be sure to use it. 
Note that the returned response from your code should be a json object, do not parse it. 
Your final line should be: print(response), where response is the json object returned from your API call.
Do not add a 'limit' parameter within the response unless explicitly asked.
If you are asked to filter the data by a specific property, you can create a Filter from the Hubspot Python library. 
Don't forget to import the proper library as well based on the HubSpot object in question.
Be sure to wrap the code in a try catch block and print the error if any.

Question: {question}"""

# Bind the two placeholders declared in the template
prompt = PromptTemplate(
    input_variables=["text", "question"],
    template=template,
)

In [294]:
# Retriever returning the 3 nearest documents from the vector store
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# RAG chain: retrieved documents are "stuffed" into the prompt's {text} slot
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={
        "document_variable_name": "text",
        "prompt": prompt,
    },
)

In [295]:
# Run one question through the chain and show the generated answer text
query = "what company has the website hubspot.com"
response = rag_chain.invoke(query)
generated_code = response['result']
print(generated_code)

try:
    from hubspot import HubSpot
    from hubspot.crm.companies import ApiException, Filter, FilterGroup, PublicObjectSearchRequest
    
    hubspot = HubSpot(access_token='your_access_token')
    
    filter = Filter(property_name="domain", operator="EQ", value="hubspot.com")
    filter_group = FilterGroup(filters=[filter])
    public_object_search_request = PublicObjectSearchRequest(filter_groups=[filter_group])
    
    response = hubspot.crm.companies.search_api.do_search(public_object_search_request=public_object_search_request)
    
    print(response)
except ApiException as e:
    print("Error: {0}".format(e))
