In [1]:
# Using Langchain to upload documentation to a Pinecone index:

# Initial Imports
import os
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from git import Repo
import openai
from langchain.document_loaders import GitLoader
import pinecone
from langchain.vectorstores import Pinecone


# Load the .env file
load_dotenv()

# Load the OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")
openai_org = os.getenv("OPENAI_ORG")

pinecone_key = os.getenv("PINECONE_KEY2")
pinecone_env = os.getenv("PINECONE_ENV2")

  from tqdm.autonotebook import tqdm


In [3]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [3]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("vlocker.pdf")
pages = loader.load()

In [4]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key = openai.api_key
)

In [5]:
docs=pages

In [7]:
index = Pinecone.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name="vocalockr-bplan"
)

In [2]:
pinecone.init(api_key = pinecone_key, environment=pinecone_env) # Initialize pinecone
pinecone.list_indexes()

['vocalockr-bplan']

In [309]:
from git import Repo

repo = Repo.clone_from(
    "https://github.com/tiangolo/fastapi", to_path="./example_data/test_repo1"
)
branch = repo.head.reference

In [262]:
from langchain.document_loaders import TextLoader
loader = TextLoader("openai.yaml")

In [221]:
from langchain.document_loaders import PythonLoader

In [326]:
from langchain.document_loaders import DirectoryLoader, TextLoader
# Initialize the document loader.  Use the glob
loader = DirectoryLoader('../../bakespace_fastapi/', loader_cls=PythonLoader, glob='**/*.py')

In [8]:
for doc in docs:
    print(doc.page_content)

CONFIDENTIALVocalockr Your Voice, Your Signature, Your LegacyBusiness PlanPrepared August 2023
Contact InformationJoel Kaiser vocalockr@gmail.com5714656108www.vocalockr.com
Vocalockr 1
CONFIDENTIAL - DO NOT DISSEMINATE. This business plan contains confidential, trade-secret 
information and is shared only with the understanding that you will not share its contents or ideas with 
third parties without the express written consent of the plan author.Executive SummaryOpportunityProblemVocal technology is advancing rapidly, but protections for artists have lagged 
behind. There are currently few regulations preventing unauthorized vocal 
replication or ensuring fair compensation when an artist's voiceprint is used. 
Vocalockr seeks to empower singers by giving them full control over if, when, and 
how their voice is used.Our platform solves five key problems facing artists today:1.Deepfake Risks: Artificial intelligence can now create synthetic media, 
including audio, that manipulates or g

In [None]:
pinecone.init(api_key = pinecone_key, environment=pinecone_env) # Initialize pinecone

In [8]:
from langchain.embeddings.openai import OpenAIEmbeddings

embed = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key = openai.api_key,
    disallowed_special=()
)


In [9]:
# Create a chroma db from the documents
from langchain.vectorstores import Chroma

vectorstore = Chroma.from_documents(docs, embed)

Unable to connect optimized C data functions [No module named 'clickhouse_connect.driverc.buffer'], falling back to pure Python


In [10]:
# For each doc in docs, create a list of tuples, where each tuple is doc.page_content, doc.metadata['source']
docs_list = [(doc.page_content, doc.metadata['source']) for doc in docs]

In [11]:
# Determine the token length of each doc
docs_token_length = [tiktoken_len(doc[0]) for doc in docs_list]

# Sum the token lengths
sum(docs_token_length)

16402

In [3]:
from langchain.vectorstores import Pinecone

text_field = "text"

index = pinecone.Index("coding-assist")



In [25]:
index = Pinecone.from_documents(docs, embed, index_name="vocalockr-bplan")

TypeError: expected string or bytes-like object

In [4]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.1,
 'namespaces': {'documentation': {'vector_count': 1931}},
 'total_vector_count': 1931}

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings

embed = OpenAIEmbeddings(
    openai_api_key=openai.api_key,
    openai_organization=openai_org,
)


vectorstore = Pinecone(
    index, embed.embed_query, text_field, namespace="documentation"
)



In [7]:
query = "How can I create a multi-tool agent using the langchain library?"

answers = vectorstore.similarity_search(
    namespace="documentation",
    query=query,
    k=3
)

In [9]:
for answer in answers:
    print(answer.page_content)

"""Agent toolkits."""

from langchain.agents.agent_toolkits.azure_cognitive_services.toolkit import (
    AzureCognitiveServicesToolkit,
)
from langchain.agents.agent_toolkits.csv.base import create_csv_agent
from langchain.agents.agent_toolkits.file_management.toolkit import (
    FileManagementToolkit,
)
from langchain.agents.agent_toolkits.gmail.toolkit import GmailToolkit
from langchain.agents.agent_toolkits.jira.toolkit import JiraToolkit
from langchain.agents.agent_toolkits.json.base import create_json_agent
from langchain.agents.agent_toolkits.json.toolkit import JsonToolkit
from langchain.agents.agent_toolkits.nla.toolkit import NLAToolkit
from langchain.agents.agent_toolkits.office365.toolkit import O365Toolkit
from langchain.agents.agent_toolkits.openapi.base import create_openapi_agent
from langchain.agents.agent_toolkits.openapi.toolkit import OpenAPIToolkit
from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.agents.agent_too

In [11]:
user_question = "What are the highlights of next year's revenue projections?"
context = vectorstore.similarity_search(
    query=user_question,
    k=3
)

In [14]:
# Create a list of tuples from the context
context_list = [(doc.page_content, doc.metadata['source'], doc.metadata['page']) for doc in context]

In [19]:
# Create a list of the page content from the context
context_page_content = [doc[0] for doc in context_list]

def get_bplan_response(question: str, context: list):
    messages = [
        {
            "role": "system", "content": f"""You are a master busines advisor
            and start-up strategist answering a question {question} about 
            an early stage company's business plan.  The relevant information
            from the business plan is {context}."""
        },
        {
            "role": "user", "content": f"""Please answer my {question} about the 
            business plan."""
        },
    ]
    models = ["gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613, gpt-3.5-turbo"] # Set list of models to iterate through
    for model in models:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages = messages,
                max_tokens=500,
                frequency_penalty=0.5,
                presence_penalty=0.5,
                temperature=1,
                n=1
            )
            answer = response.choices[0].message.content

            return answer
        except Exception as e:
            print(e)
            continue

In [20]:
answer = get_bplan_response(user_question, context_page_content)

print(f"{answer}, Sources: {[doc[1] for doc in context_list]}")

Based on the information provided, the revenue projections for next year are as follows:

- January: $406,380
- February: $488,037
- March: $566,877
- April: $730,579
- May: $834,314
- June: $935,740
- July: $1,030,004
- August: $1,227,245
- September: $1,357,593
- October: $1,438,669
- November: $1,555,585
- December: $1,850,950

These projections show a steady increase in revenue throughout the year. It is important to note that these figures are estimates and may be subject to change based on market conditions and other factors., Sources: ['vlocker.pdf', 'vlocker.pdf', 'vlocker.pdf']


In [14]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm
llm = ChatOpenAI(
    openai_api_key=openai.api_key,
    model_name='gpt-3.5-turbo-16k-0613',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [15]:
from langchain.chains import RetrievalQAWithSourcesChain

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [23]:
def create_prompt():
    """Create the prompt for the chatbot"""
    template = """
    You are a master business advisor, advising a client based on
    the provided context about their business plan.  If you cannot
    find any relevant data in the context, you may offer general 
    advice, but note that you did not find any relevant context.
    {context}
    Question: {question}
    Helpful Answer:"""
    qa_chain_prompt = PromptTemplate.from_template(template)

    return qa_chain_prompt


In [24]:
qa_chain_prompt = create_prompt()

In [33]:
from langchain.chains import RetrievalQA

question = "How much do you think we should be asking for in a pre-seed round of financing."
llm = ChatOpenAI(model_name="gpt-4", temperature=0.5)
qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),
chain_type_kwargs={"prompt": qa_chain_prompt}, verbose=True)
qa_chain({"query": question})




[1m> Entering new RetrievalQA chain...[0m


InvalidRequestError: The model `gpt-4` does not exist or you do not have access to it. Learn more: https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4.

In [42]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI

class Answer(BaseModel):
    answer: str = Field(description = "The corrected answer text")

output_parser = PydanticOutputParser(pydantic_object=Answer)
template = "Look over this response {response} from a large language model\
      and correct any errors before returning the result as a string.  If there are not\
    specific code examples provided, please add them before returning the result.\n{format_instructions}\n{response}\n"
prompt = PromptTemplate(
    template=template, 
    input_variables=["response"], 
    partial_variables = {"format_instructions": output_parser.get_format_instructions()}
)

gpt_4 = ChatOpenAI(model_name = 'gpt-4-0613', verbose = True, max_retries = 4, max_tokens = 1000)
llm_chain = LLMChain(prompt=prompt, llm=gpt_4, verbose=True)

In [43]:
from langchain.python import PythonREPL
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI

In [45]:
#llm = ChatOpenAI(temperature=0, model=gpt_4)

python = PythonREPL(llm=gpt_4, verbose=True)

tools = [
    Tool(
        name="python_repl",
        func=python.run,
        description="Useful for when you need to write or test python code."
    ),
    Tool(
        name="documentation",
        func=qa.run,
        description="Useful for when you need to search for documentation, api references, etc."
    ),
    Tool(
        name="format_final_response",
        func=llm_chain.predict_and_parse,
        description="Useful for when you need to check the final answer for errors and format before\
              returning to the user."
    )    
]

In [46]:
mrkl = initialize_agent(tools, gpt_4, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)

In [47]:
base_query = f'How can I use the langchain library to create a multi-tool agent\
    that will help farmers find the legal forms they need to fill out in order to start and maintain a farm?\
    I want to use multi-tool agents that can also utilize openai\'s function capabilities and any other tools\
    that the agent might need to accomplish this task?  I want to use Streamlit to create the interface for the app.'

'''additional_query = "How can I adjust the pairings functions to be able to generate multiple pairings per recipe,\
    use redis to manage the pairings and the associated recipes in state, and then use sqalchemy to store the pairings in the bakespace database?\
    as well as be able to initiate a new chat session with the pairings as context?"'''

#query = base_query + additional_query

'additional_query = "How can I adjust the pairings functions to be able to generate multiple pairings per recipe,    use redis to manage the pairings and the associated recipes in state, and then use sqalchemy to store the pairings in the bakespace database?    as well as be able to initiate a new chat session with the pairings as context?"'

In [48]:
response = mrkl.run(base_query)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)).




[1m> Entering new  chain...[0m
[32;1m[1;3mBuilding a multi-tool agent to help farmers find the legal forms they need to start and maintain a farm using the langchain library, openai's function capabilities, and Streamlit for the interface involves several steps. Here's a general guide:

1. **Understand the Legal Requirements**: First, you need to understand the legal requirements for starting and maintaining a farm. This could vary by location, so you might need to gather data from various legal databases or use APIs that can provide this information.

2. **Use Langchain Library**: Langchain is a library that allows you to parse, transform, and analyze legal texts. You can use it to process the legal requirements you gathered in the first step. It can help you identify key phrases and sections that refer to forms that need to be filled out.

3. **Leverage openai's Function Capabilities**: Openai's function capabilities can be used to automate the process of searching for and retr

In [49]:
print(response)

Building a multi-tool agent to help farmers find the legal forms they need to start and maintain a farm using the langchain library, openai's function capabilities, and Streamlit for the interface involves several steps. Here's a general guide:

1. **Understand the Legal Requirements**: First, you need to understand the legal requirements for starting and maintaining a farm. This could vary by location, so you might need to gather data from various legal databases or use APIs that can provide this information.

2. **Use Langchain Library**: Langchain is a library that allows you to parse, transform, and analyze legal texts. You can use it to process the legal requirements you gathered in the first step. It can help you identify key phrases and sections that refer to forms that need to be filled out.

3. **Leverage openai's Function Capabilities**: Openai's function capabilities can be used to automate the process of searching for and retrieving the necessary forms based on the legal re

In [425]:
import pandas as pd
session_log_df = pd.read_csv("session_log.csv")

In [426]:
# Add the response and the query to the session log df
session_log_df = session_log_df.append({"query": additional_query, "response": response}, ignore_index=True)

  session_log_df = session_log_df.append({"query": additional_query, "response": response}, ignore_index=True)


In [427]:
# Export the session log df to a csv
session_log_df.to_csv("session_log.csv", index=False)

In [318]:
# Loop through the docs and create a vector for each
# In the format needed for Pinecone
# Create a pandas dataframe with the vectors and metadata

# Create the dataframe with the columns "id", "values", "metadata", "text"
# The "id" column should be a unique identifier for each vector
# The "values" column should be a list of floats
# The "metadata" column should be a dictionary with keys for "type", "url", and "access"
# The "text" column should be the text of the document
import pandas as pd

vectors_df = pd.DataFrame(columns=["id", "values", "metadata", "text"])
texts = []
for i, doc in enumerate(docs):
    text = doc.page_content
    source = doc.metadata["source"]
    texts.append(text)

    # Create the vectors
    vectors = [
        {'id': f'FastAPI_docs{i}',
        'values': [],
        'metadata': {'type': 'Github repo docs', 'url': 'https://github.com/tiangolo/fastapi', 'access': 'public', 'text': text, 'source': source},
        }
    ]
    # Add the vectors to the dataframe
    vectors_df = vectors_df.append(vectors, ignore_index=True)

  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_df.append(vectors, ignore_index=True)
  vectors_df = vectors_d

In [319]:
# Add the text column to the dataframe
vectors_df["text"] = texts

# Embed the texts for the values column
vectors_df["values"] = embed.embed_documents(vectors_df["text"])

In [23]:
from langchain.vectorstores import Pinecone
index = Pinecone.from_documents(docs, embed, index_name="vocalockr-bplan")

TypeError: expected string or bytes-like object

In [321]:
# Drop the text column
vectors_df = vectors_df.drop(columns=["text"])


In [322]:
# Convert the dataframe to a list of dictionaries
vectors = vectors_df.to_dict(orient="records")



In [323]:
for i, vector in enumerate(vectors):
    if len(vector["values"]) != 1536:
        print(i, len(vector["values"]))

In [324]:
index = pinecone.Index(index_name="coding-assist")

In [325]:
# Upsert the vectors into the vector store
index.upsert(vectors=vectors, batch_size=25, namespace="documentation")

Upserted vectors:   0%|          | 0/403 [00:00<?, ?it/s]

{'upserted_count': 403}

In [15]:
# Query the vector store
from langchain.vectorstores import Pinecone

text_field = "text"

index = pinecone.Index('coding-assist')

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)


In [16]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(
    model_name = 'gpt-3.5-turbo-16k',
    temperature = 0.5,
)

qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectorstore.as_retriever(),
)


In [17]:
query = "How can I use the langchain library to code an app in Streamlit that\
    can be used by farmers to find the forms that they need to fill out to open and\
    run their business based on their location?  I want to use multi-tool agents\
    to accomplish this task."

response = qa.run(query)

In [18]:
print(response)


I'm sorry, but I don't have any information about the langchain library or its capabilities. It's possible that the langchain library is a custom library or a library that is not widely known. I recommend referring to the documentation or resources specific to the langchain library for guidance on how to use it for your specific task.


In [None]:
# Create functions to be able to load YouTube transcripts and create a vector store from them
from langchain.document_loaders import YouTubeTranscriptLoader

# Create a YouTubeTranscriptLoader object
yt_loader = YouTubeTranscriptLoader(
    video_id="dQw4w9WgXcQ",
    language="en",
)