In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Access the OpenAI key
openai_key = os.getenv("OPENAI_API_KEY")
activeloop_key = os.getenv("ACTIVELOOP_TOKEN")


In [31]:
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAIChat
from langchain.vectorstores.deeplake import DeepLake
from langchain.document_loaders import PyPDFLoader
import random
from langchain.embeddings import OpenAIEmbeddings
from tqdm import tqdm

In [4]:
openai_embeddings = OpenAIEmbeddings()

In [51]:
# dataset_path = "hub://101010/text_embedding"
# # Create a DeepLake instance and add the documents
# db = DeepLake.from_documents(documents, dataset_path=dataset_path, embedding=OpenAIEmbeddings())

db = DeepLake(
    dataset_path=f"hub://siddartha10/manufacturing_CSI",  # org_id stands for your username or organization from activeloop
    embedding=openai_embeddings,
    runtime={"tensor_db": True},
    token=activeloop_key,
    # overwrite=True, # user overwrite flag if you want to overwrite the full dataset
    read_only=False,
)

Deep Lake Dataset in hub://siddartha10/manufacturing_CSI already exists, loading from the storage


In [15]:
# Replace 'file_paths' with the paths to your local PDF files
file_paths = [
    "manual\'s.pdf",
    "Operation and Maintenance Manual.pdf",
    "Operations Manual.pdf"
]
# Initialize an empty list to store all pages
pages = []

# Loop through each manual and load its pages
for manual_path in file_paths:
    loader = PyPDFLoader(manual_path)
    pages1 = loader.load_and_split()
    pages.extend(pages1)

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 4096
docs_new = []

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
)

for doc in pages:
    if len(doc.page_content) < chunk_size:
        docs_new.append(doc)
    else:
        docs = text_splitter.create_documents([doc.page_content])
        docs_new.extend(docs)

In [44]:
docs = db.add_documents(docs_new)

Creating 197 embeddings in 1 batches of size 197:: 100%|██████████| 1/1 [00:10<00:00, 10.17s/it]

Dataset(path='hub://siddartha10/manufacturing_CSI', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
   text       text      (197, 1)      str     None   
 metadata     json      (197, 1)      str     None   
 embedding  embedding  (197, 1536)  float32   None   
    id        text      (197, 1)      str     None   





In [37]:
from typing import List

from langchain.chains.openai_functions import (
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import HumanMessage, SystemMessage
from pydantic import BaseModel, Field

In [45]:
# fetch dataset docs and ids if they exist (optional you can also ingest)
docs = db.vectorstore.dataset.text.data(fetch_chunks=True, aslist=True)["value"]
ids = db.vectorstore.dataset.id.data(fetch_chunks=True, aslist=True)["value"]

In [46]:
# If we pass in a model explicitly, we need to make sure it supports the OpenAI function-calling API.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)


class Questions(BaseModel):
    """Identifying information about the manufacturing system."""

    question: str = Field(..., description="Questions about manufacturing system")


prompt_msgs = [
    SystemMessage(
        content="You are a world class expert for generating questions based on provided context. \
                You make sure the question can be answered by the text."
    ),
    HumanMessagePromptTemplate.from_template(
        "Use the given text to generate a question from the following input: {input}"
    ),
    HumanMessage(content="Tips: Make sure to answer in the correct format"),
]
prompt = ChatPromptTemplate(messages=prompt_msgs)
chain = create_structured_output_chain(Questions, llm, prompt, verbose=True)

text = """
UWM CSI Vial Filling Connected Smart Manufacturing (CSM) System The UWM CSI Vial Filling CSM  System is an intelligent manufacturing system using the latest Industry 4.0 connected advanced manufacturing equipment and techniques to produce  vials filled with varying product using a variety of filling methods and capturing process data that can be used for data analysis and system optimization.  The  Vial Filling CSM System is a platform that university faculty and students will use for both education and research to  further the advancement of a connected enterprise.   The CAM components are  integrate d seamlessly within a fully  integrated architecture and connected  enterprise using cutting -edge smart -data devices at all layers.     The process overview detailed in this section describes in general how the Vial Filling Connected Smart Manufacturing system functions as a complete system processing components.  Operational steps and slight variances in the process may differ from what is described here depending on the configuration parameters or  using the stations  in a dry cycle mode .  The Vial Filling CSM utilizes localized system configuration setting within the machine as well as process data requirements and parameters to determine how  to process the product and which stations and inspections are required to complete the production job.  When running under MES Production Center control, the process requirements and parameters are received  from the MES system as to where  and how to process and inspect the vials produced by the system .    """
questions = chain.run(input=text)
print(questions)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a world class expert for generating questions based on provided context.                 You make sure the question can be answered by the text.
Human: Use the given text to generate a question from the following input: 
UWM CSI Vial Filling Connected Smart Manufacturing (CSM) System The UWM CSI Vial Filling CSM  System is an intelligent manufacturing system using the latest Industry 4.0 connected advanced manufacturing equipment and techniques to produce  vials filled with varying product using a variety of filling methods and capturing process data that can be used for data analysis and system optimization.  The  Vial Filling CSM System is a platform that university faculty and students will use for both education and research to  further the advancement of a connected enterprise.   The CAM components are  integrate d seamlessly within a fully  integrated architecture and connected  enter

In [32]:
def generate_queries(docs: List[str], ids: List[str], n: int = 100):
    questions = []
    relevances = []
    pbar = tqdm(total=n)
    while len(questions) < n:
        # 1. randomly draw a piece of text and relevance id
        r = random.randint(0, len(docs) - 1)
        text, label = docs[r], ids[r]

        # 2. generate queries and assign and relevance id
        generated_qs = [chain.run(input=text).question]
        questions.extend(generated_qs)
        relevances.extend([[(label, 1)] for _ in generated_qs])
        pbar.update(len(generated_qs))
        if len(questions) % 10 == 0:
            print(f"q: {len(questions)}")
    return questions[:n], relevances[:n]


chain = create_structured_output_chain(Questions, llm, prompt, verbose=False)
questions, relevances = generate_queries(docs, ids, n=200)

train_questions, train_relevances = questions[:100], relevances[:100]
test_questions, test_relevances = questions[100:], relevances[100:]



q: 10




q: 20




q: 30




q: 40




q: 50




q: 60




q: 70




q: 80




q: 90




q: 100




q: 110




q: 120




q: 130




q: 140




q: 150




q: 160




q: 170




q: 180




q: 190


100%|██████████| 200/200 [10:19<00:00,  3.10s/it]

q: 200





In [54]:
job_id = db.vectorstore.deep_memory.train(
    queries=train_questions,
    relevance=train_relevances,
)

DeepMemoryWaitingListError: Deep Memory is available only for waiting list users. Please, follow the link and join the waiting list: https://www.deeplake.ai/deepmemory