In [33]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
#from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

import os
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader

In [2]:
# Uncomment if you want to convert a CSV file into a Json file (csv should be in working directory)
# import pandas as pd
# import json
# import os

# def csv_to_json(csv_file_name, json_file_name):
#     # read csv file
#     data = pd.read_csv(csv_file_name)
    
#     # ensure the data frame columns are in string format
#     data = data.astype(str)

#     # convert dataframe to dict
#     data_dict = data.to_dict('records')

#     # write data to json file
#     with open(json_file_name, 'w') as json_file:
#         json.dump(data_dict, json_file)

# # prompt user to enter the csv file name
# csv_file_name = input("Enter the name of your CSV file (with .csv extension): ")

# # create the json file name
# json_file_name = os.path.splitext(csv_file_name)[0] + '.json'

# csv_to_json(csv_file_name, json_file_name)


In [3]:
import langchain.document_loaders
import json
import os

class JSONLoader:
    def __init__(self, filename):
        self.filename = filename

    def load_and_split(self):
        with open(self.filename, 'r') as f:
            data = json.load(f)
        return [json.dumps(record) for record in data]  # Splitting JSON objects

# Directory containing your documents
directory = "documents/"
# Get a list of all files in the directory
files = os.listdir(directory)
# Filter the list for .pdf, .docx, and .json files
pdf_files = [f for f in files if f.endswith('.pdf')]
docx_files = [f for f in files if f.endswith('.docx')]
json_files = [f for f in files if f.endswith('.json')]

print(f"PDF files: {pdf_files}")
print(f"DOCX files: {docx_files}")
print(f"JSON files: {json_files}")

pages = []

# Load each .pdf file and add its pages to the list
for file in pdf_files:
    loader = PyPDFLoader(os.path.join(directory, file))
    pages += loader.load_and_split()

# Load each .docx file and add its pages to the list
for file in docx_files:
    loader = Docx2txtLoader(os.path.join(directory, file))  # or UnstructuredWordDocumentLoader
    pages += loader.load_and_split()

# Load each .json file and add its data to the list
for file in json_files:
    loader = JSONLoader(os.path.join(directory, file))
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = loader.load_and_split()
    
    pages += text_splitter.create_documents(texts)
    print(pages)

PDF files: []
DOCX files: ['BEST PRACTICES 3.23.2023 ans.docx', 'BEST PRACTICES 1.12.2023 ans.docx', 'BEST PRACTICES 12.1.2022 ans.docx', 'BEST PRACTICES 5.18.2023 ans.docx', 'BEST PRACTICES 7.28.2022 ans.docx', 'BEST PRACTICES 2.9.2023 ans.docx', 'BEST PRACTICES 4.6.2023 ans.docx', 'BEST PRACTICES 3.9.2023 ans.docx', 'BEST PRACTICES 11.3.2022 ans.docx', 'BEST PRACTICES 1.26.2023 ans.docx', 'BEST PRACTICES 2.23.2023 ans.docx', 'BEST PRACTICES 9.8.2022 ans.docx']
JSON files: ['CompiledCus.json']


In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

import openai
import config
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())  # read local .env file
openai.organization = config.OPENAI_ORG_KEY
openai.api_key = config.OPENAI_API_KEY

In [5]:
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(pages, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [6]:
# qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)

In [7]:
# query = "What is the spec page for primary and neutral dead end assemblies?"
# result = qa({"query": query})

In [8]:
# result["result"]

In [9]:
#result["source_documents"]

In [10]:
def process_llm_response(llm_response):
    print(llm_response['result'])
    
    print('\n\nSources:')
    for index, source in enumerate(llm_response["source_documents"]):
        print(source)
        try:
            print(f"{index + 1}: {source.metadata['source']}, page = {source.metadata['page']}")
        except:
            try:
                print(f"{index + 1}: {source.metadata['source']}")
            except:
                pass

In [11]:
# query = "What Compatible units should I use for the single-phase construction - tangent post top insulator?"
# llm_response = qa(query)
# process_llm_response(llm_response)

In [39]:
from langchain.prompts import PromptTemplate
prompt_template = """Use the following pieces of context to answer the question. If you don't know the answer, just say that you don't know, don't try to make up an answer.
You are a problem solver and you are able to find information regarding any topic based on the context provided to you. Use reasoning to answer your question. First breakdown the steps to solve it and reason according to the context provided. You are smart, wise, and very meticulous in your work. You answer questions in an organized fashion and understand the importance of step by step thinking for problem solving. If you get a question about compatible units (CU) remember that the last letter denotes the state (e.g., CUs ending in -F denote Florida) 

{context}

Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [40]:
llm = ChatOpenAI(
    temperature=1,
    model_name='gpt-3.5-turbo'
)

In [41]:
chain_type_kwargs = {"prompt": PROMPT}
# Another option is to use llm=OpenAI()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(), chain_type_kwargs=chain_type_kwargs, return_source_documents=True)

In [42]:
query = """How do I know if a transformer is single bushing or double bushing from the CU name? Give a comprehensive explanation"""
llm_response = qa(query)
process_llm_response(llm_response)

To determine if a transformer is single bushing or double bushing based on the CU name, we need to carefully analyze the information provided in the CU name and the context given.

Based on the context provided, we can see that the CU name contains several codes and abbreviations. Let's break down the CU names and analyze each part:

1. "TF-OH" indicates that the transformer is an overhead transformer.
2. The number after "TF-OH" represents the kVA (kilovolt-ampere) rating of the transformer. For example, "TF-OH-75" indicates a 75kVA transformer.
3. The next number denotes the voltage rating of the primary side of the transformer. For example, "TF-OH-75-12KV" means the primary side voltage is 12 kilovolts.
4. The following "120/240V" indicates the secondary side voltage of the transformer, which is either 120V or 240V.
5. The "1P" in the CU name refers to single-phase, indicating that the transformer is designed to handle single-phase electrical systems.
6. The numbers after "1P" repre