In [1]:
%pip install -U langchain-community faiss-cpu langchain-openai tiktoken


Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0.post1-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.1.16-py3-none-any.whl.metadata (2.5 kB)
Collecting tiktoken
  Using cached tiktoken-0.7.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting langchain-core<0.3.0,>=0.2.12 (from langchain-community)
  Downloading langchain_core-0.2.20-py3-none-any.whl.metadata (6.0 kB)
Using cached faiss_cpu-1.8.0.post1-cp311-cp311-win_amd64.whl (14.6 MB)
Downloading langchain_openai-0.1.16-py3-none-any.whl (46 kB)
   ---------------------------------------- 0.0/46.1 kB ? eta -:--:--
   ----------------- ---------------------- 20.5/46.1 kB ? eta -:--:--
   ----------------- ---------------------- 20.5/46.1 kB ? eta -:--:--
   ----------------------------------- ---- 41.0/46.1 kB 245.8 kB/s eta 0:00:01
   ---------------------------------------- 46.1/46.1 kB 256.0 kB/s eta 0:00:00
Using cached tiktoken-0.7.0-cp311-cp311-win_amd64.whl (799 

In [1]:
import os
from dotenv import load_dotenv
import openai

load_dotenv()

endpoint = os.environ.get("OPENAI_URL")
api_key = os.environ.get("OPEN_AI_KEY")
deployment = os.environ.get("OPENAI_DEPLOY")

client = openai.AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2024-02-01",
)

In [8]:
import re
import nltk
from pdfminer.high_level import extract_text
from nltk.tokenize import word_tokenize

"""
Function for document processing.
Initially the document is read using the extract_text function, as a result we have a reasonable processing
but with noise and loss of context.
NLTK is used to count tokens per text and thus generate document partitions.
The output is obtained with the execute function and returns three results:
 - Previous: Part before the current one
 - Current: Current part
 - Next: Part after the current one

 These outputs are used with the help of LLMs to optimize the extracted text by combining the generated parts. As a consequence, 
 Some information may be repeated, however, it will not be lost.
"""

class DocProcessing:
    def __init__(self, filename):
        self.filename = filename

    def filter_lines(self, text):
        filtered_lines = []
        for line in text.splitlines():
            stripped_line = line.strip()
            if len(stripped_line) >= 15 and len(re.findall(r'[a-zA-Z]', stripped_line)) >= 7:
                filtered_lines.append(stripped_line)
        return "\n".join(filtered_lines)

    def tokenize_text(self, text):
        return word_tokenize(text)

    def split_into_parts(self, tokens, current_part_size=700, context_size=250):
        parts = []
        total_tokens = len(tokens)
        index = 0
        
        while index < total_tokens:
            current_end = min(index + current_part_size, total_tokens)
            previous_start = max(index - context_size, 0)
            next_end = min(current_end + context_size, total_tokens)
            
            previous = tokens[previous_start:index]
            actual = tokens[index:current_end]
            next = tokens[current_end:next_end]
            
            parts.append({
                'previous': previous,
                'actual': actual,
                'next': next
            })
            
            index = current_end
        
        return parts

    def execute(self):
        try:
            extracted_text = extract_text(self.filename)

            filtered_text = self.filter_lines(extracted_text)

            tokens = self.tokenize_text(filtered_text)

            parts = self.split_into_parts(tokens)

            # return[' '.join(part['actual']) for part in parts]
            parts_dict = {
                f"part {i+1}": {
                    "previous": ' '.join(part['previous']),
                    "actual": ' '.join(part['actual']),
                    "next": ' '.join(part['next'])
                } for i, part in enumerate(parts)
            }
            return parts_dict
        except Exception as e:
            print(e)

In [9]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

processor = DocProcessing(filename="../files/Forsthoffer's Vol 1 - Rotating Equipment.pdf")
docs = processor.execute()

In [12]:
from GPTcalls import GPTAssistant

model = os.environ.get("OPENAI_DEPLOY")
helper = GPTAssistant(deployment = model)

full_text = []

cont = 1
for i in docs:
    print(f"Progress: {cont/len(docs)*100:.2f} %!")
    previous = docs[i]['previous']
    actual = docs[i]['actual']
    next = docs[i]['next']
    cont += 1

    full_text.append(helper.aux_processing(previous, next, actual))

n = 1
m = 1

questions = []
answers = []
instructions = []

instruction = helper.create_instruction(full_text[0])

print(f'============= STARTED! =============')

for i in range(len(full_text)):
    print(f'|       Working on Chunk {i} / {len(full_text)}!')
    txt = full_text[i]
    previous = []
    for i in range(n):
        question = helper.question_gpt(txt, previous)
        previous.append(question)

        for j in range(m):
            questions.append(question)
            answers.append(helper.answer_gpt(txt, question))
            instructions.append(instruction)
    
print(f'============ COMPLETED! ============')
print(questions)

Progress: 0.67 %!
Progress: 1.34 %!
Progress: 2.01 %!
Progress: 2.68 %!
Progress: 3.36 %!
Progress: 4.03 %!
Progress: 4.70 %!
Progress: 5.37 %!
Progress: 6.04 %!
Progress: 6.71 %!
Progress: 7.38 %!
Progress: 8.05 %!
Progress: 8.72 %!
Progress: 9.40 %!
Progress: 10.07 %!
Progress: 10.74 %!
Progress: 11.41 %!
Progress: 12.08 %!
Progress: 12.75 %!
Progress: 13.42 %!
Progress: 14.09 %!
Progress: 14.77 %!
Progress: 15.44 %!
Progress: 16.11 %!
Progress: 16.78 %!
Progress: 17.45 %!
Progress: 18.12 %!
Progress: 18.79 %!
Progress: 19.46 %!
Progress: 20.13 %!
Progress: 20.81 %!
Progress: 21.48 %!
Progress: 22.15 %!
Progress: 22.82 %!
Progress: 23.49 %!
Progress: 24.16 %!
Progress: 24.83 %!
Progress: 25.50 %!
Progress: 26.17 %!
Progress: 26.85 %!
Progress: 27.52 %!
Progress: 28.19 %!
Progress: 28.86 %!
Progress: 29.53 %!
Progress: 30.20 %!
Progress: 30.87 %!
Progress: 31.54 %!
Progress: 32.21 %!
Progress: 32.89 %!
Progress: 33.56 %!
Progress: 34.23 %!
Progress: 34.90 %!
Progress: 35.57 %!
Progres

RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-02-01 have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 2 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}

In [13]:
# To save the chunks
import json
formatted_text = [{"Part {:02d}".format(i + 1): item} for i, item in enumerate(full_text)]

output_name = 'chunks.json'
with open(output_name, 'w', encoding='utf-8') as f:
    json.dump(formatted_text, f, ensure_ascii=False, indent=4)

In [34]:
formatted_text = []
for i in range(len(questions)):
    formatted_text.append(
        {
            "id":i,
            "chunk": full_text[i],
            "question": questions[i]
        }
    )
output_name = 'questions.json'
with open(output_name, 'w', encoding='utf-8') as f:
    json.dump(formatted_text, f, ensure_ascii=False, indent=4)

In [25]:
embeddings = OpenAIEmbeddings()
chunk_with_id = [str(i)+" "+c for i,c in enumerate(full_text)]
db = FAISS.from_texts(chunk_with_id, embeddings)
print(db.index.ntotal)

149


In [49]:
query = "How do positive displacement pumps and dynamic pumps differ in terms of their sensitivity to changes in liquid properties and system conditions?"
docs = db.similarity_search(query,5)
print(docs[0].page_content)

4
12 A pump is defined as a device that moves a liquid by increasing the energy level of the liquid. There are various types of pumps, including positive displacement pumps and dynamic pumps. Positive displacement pumps operate by displacing a fixed volume in a confined area, such as those utilizing rotary blades like screw, gear, and reciprocating pumps. They provide constant volume delivery, variable differential head, and are relatively insensitive to liquid properties. On the other hand, dynamic pumps, like centrifugal and axial pumps, generate pressure by using rotating impellers and are sensitive to changes in the system and liquid properties.

Regardless of whether using positive displacement or dynamic pumps, each pump consists of hydraulic and mechanical components. The hydraulic end involves moving the liquid, while the mechanical end includes components like shafts, bearings, seals, couplings, and casings. The performance relationships of head, horsepower, and efficiency rem

In [54]:
num_k = 10

input_name = 'questions.json'
with open(input_name, 'r', encoding='utf-8') as f:
    chunks = json.load(f)

num_correct=[0]*num_k

for chunk in chunks:
    query = chunk["question"]
    docs = db.similarity_search(query,num_k)
    correct = False
    for i, doc in enumerate(docs):
        if correct:
            num_correct[i]+=1
        else:
            db_index = int(doc.page_content.split()[0])
            if db_index == chunk["id"]:
                correct = True
                num_correct[i]+=1

formatted_array = ["{:.2f}".format(num/len(chunks)) for num in num_correct]
print(formatted_array)

['0.72', '0.87', '0.93', '0.93', '0.94', '0.95', '0.97', '0.98', '0.98', '0.98']
