In [1]:
%pip install -U langchain-community faiss-cpu langchain-openai tiktoken


Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0.post1-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.1.16-py3-none-any.whl.metadata (2.5 kB)
Collecting tiktoken
  Using cached tiktoken-0.7.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting langchain-core<0.3.0,>=0.2.12 (from langchain-community)
  Downloading langchain_core-0.2.20-py3-none-any.whl.metadata (6.0 kB)
Using cached faiss_cpu-1.8.0.post1-cp311-cp311-win_amd64.whl (14.6 MB)
Downloading langchain_openai-0.1.16-py3-none-any.whl (46 kB)
   ---------------------------------------- 0.0/46.1 kB ? eta -:--:--
   ----------------- ---------------------- 20.5/46.1 kB ? eta -:--:--
   ----------------- ---------------------- 20.5/46.1 kB ? eta -:--:--
   ----------------------------------- ---- 41.0/46.1 kB 245.8 kB/s eta 0:00:01
   ---------------------------------------- 46.1/46.1 kB 256.0 kB/s eta 0:00:00
Using cached tiktoken-0.7.0-cp311-cp311-win_amd64.whl (799 

In [2]:
import os
from dotenv import load_dotenv
import openai

load_dotenv()

endpoint = os.environ.get("OPENAI_URL")
api_key = os.environ.get("OPEN_AI_KEY")
deployment = os.environ.get("OPENAI_DEPLOY")

client = openai.AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2024-02-01",
)

In [10]:
import re
import nltk
from pdfminer.high_level import extract_text
from nltk.tokenize import word_tokenize

"""
Function for document processing.
Initially the document is read using the extract_text function, as a result we have a reasonable processing
but with noise and loss of context.
NLTK is used to count tokens per text and thus generate document partitions.
The output is obtained with the execute function and returns three results:
 - Previous: Part before the current one
 - Current: Current part
 - Next: Part after the current one

 These outputs are used with the help of LLMs to optimize the extracted text by combining the generated parts. As a consequence, 
 Some information may be repeated, however, it will not be lost.
"""

class DocProcessing:
    def __init__(self, filename):
        self.filename = filename

    def filter_lines(self, text):
        filtered_lines = []
        for line in text.splitlines():
            stripped_line = line.strip()
            if len(stripped_line) >= 15 and len(re.findall(r'[a-zA-Z]', stripped_line)) >= 7:
                filtered_lines.append(stripped_line)
        return "\n".join(filtered_lines)

    def tokenize_text(self, text):
        return word_tokenize(text)

    def split_into_parts(self, tokens, current_part_size=700, context_size=250):
        parts = []
        total_tokens = len(tokens)
        index = 0
        
        while index < total_tokens:
            current_end = min(index + current_part_size, total_tokens)
            previous_start = max(index - context_size, 0)
            next_end = min(current_end + context_size, total_tokens)
            
            previous = tokens[previous_start:index]
            actual = tokens[index:current_end]
            next = tokens[current_end:next_end]
            
            parts.append({
                'previous': previous,
                'actual': actual,
                'next': next
            })
            
            index = current_end
        
        return parts

    def execute(self):
        try:
            extracted_text = extract_text(self.filename)

            filtered_text = self.filter_lines(extracted_text)

            tokens = self.tokenize_text(filtered_text)

            parts = self.split_into_parts(tokens)

            return[' '.join(part['actual']) for part in parts]
            # parts_dict = {
            #     f"part {i+1}": {
            #         "previous": ' '.join(part['previous']),
            #         "actual": ' '.join(part['actual']),
            #         "next": ' '.join(part['next'])
            #     } for i, part in enumerate(parts)
            # }
            # return parts_dict
        except Exception as e:
            print(e)

In [12]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

processor = DocProcessing(filename="../files/Forsthoffer's Vol 1 - Rotating Equipment.pdf")
docs = processor.execute()
for doc in docs:
    doc.join()
print(docs)
# embeddings = OpenAIEmbeddings()
# db = FAISS.from_documents(docs, embeddings)
# print(db.index.ntotal)

[['Forsthoffer', "'s", 'Rotating', 'Equipment', 'Handbooks', 'Vol', '.', '1', ':', 'Fundamentals', 'of', 'Rotating', 'Equipment', '•', 'Pub', '.', 'Date', ':', 'December', '2005', '•', 'Publisher', ':', 'Elsevier', 'Science', '&', 'Technology', 'Books', 'the', 'design', ',', 'selection', ',', 'This', 'series', 'has', 'evolved', 'from', 'my', 'personal', 'experience', 'over', 'the', 'last', '40', 'testing', ',', 'start-up', 'and', 'condition', 'monitoring', 'of', 'rotating', 'equipment', '.', 'Most', 'of', 'the', 'concept', 'figures', 'were', 'originally', 'written', 'on', 'a', 'blackboard', 'or', 'whiteboard', 'during', 'a', 'training', 'session', 'and', 'on', 'a', 'spare', 'piece', 'of', 'paper', 'or', 'I', 'beam', 'during', 'a', 'start-up', 'or', 'a', 'problem', 'solving', 'plant', 'visit', '.', 'My', 'entire', 'career', 'has', 'been', 'devoted', 'to', 'this', 'interesting', 'and', 'important', 'field', '.', 'Then', 'and', 'now', 'more', 'than', 'ever', ',', 'the', 'cost', 'of', 'rot