In [89]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from dotenv import load_dotenv
load_dotenv()

True

In [39]:
%pip install "unstructured[md]" nltk

Note: you may need to restart the kernel to use updated packages.


## Document Loading

In [74]:
loader = DirectoryLoader('./manual2', glob="**/*.txt", loader_cls=TextLoader, show_progress=True)
docs = loader.load()
print(docs[0])


100%|██████████| 1/1 [00:00<00:00, 471.06it/s]

page_content='### INI file with explanations as comments.
[SigmaFileInfo]
FileType ; Parameter explanation: Internal parameter identifying the file type
FileVersion ; Parameter explanation: Internal version parameter
Date ; Parameter explanation: The current date
SigmaVersion ; Parameter explanation: Internal version of SIGMA used
ConfigId ; Parameter explanation: Internal system job ID, user may refer to this as srid

[E3DGeometryData]
[E3DGeometryData/Machine]
Type ; Parameter explanation: Extruder type, possible values: TSE, SSE, DIE
Unit ; Parameter explanation: Unit for input parameters, mm, cm, dm, m, ... etc.
Zwickel ; Parameter explanation: Barrel shape; can be straight or curved
MachineName ; Parameter explanation: Extruder name
RotationDirection ; Parameter explanation: Extruder rotation direction: LEFT (default) or RIGHT
BarrelDiameter ; Parameter explanation: Barrel diameter (default: 2 * screw diameter + 2 * screw clearance), German: Zylinderdurchmesser , [mm]
CenterlineDi




## Text Splitting

In [81]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=145,
    chunk_overlap=30,
)

split_docs = text_splitter.split_documents(docs)
print(f"Num of chunks: {len(split_docs)}")
print(split_docs)

Num of chunks: 55
[Document(metadata={'source': 'manual2/input.txt'}, page_content='### INI file with explanations as comments.\n[SigmaFileInfo]\nFileType ; Parameter explanation: Internal parameter identifying the file type'), Document(metadata={'source': 'manual2/input.txt'}, page_content='FileVersion ; Parameter explanation: Internal version parameter\nDate ; Parameter explanation: The current date'), Document(metadata={'source': 'manual2/input.txt'}, page_content='SigmaVersion ; Parameter explanation: Internal version of SIGMA used'), Document(metadata={'source': 'manual2/input.txt'}, page_content='ConfigId ; Parameter explanation: Internal system job ID, user may refer to this as srid'), Document(metadata={'source': 'manual2/input.txt'}, page_content='[E3DGeometryData]\n[E3DGeometryData/Machine]\nType ; Parameter explanation: Extruder type, possible values: TSE, SSE, DIE'), Document(metadata={'source': 'manual2/input.txt'}, page_content='Unit ; Parameter explanation: Unit for inpu

## Embedding Model 

In [60]:
from langchain_openai import OpenAIEmbeddings
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")


## Embed Documents

In [83]:
from langchain_community.vectorstores import Chroma

CHROMA_PATH = "./chroma_storage4"
vectorstore = Chroma.from_documents(documents=split_docs, embedding=embeddings_model, persist_directory=CHROMA_PATH)

In [40]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

question = f"What is the NoOfFlights parameter about?"

PROMPT_TEMPLATE = """
Answer the question based on the following information if neccessary use also your world knowledge of the topic, but keep it concise:

{context}

---

Answer the question based on the following information if neccessary use also your world knowledge of the above topic, but keep it concise: {question}
"""

db = vectorstore

results = db.similarity_search_with_relevance_scores(question, k=5)

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=question)
print(prompt)

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)
response_text = llm.invoke(prompt)
print(response_text.content)


  results = db.similarity_search_with_relevance_scores(question, k=5)


Human: 
Answer the question based on the following information if neccessary use also your world knowledge of the topic, but keep it concise:

NoOfElements=undefined
NoOfFlights=undefined
BarrelLength=undefined

---

NoOfFlights=undefined ; Number of flights in the extruder, can use single flight (German: eingängig)
BarrelLength=undefined ; Length of the barrel/housing, German: Länge, Gehäuselänge or similar , [mm]

---

BarrelStraightCut=undefined ; Depth of the V-cut (default: 2.5 percent of BarrelDiameter, used only for twin screws), user may refer to this as vcut , [mm]
NoOfElements=undefined ; Number of elements in the extruder

---

[E3DSimulationsettings]
MeshQuality=undefined ; Mesh resolution, can be: coarse, medium, fine
HexMesher=undefined ; Mesh generator type: TwinScrew or HollowCylinder
KTPRelease=undefined ; Flag indicating whether KTP release is activated; NO or YES

---

outerdiameter=undefined ; Outer diameter of the screw, German: Minimaler Schneckendurchmesser , [mm

In [65]:

price_per_token = 1e-6

In [84]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

question = f"What can you tell me about the off_filelist parameter?"

PROMPT_TEMPLATE = """
Answer the question based on the following information if neccessary use also your world knowledge of the topic, but keep it concise:

{context}

---

Answer the question based on the above information if neccessary use also your world knowledge of the above topic, but keep it concise: {question}
"""

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings_model)

results = db.similarity_search_with_relevance_scores(question, k=5)
#results2 = db.similarity_search(query=question, k=5)

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results if _score > 0.0])
print(context_text)

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# This command replaces the placeholders in the prompt template with concrete data
prompt = prompt_template.format(context=context_text, question=question)
#print(prompt)

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1)
response = llm.invoke(prompt)
print(response.content)
#print(response)


off_filelistL ; Parameter explanation: File for the left-hand side of the screw geometry, this key-value is used if type=OFF_LR in this section

---

off_filelistR ; Parameter explanation: File for the right-hand side of the screw geometry, this key-value is used if type=OFF_LR in this section

---

off_filelist ; Parameter explanation: File containing the 3D geometry of the screw, this key-value is used if type=OFF in this section

---

type ; Parameter explanation: Type of input geometry; OFF requires `off_filelist`, OFF_LR requires both `off_filelistL` and `off_filelistR`

---

### INI file with explanations as comments.
[SigmaFileInfo]
FileType ; Parameter explanation: Internal parameter identifying the file type
The off_filelist parameter is used to specify the file containing the 3D geometry of the screw. It is used when the type parameter is set to OFF in the section. Additionally, if the type parameter is set to OFF_LR, then both off_filelistL and off_filelistR parameters are r

In [88]:
results[2][1]

0.3433192401756373

In [69]:
#response.usage_metadata['total_tokens'] * price_per_token
dir(results1[0])


['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'count',
 'index']