## Ingesting PDF

In [None]:
%pip install --q unstructured langchain 
%pip install --q "unstructured[all-docs]"
%pip install --upgrade langchain 

In [None]:
%pip install langchain-community langchain-core pymupdf

In [None]:
%pip install --q chromadb
%pip install --q langchain-text-splitters

In [1]:
import json
import os
from getpass import getpass
from operator import itemgetter

import fitz
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
# Directory that holds the source PDF files
pdf_directory = "pdf/"

# Small container class so that every PDF page exposes .page_content and .metadata
class PDFPage:
    """One extracted PDF page: its page number, its text and optional metadata."""

    def __init__(self, page_number, text, metadata=None):
        # Falsy metadata (None or an empty dict) is replaced by a fresh empty dict
        self.metadata = metadata if metadata else {}
        # The extracted text is exposed under .page_content
        self.page_content = text
        self.page_number = page_number

# Read one PDF file with PyMuPDF
def load_pdf_with_pymupdf(file_path):
    """Extract the plain text of every page of a single PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        list[PDFPage]: One entry per page, in document order. Each page's
        metadata holds the source ``file_name`` and its 1-based ``page`` number.
    """
    file_name = os.path.basename(file_path)
    pages = []

    # The context manager closes the document even when text extraction raises,
    # which the previous explicit doc.close() at the end did not guarantee.
    with fitz.open(file_path) as doc:
        for page_num in range(doc.page_count):
            text = doc.load_page(page_num).get_text("text")  # plain-text extraction
            page_number = page_num + 1  # PyMuPDF page indexes are 0-based
            # The page number is also stored in metadata because the text splitter
            # only carries .metadata over to the chunks it creates.
            pages.append(
                PDFPage(
                    page_number,
                    text,
                    metadata={"file_name": file_name, "page": page_number},
                )
            )

    return pages

# Read every PDF file found in a directory
def load_all_pdfs_in_directory(directory):
    """Load all PDF files that sit directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        dict: Maps each PDF file name to the list of pages returned by
        ``load_pdf_with_pymupdf`` for that file. Empty when no PDF is found.
    """
    # os.listdir() returns names in arbitrary order, so sort for reproducible
    # chunk ordering; the extension check is case-insensitive so "X.PDF" is kept.
    pdf_files = sorted(
        name for name in os.listdir(directory) if name.lower().endswith('.pdf')
    )

    return {
        pdf_file: load_pdf_with_pymupdf(os.path.join(directory, pdf_file))
        for pdf_file in pdf_files
    }

# Load every PDF under pdf_directory and split the extracted pages into chunks
if not os.path.exists(pdf_directory):
    print("Directory not found")
else:
    all_pdfs_data = load_all_pdfs_in_directory(pdf_directory)

    # Flatten the per-file page lists; the PDFPage objects are passed on directly
    text_documents = [
        pdf_page
        for file_pages in all_pdfs_data.values()
        for pdf_page in file_pages
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=7500,
        chunk_overlap=500,
    )
    chunks = text_splitter.split_documents(text_documents)

    print(f"Number of chunks created: {len(chunks)}")

Number of chunks created: 169


In [22]:
import os
import json
from langchain.schema import Document

# Folder that holds the JSON files
json_dir = 'json/'

# Collects one Document per JSON file
data = []

# Walk the folder in sorted order: os.listdir() order is arbitrary, and sorting
# keeps the resulting document order identical between runs.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith('.json'):
        continue
    json_path = os.path.join(json_dir, filename)

    # Open and parse the JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        j = json.load(json_file)
    print("READ FILE")

    # Document metadata must be a dict, so a top-level list/scalar is wrapped
    # instead of crashing on the .get() call below.
    metadata = j if isinstance(j, dict) else {"data": j}

    # The text is assumed to live under 'content' (change the key if needed);
    # when it is missing, fall back to the file name without its extension.
    content = metadata.get('content', os.path.splitext(filename)[0])
    if not isinstance(content, str):
        # page_content has to be a string; serialise structured content
        content = json.dumps(content, ensure_ascii=False)

    # Build the Document and add it to the collected data
    document = Document(page_content=content, metadata=metadata)
    data.append(document)

# Show the text of every collected Document
for doc in data:
    print(doc.page_content)

READ FILE
READ FILE
Formular
ingredient_function_count


## Vector Embeddings

In [None]:
# Download the embedding model that the OllamaEmbeddings calls in this notebook refer to
!ollama pull nomic-embed-text

In [20]:
# List the models known to Ollama to confirm the required ones are available
!ollama list

NAME                       ID              SIZE      MODIFIED       
nomic-embed-text:latest    0a109f422b47    274 MB    28 minutes ago    
llama3.1:8b                42182419e950    4.7 GB    41 hours ago      


In [None]:
# Collect the page objects for splitting. split_documents() reads the
# .page_content and .metadata attributes of each item, so plain dicts such as
# {"page_content": ...} cannot be passed; the PDFPage objects are used directly.
text_documents = [page for pdf_data in all_pdfs_data.values() for page in pdf_data]

# Split the pages into smaller, overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_documents(text_documents)
print(f"Number of chunks created: {len(chunks)}")

In [15]:
# Embed every chunk with the "nomic-embed-text" Ollama model and add the result to a
# Chroma collection named "local-rag" (no persist_directory is passed in this cell).
vector_db = Chroma.from_documents(
    documents=chunks, 
    embedding=OllamaEmbeddings(model="nomic-embed-text",show_progress=True),
    collection_name="local-rag"
)

OllamaEmbeddings:   0%|          | 0/169 [00:00<?, ?it/s]

OllamaEmbeddings: 100%|██████████| 169/169 [07:20<00:00,  2.60s/it]


In [None]:
persist_directory = "vector_db/"
embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# Open (or create) the persisted "local-rag" collection first.
vector_db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name="local-rag"
)

# Embed only when the collection is still empty. Calling from_documents() again on
# an existing store appends a second copy of every chunk under new ids and repeats
# the slow embedding pass, so re-running this cell used to corrupt the index.
# Delete the persist_directory folder to force a rebuild after the PDFs change.
if not vector_db.get(limit=1)["ids"]:
    vector_db.add_documents(chunks)
    vector_db.persist()

In [4]:
# Re-open the "local-rag" collection stored under persist_directory, using the same
# embedding model name that was used when the chunks were added.
loaded_vector_db = Chroma(
    persist_directory=persist_directory,
    embedding_function=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag"
)

  loaded_vector_db = Chroma(


## Retrieval

In [7]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_ollama import ChatOllama

In [6]:
# Chat LLM served through Ollama; the model name must match an entry shown by `ollama list`
local_model = "llama3.1:8b"
llm = ChatOllama(model=local_model)

In [7]:
# Prompt asking the model for three formulas as a JSON object.
# The template uses f-string style placeholders, so every literal brace of the JSON
# example must be doubled ({{ and }}). Left single, the braces are parsed as template
# fields and formatting the prompt fails. Only {ph}, {viscosity} and {appearance}
# are real variables.
COSMETIC_FORMULA_PROMPT = PromptTemplate(
    input_variables=["ph", "viscosity", "appearance"],
    template="""You are a Cosmetic Formula Generator. Create 3 unique formulas based on these specifications:

pH: {ph}
Viscosity (cps): {viscosity}
Appearance: {appearance}

Requirements:
1. Generate exactly 3 formulas matching the given specifications.
2. Each formula must have at least 10 ingredients.
3. The sum of %w/w for all ingredients in each formula must equal exactly 100%.
4. Use "di water" to adjust the total if needed.
5. Ensure variety in ingredients across formulas.
6. Use ingredients and functions from unique_ingredient_function.json.
7. Use viscosity_builder.json to estimate viscosity contributions.

Output Format:
Respond ONLY with a JSON object structured as follows:

{{
    "Formulas": [
        {{
            "pH": <pH_value>,
            "Viscosity (cps)": <viscosity_value>,
            "Appearance": "<appearance_description>",
            "Ingredients": [
                {{
                    "Ingredient": "<ingredient_name>",
                    "Phase": "<phase>",
                    "%w/w": <percentage>,
                    "Function": "<function>",
                    "Supplier": "<supplier>"
                }},
                ...
            ]
        }},
        {{...}},
        {{...}}
    ]
}}

Ensure the output is a valid JSON object with no additional text or explanations.
"""
)

In [8]:
# Multi-query retriever on top of the reloaded vector store: the LLM is used to
# produce the query variants that are run against the store's retriever.
retriever = MultiQueryRetriever.from_llm(
    retriever=loaded_vector_db.as_retriever(),
    llm=llm,
)

# Prompt that combines the retrieved context with the target specifications
template = """Use the following context to generate a cosmetic formula based on the given specifications:
{context}

Specifications:
pH: {ph}
Viscosity (cps): {viscosity}
Appearance: {appearance}

Generate the formula according to the instructions in the context.
"""

prompt = ChatPromptTemplate.from_template(template=template)


In [9]:
def _spec_field(name):
    """Build a getter that pulls one specification out of the chain input."""
    def _get(chain_input):
        # A bare string (free-text question) is passed through unchanged, so
        # string invocations behave as they did with RunnablePassthrough().
        return chain_input[name] if isinstance(chain_input, dict) else chain_input
    return _get


def _retrieval_query(chain_input):
    """Turn the chain input into the plain-text query handed to the retriever."""
    if not isinstance(chain_input, dict):
        return chain_input
    return (
        f"Cosmetic formula with pH {chain_input['ph']}, "
        f"viscosity {chain_input['viscosity']} cps, "
        f"appearance {chain_input['appearance']}"
    )


# Every branch of the dict receives the same chain input. RunnablePassthrough()
# therefore put the whole {"ph": ..., "viscosity": ..., "appearance": ...} dict into
# each prompt variable and handed that dict to the retriever as its query.
# Each branch now extracts only the value it needs.
chain = (
    {
        "context": _retrieval_query | retriever,
        "ph": _spec_field("ph"),
        "viscosity": _spec_field("viscosity"),
        "appearance": _spec_field("appearance")
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [10]:
# Run the chain for one example specification and print the raw text reply
result = chain.invoke({
    "ph": 5.5,
    "viscosity": 29600.0,
    "appearance": "white cream"
})

print(result)

Based on the provided specifications and the given data, I will create a Python script that generates the formula for the product.

Here is the code:

```python
import json

# Load the JSON data from the given string
data = '''
{
    "pH": 5.5,
    "Viscosity (cps)": 29600.0,
    "Appearance": "white cream",
    "Ingredients": [
        {
            "Ingredient": "polyglyceryl-4 diisostearate/polyhydroxy-stearate/sebacate",
            "Phase": "a",
            "%w/w": 3.0,
            "Function": "emulsifier",
            "Supplier": "-"
        },
        {
            "Ingredient": "caprylic/capric triglyceride",
            "Phase": "a",
            "%w/w": 5.5,
            "Function": "emollient",
            "Supplier": "-"
        },
        {
            "Ingredient": "diethylhexyl carbonate",
            "Phase": "a",
            "%w/w": 5.0,
            "Function": "emollient",
            "Supplier": "-"
        },
        {
            "Ingredient": "ethylhexyl palmitate",

In [None]:
# NOTE(review): this free-text question is unrelated to the cosmetic-formula prompt and
# passes a bare string, whereas the other invocations pass a dict with
# ph / viscosity / appearance — confirm whether this cell is still needed.
chain.invoke("What are the 5 pillars of global cooperation?")

In [None]:
# Delete the collection behind vector_db (created above with collection_name="local-rag")
# NOTE(review): other cells open a persisted store under the same collection name —
# confirm that running this does not remove data those cells still rely on.
vector_db.delete_collection()

## Test API

In [18]:
# The Groq API key must never be written into the notebook. It is read from the
# GROQ_API_KEY environment variable and requested interactively (without echo)
# only when that variable is missing or empty.
# NOTE: the key that was previously embedded in this cell has been exposed and
# should be revoked.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

# Hosted Groq chat model; ChatGroq picks the key up from the environment
llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.2
)

# Re-open the persisted "local-rag" collection
loaded_vector_db = Chroma(
    persist_directory='vector_db',
    embedding_function=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag"
)

# Create a MultiQueryRetriever
retriever = MultiQueryRetriever.from_llm(
    loaded_vector_db.as_retriever(), 
    llm
)

In [19]:
# Define the prompt template.
# Literal braces of the JSON example are doubled ({{ and }}) so that only {ph},
# {viscosity}, {appearance} and {context} are treated as template variables.
template = """You are a Cosmetic Formula Generator. Create 3 unique formulas based on these specifications:

pH: {ph}
Viscosity (cps): {viscosity}
Appearance: {appearance}

Use the following context to help generate the formulas:
{context}

Requirements:
1. Generate exactly 3 formulas matching the given specifications.
2. Each formula must have at least 10 ingredients.
3. The sum of %w/w for all ingredients in each formula must equal exactly 100%.
4. Use "di water" to adjust the total if needed and the value is more than 75%.
5. Ensure variety in ingredients across formulas.
6. Use ingredients and functions from unique_ingredient_function.json.
7. Use viscosity_builder.json to estimate viscosity contributions.

Output Format:
Respond ONLY with a JSON object structured as follows:

{{
    "Formulas": [
        {{
            "pH": <pH_value>,
            "Viscosity (cps)": <viscosity_value>,
            "Appearance": "<appearance_description>",
            "Ingredients": [
                {{
                    "Ingredient": "<ingredient_name>",
                    "Phase": "<phase>",
                    "%w/w": <percentage>,
                    "Function": "<function>",
                    "Supplier": "<supplier>"
                }},
                ...
            ]
        }},
        {{...}},
        {{...}}
    ]
}}

Ensure the output is a valid JSON object with no additional text or explanations.
"""

# Build the chat prompt from the template string above
prompt = ChatPromptTemplate.from_template(template)

# Create the chain.
# Every branch of the dict receives the same input dict. RunnablePassthrough()
# therefore put the whole {"ph": ..., "viscosity": ..., "appearance": ...} dict into
# each prompt variable and handed that dict to the retriever as its query.
# itemgetter() extracts the single value each variable needs, and the retriever
# gets a plain-text query built from the three specifications.
chain = (
    {
        "context": (
            lambda specs: (
                f"Cosmetic formula with pH {specs['ph']}, "
                f"viscosity {specs['viscosity']} cps, "
                f"appearance {specs['appearance']}"
            )
        ) | retriever,
        "ph": itemgetter("ph"),
        "viscosity": itemgetter("viscosity"),
        "appearance": itemgetter("appearance")
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Function to generate formulas
def generate_formulas(ph, viscosity, appearance):
    """Invoke the module-level ``chain`` for one set of target specifications.

    Args:
        ph: Target pH, sent under the "ph" key.
        viscosity: Target viscosity, sent under the "viscosity" key.
        appearance: Target appearance, sent under the "appearance" key.

    Returns:
        Whatever ``chain.invoke`` returns for these specifications.
    """
    specifications = dict(ph=ph, viscosity=viscosity, appearance=appearance)
    return chain.invoke(specifications)

# Example usage: generate formulas for one specification and print the reply
result = generate_formulas(ph=5.5, viscosity=29600.0, appearance="white cream")
print(result)

```json
{
    "Formulas": [
        {
            "pH": 5.5,
            "Viscosity (cps)": 29600.0,
            "Appearance": "white cream",
            "Ingredients": [
                {
                    "Ingredient": "cetearyl olivate (and) sorbitan olivate",
                    "Phase": "a",
                    "%w/w": 5.0,
                    "Function": "emulsifier",
                    "Supplier": "-"
                },
                {
                    "Ingredient": "helianthus annuus (sunflower) seed oil",
                    "Phase": "a",
                    "%w/w": 10.0,
                    "Function": "emollient",
                    "Supplier": "-"
                },
                {
                    "Ingredient": "beeswax",
                    "Phase": "a",
                    "%w/w": 5.0,
                    "Function": "emollient",
                    "Supplier": "-"
                },
                {
                    "Ingredient": "hydroxyethyl acrylate

In [21]:
import json
import os
import re

def _extract_json_payload(raw_text):
    """Return the JSON object text contained in an LLM reply.

    Handles a Markdown code fence anywhere in the reply (not only at its very
    start/end) as well as explanatory text before or after the JSON object.
    """
    fenced = re.search(r"```[^\n]*\n(.*?)```", raw_text, re.DOTALL)
    candidate = fenced.group(1) if fenced else raw_text

    # Keep only the span from the first "{" to the last "}" when both exist
    start, end = candidate.find("{"), candidate.rfind("}")
    if start != -1 and end > start:
        candidate = candidate[start:end + 1]
    return candidate.strip()


def add_formulation_to_json(formulation_json_string, filename='json/LLM_Formulations.json'):
    """Append the formulas from an LLM reply to a JSON file.

    Args:
        formulation_json_string: Model reply containing a JSON object with a
            "Formulas" list, optionally wrapped in a Markdown code fence.
        filename: Target file; created (with its parent folder) when missing.

    Raises:
        json.JSONDecodeError: If no valid JSON can be parsed from the reply.
        KeyError: If the parsed object has no "Formulas" key.
    """
    # Convert the JSON string into a Python dictionary
    formulation_data = json.loads(_extract_json_payload(formulation_json_string))
    new_formulas = formulation_data['Formulas']

    if os.path.exists(filename):
        # File already exists: read the stored data and append to it
        with open(filename, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)
        # setdefault tolerates an existing file without a 'Formulations' key
        existing_data.setdefault('Formulations', []).extend(new_formulas)
    else:
        # No file yet: start a new structure
        existing_data = {'Formulations': list(new_formulas)}

    # Make sure the target folder exists before writing
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)

    # Write everything back to the file
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(existing_data, file, indent=2)

    print(f"Data has been added to {filename}")

# Store the formulas contained in the most recent `result`
add_formulation_to_json(result)

Data has been added to json/LLM_Formulations.json
