In [1]:
# pip install pymongo pdfplumber sentence-transformers
# pip install pdfplumber
# pip install python-docx
# pip install sentence-transformers
# pip install chromadb


## **Importing the files**

In [1]:
import os
import pdfplumber
import docx

# Folder holding the source documents (relative to the notebook's working directory).
data_folder = "LLM Data"  # Change if your folder path is different

# List the folder once. os.listdir returns names in arbitrary order, so sort them
# to make the processing order and the printed previews the same on every run.
all_files = sorted(os.listdir(data_folder))

# Match extensions case-insensitively so e.g. "REPORT.PDF" is not silently skipped.
pdf_files = [
    os.path.join(data_folder, f)
    for f in all_files
    if f.lower().endswith(".pdf")
]
# Skip "~$name.docx" entries: Word creates these lock files while a document is
# open, and they are not readable documents.
docx_files = [
    os.path.join(data_folder, f)
    for f in all_files
    if f.lower().endswith(".docx") and not f.startswith("~$")
]

print(f"Found {len(pdf_files)} PDFs and {len(docx_files)} Word documents.")

# Maps each source file path to the full text extracted from it.
document_texts = {}

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages in the PDF at ``pdf_path``.

    Each page is passed through ``page.extract_text()``; pages for which that
    call returns nothing are skipped. The remaining page texts are joined with
    newlines and the result is stripped of leading/trailing whitespace.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract while the file is still open; one extract_text() call per page.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(page_text for page_text in page_texts if page_text).strip()

def extract_text_from_docx(docx_path):
    """Return the text of the Word document at ``docx_path``.

    Iterates over ``doc.paragraphs``, drops paragraphs that are empty or
    whitespace-only, and joins the rest with newlines. The result is stripped
    of leading/trailing whitespace.
    """
    document = docx.Document(docx_path)
    non_blank_paragraphs = (
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip()
    )
    return "\n".join(non_blank_paragraphs).strip()

# Extract every PDF first, then every Word document, keyed by file path.
document_texts.update((path, extract_text_from_pdf(path)) for path in pdf_files)
document_texts.update((path, extract_text_from_docx(path)) for path in docx_files)

# Show the first 500 characters of each document as a quick sanity check.
for source_path, extracted in document_texts.items():
    preview = extracted[:500]
    print(f"\nExtracted from {os.path.basename(source_path)}:\n{preview}...\n")


Found 5 PDFs and 3 Word documents.

Extracted from 2010-OCA-REP001-01_ASOW South Construction and Installation Strategy_TRAINING.pdf:
ASOW South Construction and
Installation Strategy
Client Shell New Energies US LLC
Subject Atlantic Shores OWF
Document ID 2010-OCA-REP001 Prepared Nick Wordsworth 15/07/2020
Version 01 Reviewed Jon Chamber 15/07/2020
Date 15/07/2020 Approved Nick Wordsworth 15/07/2020
ASOW South Construction and Installation Strategy
Atlantic Shores OWF
Contents
Tables ...................................................................................................................................................


Extracted from Fabrication_infrastructure_and_logistics.pdf:
FLOATING WIND JOINT INDUSTRY PROGRAMME
Fabrication,
infrastructure and
logistics
Project summary
September 2023
Contents
FABRICATION, INFRASTRUCTURE & LOGISTICS (FIL) ................................................... 3
Introduction .................................................................

## **Chunking and Embedding**

In [2]:
import re

def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """
    Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``, so
            all runs of whitespace (including newlines) collapse to one space.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings. Empty or whitespace-only text gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. Without this check an overlap larger than the
            chunk size would silently produce no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break

    return chunks


In [3]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model applied to every chunk.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Maps each source file path to a list of {"chunk", "embedding"} records.
document_embeddings = {}

# Chunk each document, embed all of its chunks in one encode() call, and pair
# every chunk with its vector.
for source_path, full_text in document_texts.items():
    pieces = split_text_into_chunks(full_text)
    vectors = model.encode(pieces)

    records = []
    for piece, vector in zip(pieces, vectors):
        # .tolist() turns the per-chunk vector into a plain Python list.
        records.append({"chunk": piece, "embedding": vector.tolist()})
    document_embeddings[source_path] = records

print("Embeddings created successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Embeddings created successfully!


In [4]:
# Spot-check: for each file show the first two chunks (200 characters each)
# together with the first five components of their embedding vectors.
for source_path, records in document_embeddings.items():
    print(f"File: {source_path}")
    for position, record in enumerate(records[:2], start=1):
        chunk_preview = record['chunk'][:200]
        vector_head = record['embedding'][:5]
        print(f"Chunk {position}: {chunk_preview}...")
        print(f"Embedding: {vector_head}...")
        print("\n---\n")


File: LLM Data\2010-OCA-REP001-01_ASOW South Construction and Installation Strategy_TRAINING.pdf
Chunk 1: ASOW South Construction and Installation Strategy Client Shell New Energies US LLC Subject Atlantic Shores OWF Document ID 2010-OCA-REP001 Prepared Nick Wordsworth 15/07/2020 Version 01 Reviewed Jon C...
Embedding: [0.02744266949594021, 0.029270712286233902, 0.006209631450474262, -0.023006215691566467, 0.01306066568940878]...

---

Chunk 2: ............................................................................................................................ 30 5.3 Lifting Assessment ....................................................
Embedding: [-0.059674493968486786, -0.004700467921793461, -0.05902133509516716, -0.09297521412372589, -0.12768973410129547]...

---

File: LLM Data\Fabrication_infrastructure_and_logistics.pdf
Chunk 1: FLOATING WIND JOINT INDUSTRY PROGRAMME Fabrication, infrastructure and logistics Project summary September 2023 Contents FABRICATION, INFRASTRUCT

In [5]:
import chromadb

# Initialize ChromaDB client; data is persisted on disk under ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Local storage

# Create a new collection (or connect to an existing one)
collection = chroma_client.get_or_create_collection("document_embeddings")

# Number of chunks sent to ChromaDB per call. Batching avoids one round-trip per
# chunk while keeping each request comfortably small.
STORE_BATCH_SIZE = 500

# Store embeddings in ChromaDB.
# upsert (rather than add) makes this cell safe to re-run: chunks whose ID already
# exists are overwritten with the current text/vector instead of being left stale.
for file, data in document_embeddings.items():
    for start in range(0, len(data), STORE_BATCH_SIZE):
        batch = data[start:start + STORE_BATCH_SIZE]
        collection.upsert(
            # Same "<file path>_<chunk index>" ID scheme as before.
            ids=[f"{file}_{start + offset}" for offset in range(len(batch))],
            documents=[item["chunk"] for item in batch],
            metadatas=[{"file_name": file} for _ in batch],
            embeddings=[item["embedding"] for item in batch],
        )

print("Embeddings successfully stored in ChromaDB!")


Embeddings successfully stored in ChromaDB!


In [6]:
def query_chromadb(query, model, top_n=3):
    """Embed ``query`` with ``model``, search the collection, and print the hits.

    Args:
        query: Natural-language question to search for.
        model: Object with an ``encode(list_of_str)`` method returning one
            vector per input (the SentenceTransformer loaded above).
        top_n: Number of nearest chunks to retrieve.

    Prints each retrieved chunk with the distance ChromaDB reports for it and
    returns ``None``. Reads the module-level ``collection``.

    NOTE(review): a later cell redefines ``query_chromadb`` with a different
    signature (no ``model`` argument); once that cell has run, this call form
    no longer works.
    """
    # Generate embedding for the query
    query_embedding = model.encode([query])[0]
    # Pass a plain list, matching how the stored vectors were converted with
    # .tolist() before being written to the collection.
    if hasattr(query_embedding, "tolist"):
        query_embedding = query_embedding.tolist()

    # Query ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_n
    )

    # Print results. The values under "distances" are distances, not similarity
    # scores: a smaller number means a closer match.
    for chunk, distance in zip(results["documents"][0], results["distances"][0]):
        print(f"Chunk: {chunk}")
        print(f"Distance (lower = more similar): {distance}\n")

# Example query
query = "What are OCA's services?"
query_chromadb(query, model)


Chunk: of offshore construction. The breadth of services and experience delivered by the OCA team is shown in Figure 1.1. These services have been carried out for projects in several geographic regions including in the UK and US as well as projects within European and APAC countries. Figure 1.1 OCA Experience by Product Line At the core of OCA’s approach is providing expertise gained in the execution of offshore windfarms back into the development phase of projects in new and emerging markets. OCA’s Mission and Values OCA’s mission is to provide high-quality advice to support safe delivery of offshore wind projects and to generate value for the industry and the communities in which we live and work. We aim to achieve our vision through our four core values, as shown below.
Similarity Score: 0.8526894429183445

Chunk: different stages and learn from lots of different people along the way. Life at OCA Cameron McPartland Senior Consultant, USA What did you do before working at OCA? Previo

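## **RUN FROM HERE** (once the embeddings above have been stored in ChromaDB, later sessions can start at this cell)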

In [22]:
import chromadb

# Connect to your existing ChromaDB instance
# (the same on-disk path the embeddings were stored under earlier in this notebook).
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Load the existing embeddings collection
# NOTE(review): no embedding function is passed here, so text queries against this
# collection presumably use Chroma's default embedder — confirm it matches the
# 'all-MiniLM-L6-v2' model the stored vectors were produced with.
collection = chroma_client.get_collection("document_embeddings")

print("✅ Successfully connected to existing embeddings in ChromaDB!")


✅ Successfully connected to existing embeddings in ChromaDB!


In [23]:
def query_chromadb(query, top_n=3):
    """Return the ``top_n`` best-matching chunks for ``query`` as one string.

    Sends the raw query text to the module-level ``collection`` and joins the
    returned document chunks with blank lines. Nothing is printed.
    """
    hits = collection.query(query_texts=[query], n_results=top_n)

    # One query was sent, so its documents are in the first (only) result row.
    top_documents = hits["documents"][0]
    context_block = "\n\n".join(top_documents)
    return context_block



In [6]:
# from huggingface_hub import login
# login(token=os.environ["HF_TOKEN"])  # token redacted: never commit a real token; revoke the one previously pasted here


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "microsoft/phi-2"
# save_path = "./saved_models/phi-2"  # Change to any local folder

# # Download and save model locally
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.save_pretrained(save_path)  # Saves tokenizer

# model = AutoModelForCausalLM.from_pretrained(model_name)
# model.save_pretrained(save_path)  # Saves model

# print(f"Model saved locally at: {save_path}")


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory the Phi-2 tokenizer and weights are loaded from.
local_path = "./saved_models/phi-2"  # Path to saved model

# Load model from local storage
# NOTE(review): this rebinds the global `model`, which an earlier cell assigned to
# the SentenceTransformer embedding model. After this cell runs, code that still
# expects `model.encode(...)` (e.g. the earlier `query_chromadb(query, model)` call)
# receives the causal LM instead — consider a distinct name such as `llm_model`.
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = AutoModelForCausalLM.from_pretrained(local_path, device_map="cpu")

print("Loaded Phi-2 from local storage!")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  5.20it/s]

Loaded Phi-2 from local storage!





In [9]:
# def generate_rag_response(user_query):
#     # Retrieve relevant context from ChromaDB (pass the model if needed)
#     context = query_chromadb(user_query)

#     # Construct an LLM-friendly prompt
#     prompt = f"""
#     You are an AI assistant trained on Offshore Construction Associates (OCA) data.
#     Use the following document information to answer accurately:

#     Context:
#     {context}

#     If you cannot find the answer, respond with "I don't have enough information to answer that."
    
#     Question: {user_query}
#     Answer:
#     """

#     # Tokenize input
#     inputs = tokenizer(prompt, return_tensors="pt")

#     # Generate a response
#     output = model.generate(**inputs, max_new_tokens=200, do_sample=True)
#     response = tokenizer.decode(output[0], skip_special_tokens=True)

#     print("\n💡 **Generated Answer:**")
#     print(response)
#     return response


In [10]:
# generate_rag_response("What are the services offered by OCA?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🔹 Retrieved Context:
1. of offshore construction. The breadth of services and experience delivered by the OCA team is shown in Figure 1.1. These services have been carried out for projects in several geographic regions including in the UK and US as well as projects within European and APAC countries. Figure 1.1 OCA Experience by Product Line At the core of OCA’s approach is providing expertise gained in the execution of offshore windfarms back into the development phase of projects in new and emerging markets. OCA’s Mission and Values OCA’s mission is to provide high-quality advice to support safe delivery of offshore wind projects and to generate value for the industry and the communities in which we live and work. We aim to achieve our vision through our four core values, as shown below.
2. different stages and learn from lots of different people along the way. Life at OCA Cameron McPartland Senior Consultant, USA What did you do before working at OCA? Previously to my offshore wind

KeyboardInterrupt: 

In [11]:
# pip install requests


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# import requests

# # Set your Groq API key
# GROQ_API_KEY = os.environ["GROQ_API_KEY"]  # key redacted: load it from the environment; revoke the one previously pasted here

# # Define the API endpoint
# GROQ_API_URL = "https://api.groq.com/v1/chat/completions"

# # Function to call Groq's LLM
# def ask_groq_llm(prompt, model="mistral-7b", max_tokens=300):
#     headers = {
#         "Authorization": f"Bearer {GROQ_API_KEY}",
#         "Content-Type": "application/json",
#     }

#     payload = {
#         "model": model,  # Options: "mistral-7b", "gemma-7b", "llama3-8b"
#         "messages": [{"role": "system", "content": "You are a helpful assistant."},
#                      {"role": "user", "content": prompt}],
#         "max_tokens": max_tokens,
#         "temperature": 0.7  # Adjust for randomness
#     }

#     response = requests.post(GROQ_API_URL, json=payload, headers=headers)
    
#     if response.status_code == 200:
#         return response.json()["choices"][0]["message"]["content"]
#     else:
#         return f"Error: {response.status_code} - {response.text}"

# # Example Usage
# response = ask_groq_llm("What are the key services offered by Offshore Construction Associates?")
# print("\n💡 **Groq's LLM Response:**")
# print(response)



💡 **Groq's LLM Response:**
Error: 404 - {"error":{"message":"Unknown request URL: POST /v1/chat/completions. Please check the URL for typos, or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}



In [24]:
import requests

import os

# Read the Groq API key from the environment instead of hardcoding it: a key
# committed in a notebook is exposed to everyone who can read the file.
# Set it before starting Jupyter, e.g.  export GROQ_API_KEY=...
# Any key that was previously pasted into this notebook should be revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    print("⚠️ GROQ_API_KEY is not set; requests to Groq will be rejected until it is.")

# Correct Groq API Endpoint
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"

# Function to call Groq's LLM
def ask_groq_llm(prompt, model="mixtral-8x7b-32768", max_tokens=300, timeout=30):
    """Send a single-turn chat request to Groq and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Groq model identifier placed in the request payload.
        max_tokens: Upper bound on generated tokens, passed through to the API.
        timeout: Seconds to wait for the HTTP request before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The assistant message content on HTTP 200. On any other status, or if
        the request itself fails (timeout, connection error), a string starting
        with ``"Error:"`` describing the failure. No exception is raised for
        these cases.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": model,  # ✅ Use a valid Groq model
        "messages": [{"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.7  # Adjust for randomness
    }

    try:
        response = requests.post(GROQ_API_URL, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the function's "return an error string" contract for network failures too.
        return f"Error: request failed - {exc}"

    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    return f"Error: {response.status_code} - {response.text}"

# Test with a valid model
response = ask_groq_llm("Explain Offshore Wind Energy in simple terms in 1 sentence?", model="mixtral-8x7b-32768")
print("\n💡 **Groq's LLM Response:**")
print(response)



💡 **Groq's LLM Response:**
Offshore wind energy refers to generating electricity by using wind turbines that are installed in bodies of water, such as oceans or large lakes, taking advantage of the strong and consistent winds found offshore.


In [25]:
# Ask the bare LLM (no retrieved context) a company-specific question.
ungrounded_question = "What are the services offered by OCA?"
response = ask_groq_llm(ungrounded_question, model="mixtral-8x7b-32768")
print("\n💡 **Groq's LLM Response:**", response, sep="\n")


💡 **Groq's LLM Response:**
I'm glad you find me helpful! OCA, or the Office of Civilian Aviation, is a fictional organization and may not provide specific services. However, in the context of aviation, a civil aviation authority is typically responsible for overseeing and regulating non-military aviation, including issuing airworthiness certificates, registering aircraft, licensing pilots, and enforcing safety standards. If you're referring to a different OCA, please provide more context so I can give a more accurate response.


## **Restrict the LLM to Use Only ChromaDB Knowledge**

In [26]:
def generate_rag_response(user_query):
    """Answer ``user_query`` using only chunks retrieved from ChromaDB.

    Retrieves context with ``query_chromadb``, embeds it in a prompt that tells
    the model to answer from that context alone, sends the prompt to Groq via
    ``ask_groq_llm``, prints a header line, and returns the model's reply string.
    """
    # Retrieval: the joined text of the best-matching document chunks.
    retrieved_context = query_chromadb(user_query)

    # Prompt: instructions, then the retrieved context, then the question.
    prompt = f"""
    You are OCA Assistant, a specialized AI trained exclusively on Offshore Construction Associates (OCA) data.
    
    Your role is to provide accurate responses using ONLY the provided document information.
    If the requested information is not found in the retrieved context, respond with:
    "I don’t have enough information to answer that."

    📄 **Context from OCA's documents:**
    {retrieved_context}

    ❓ **User Question:** {user_query}
    
    💡 **Answer:**
    """

    # Generation: Groq's LLM sees only the prompt built above.
    answer = ask_groq_llm(prompt, model="mixtral-8x7b-32768")

    print("\n💡 **Generated Answer from OCA Assistant:**")
    return answer


In [27]:
# Grounded query: the returned answer string is shown as this cell's result.
generate_rag_response("What is OCA? Explain in 1 short line")



💡 **Generated Answer from OCA Assistant:**


'OCA is a company providing high-quality advice and expertise for offshore wind project development and execution.'

In [29]:
# Second grounded query; the returned answer string is shown as this cell's result.
generate_rag_response("How to install an offshore substation?")



💡 **Generated Answer from OCA Assistant:**


'The installation of an offshore substation typically involves the following steps:\n\n1. Selection of the Vessel: The choice of vessel for installing an offshore substation is likely to be driven by market factors and the vessels often serve other markets. Four main types of vessel may be used, including sheerleg crane vessel, barge, heavy lift vessel, and semisubmersible vessel. The vessels used in offshore wind include Rambiz, Stanislav Yudin, and Samson, with crane ratings from 900 tonnes to over 3,000 tonnes.\n\n2. Positioning the Substation: The substation is positioned on pre-installed foundations. This work is included in the substation installation contract.\n\n3. Day Rates: Day rates for most substation installation vessels are about £180,000. Semisubmersible vessels may have day rates greater than £450,000, but if the oil and gas market is quiet, rates may be more competitive.\n\n4. Suppliers: Operators include Bonn & Mees, DBB, Huisman, Saipem, Scaldis Salvage & Marine, and