In [1]:
!pip install openai

# Database options
!pip install chromadb # if you use chromadb as your vector database

# Others
!pip install langchain-community # if you use langchain for orchastration
!pip install transformers #if you use huggingface for vector embedding



In [2]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [3]:
# Enable the GPU if needed: it can speed up vector embedding when the vectors
# are computed locally (i.e. not through an API).
# NOTE(review): `device` is not referenced by the later cells visible here —
# confirm whether it should be passed to the embedding / generation models.

import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

Using device: cuda


In [4]:
# Mount Google Drive at /content/drive; the ingestion cell reads its JSON
# input from a folder under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — this cell will need replacing to run the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!pip install transformers



In [3]:
import os
import json
import tempfile
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Ingestion configuration ------------------------------------------------
# Number of chunks handed to the vector store per add_documents call.
batch_size = 100
# Collects the chunks produced for every file before they are written out.
doc_buffer = []

# Embedding model used to vectorise each chunk.
embedding = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Splitter configured with chunk_size=1000 and no overlap between chunks.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Input folder on the mounted Drive; only entries ending in .json are ingested.
folder_path = "/content/drive/MyDrive/DATATHON2025/cleaned-data"
json_files = list(
    filter(lambda entry: entry.endswith('.json'), os.listdir(folder_path))
)

# The Chroma store is written to a freshly created temporary directory.
# NOTE(review): a temp directory will presumably not survive the Colab
# session — confirm whether the index should live on Drive instead.
persist_directory = tempfile.mkdtemp()
vector_db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

def process_file(filename):
    """Load one JSON export and split its page text into chunked documents.

    The file is read from ``folder_path``. The string values of its
    ``"text_by_page_url"`` mapping are joined with newlines and passed to the
    module-level ``splitter``; every resulting document gets
    ``metadata["source"] = filename``.

    This function runs inside ``executor.map``, where any exception that
    escapes would abort the whole ingestion when the results are consumed.
    Every malformed-input case is therefore reported and turned into an
    empty result instead of being raised.

    Args:
        filename: Name of a file inside ``folder_path``.

    Returns:
        The list of chunk documents, or ``[]`` when the file cannot be read
        or parsed, has an unexpected structure, contains no text, or fails
        to chunk.
    """
    file_path = os.path.join(folder_path, filename)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Skipping {filename}: {e}")
        return []

    # Valid JSON is not necessarily an object: a top-level list, string or
    # number has no .get() and would raise AttributeError below.
    if not isinstance(data, dict):
        print(f"Skipping {filename}: expected a JSON object, got {type(data).__name__}")
        return []

    # `or {}` also covers an explicit null value for the key.
    page_texts = data.get("text_by_page_url") or {}
    if not isinstance(page_texts, dict):
        print(f"Skipping {filename}: 'text_by_page_url' is not a mapping")
        return []

    # str.join raises TypeError on non-string items (e.g. a null page), so
    # only textual pages are kept.
    full_text = "\n".join(
        text for text in page_texts.values() if isinstance(text, str)
    ).strip()

    if not full_text:
        return []

    try:
        documents = splitter.create_documents([full_text])
        for doc in documents:
            doc.metadata["source"] = filename
        return documents
    except Exception as e:
        print(f"Chunking failed for {filename}: {e}")
        return []

# Parse and chunk the files concurrently. executor.map yields results lazily
# and has no length, so tqdm is given an explicit total; without it the
# display is only a running counter with no bar, percentage or ETA.
with ThreadPoolExecutor() as executor:
    results = list(
        tqdm(
            executor.map(process_file, json_files),
            total=len(json_files),
            desc="Processing files",
            unit="file",
        )
    )

# Flatten the per-file chunk lists into a single buffer.
for documents in results:
    if documents:
        doc_buffer.extend(documents)

# Write the chunks in batches of `batch_size`; a failing batch is reported
# and skipped so the remaining batches are still stored.
for i in range(0, len(doc_buffer), batch_size):
    try:
        vector_db.add_documents(doc_buffer[i:i + batch_size])
    except Exception as e:
        print(f"Failed saving batch: {e}")

# NOTE(review): the recorded output shows a warning pointing at this call —
# presumably a deprecation notice; confirm against the installed Chroma version.
vector_db.persist()
print(f"Vector store saved to: {persist_directory}")


Processing files: 13144file [03:37, 60.48file/s] 


Vector store saved to: /tmp/tmpnrnwa9gq


  vector_db.persist()


In [4]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Local text2text generation model, wrapped so LangChain can call it as an LLM.
pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    max_new_tokens=256,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Retriever over the vector store: MMR search configured with k=10.
retriever = vector_db.as_retriever(
    search_kwargs={"k": 10},
    search_type="mmr",
)

# Prompt that receives the retrieved context and the user's question.
custom_prompt = PromptTemplate(
    template="""
You are an expert assistant helping a Supply Chain Director assess vendors, suppliers, and logistics risks.

Use the context below to answer the question accurately and comprehensively.
List **all relevant entities, companies, or insights**, if available.

- If multiple companies are involved, list them all with brief descriptions.
- If the answer is unclear or missing, say: "Based on the available data, the answer isn't conclusive."
- If reasoning is needed (e.g., counting, comparing), explain your logic step-by-step.

Context:
{context}

Question:
{question}

Answer:
""",
    input_variables=["context", "question"],
)

# "stuff" chain: the retrieved chunks are placed into the prompt's {context}.
# NOTE(review): ten retrieved chunks may exceed what flan-t5-base can read in
# one prompt — confirm the model's input limit against k and chunk_size.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=dict(prompt=custom_prompt),
)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)b68e004ad934361fb35b9b2bd50b45ea90790fc8:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [7]:
# Example question 1: run it through the retrieval-QA chain and show the answer.
query = "What company provides assisted living near Richmond, Virginia?"
response = qa.invoke({"query": query})
print(response["result"])


Covenant Woods


In [11]:
# Example question 2: run it through the retrieval-QA chain and show the answer.
query1 = "Who uses Agile Methodologies to deal with Marketing in Fort Lauderdale, FL?"
response1 = qa.invoke({"query": query1})
print(response1["result"])


Strategic Marketing


In [13]:
# Example question 3: run it through the retrieval-QA chain and show the answer.
query2 = "What companies uses packaging materials in Valencia, California?"
response2 = qa.invoke({"query": query2})
print(response2["result"])

Grupo Phoenix
