In [None]:
# !pip install "protobuf<6.0.0,>=3.20.3" --force-reinstall
# !pip install numpy==1.26.4 --force-reinstall
!pip install openai
!pip install chromadb
!pip install langchain-community
!pip install transformers
!pip install --upgrade torch transformers
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install sentence-transformers

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import os
import json
import numpy
import time
import tempfile
import spacy
import openai
from typing import List

import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

from sklearn.cluster import KMeans
from transformers import pipeline
from transformers import GPT2Tokenizer
from sentence_transformers import SentenceTransformer

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA

import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Do not paste a key into the notebook. `setdefault` keeps a key that is already
# exported in the environment and only falls back to the empty placeholder.
os.environ.setdefault("OPENAI_API_KEY", "")

In [None]:
# Mount Google Drive so the dataset folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

# Dataset location on the shared drive, and the .json files found directly in it.
folder_path = "/content/drive/Shared drives/Datathon/Data/hackathon_data/"
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(folder_path)))

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# NOTE(review): this cell repeats the two assignments made in the Drive-mount cell;
# presumably kept so the path and listing can be refreshed without remounting.
folder_path = "/content/drive/Shared drives/Datathon/Data/hackathon_data/"  # Google Drive path of the dataset
json_files = [entry for entry in os.listdir(folder_path) if entry.endswith('.json')]

In [None]:
# DB setup: a fresh temporary directory is created on every run and handed to
# Chroma as its persist directory.
# NOTE(review): `chroma_client` and `collection` are only referenced again in
# commented-out code in this notebook; the index that is actually queried is
# built later with `Chroma.from_documents`.
persist_directory = tempfile.mkdtemp()
chroma_client = chromadb.Client(Settings(persist_directory=persist_directory, anonymized_telemetry=False))
collection = chroma_client.get_or_create_collection(name="biz_web_chunks")

# Embedder model (sentence-transformers). `embed_sentences` and
# `retrieve_relevant_documents` call `.encode` on this global.
# NOTE(review): the index-building cell rebinds `embedding_model` to a LangChain
# `HuggingFaceEmbeddings` object, so those helpers see a different type afterwards.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# LLM: a Hugging Face text2text-generation pipeline running t5-small, wrapped so
# it can be used as a LangChain LLM (`llm` is called by
# `generate_answer_from_documents`).
pipe = pipeline("text2text-generation", model="t5-small")
llm = HuggingFacePipeline(pipeline=pipe)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
# GPT-2 tokenizer: used by chunk_text_by_token_limit to encode and decode tokens.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# spaCy English pipeline: used for sentence splitting and named-entity labels.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): `max_token` is not read anywhere in the visible notebook; the
# helpers use their own literal 128 defaults instead.
max_token = 128
# Default character cap used by chunk_text_by_length_limit.
MAX_CHUNK_SIZE = 1000000

def load_text_from_json(path):
    """Load one JSON file and merge the text of all its pages into a single string.

    The file is expected to hold a JSON object with an optional "url" entry and
    an optional "text_by_page_url" mapping of page URL -> page text.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``(source_url, all_text)`` tuple. ``source_url`` is the object's "url"
        value, or "unknown" when absent. ``all_text`` is every page's text, each
        preceded by a ``--- URL: <page url> ---`` separator, or "" (after
        printing a warning) when the file has no "text_by_page_url" key.
    """
    # Read as UTF-8 explicitly (the standard JSON encoding) so the result does
    # not depend on the platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    source_url = data.get("url", "unknown")

    if "text_by_page_url" not in data:
        print(f"Warning: No 'text_by_page_url' found in {path}")
        return source_url, ""

    # Build the combined text in one pass instead of repeated string `+=`.
    all_text = "".join(
        f"\n\n--- URL: {url} ---\n\n{page_text}"
        for url, page_text in data["text_by_page_url"].items()
    )

    return source_url, all_text

def embed_sentences(text):
    """Split text into sentences with spaCy and embed every sentence.

    Text longer than ``nlp.max_length`` characters cannot be handed to spaCy in
    one call, so it is first cut into pieces with ``chunk_text_by_token_limit``
    and each piece is sentence-split separately.

    Args:
        text: The raw text to split and embed.

    Returns:
        A ``(sentences, embeddings)`` tuple: the stripped sentence strings and
        the result of one ``embedding_model.encode(sentences,
        convert_to_tensor=True)`` call. Both the short-text and the long-text
        path now return the same kind of object for ``embeddings``; the long
        path used to return a Python list of per-sentence tensors.

    NOTE(review): relies on the global ``embedding_model`` exposing ``.encode``.
    The index-building cell rebinds that global to a LangChain embeddings
    object, so call this before that cell runs (or restore the model) — confirm.
    """
    if len(text) > nlp.max_length:
        print(f"Warning: Text exceeds {nlp.max_length} characters, splitting into smaller chunks.")
        pieces = chunk_text_by_token_limit(text, max_tokens=128)
    else:
        pieces = [text]

    # Sentence-split piece by piece, then embed everything in a single call so
    # the return type does not depend on the input length.
    sentences = [sent.text.strip() for piece in pieces for sent in nlp(piece).sents]
    embeddings = embedding_model.encode(sentences, convert_to_tensor=True)
    return sentences, embeddings

def extract_entities(text):
    """Return the distinct named-entity *labels* spaCy finds in the text.

    Only the label of each entity (``ent.label_``, e.g. "ORG") is kept, not the
    entity text itself.

    Args:
        text: Text to run through the spaCy pipeline.

    Returns:
        A list of unique label strings in sorted order. Sorting makes the result
        (and the "labels" metadata built from it) identical from run to run,
        which a plain ``list(set(...))`` does not guarantee.
    """
    doc = nlp(text)
    return sorted({ent.label_ for ent in doc.ents})

"""
DIFFERENT CHUNKING TECHNIQUES

During our experiments we chunked the data in several ways:

paragraph_chunk(text, max_chars=1000): groups consecutive paragraphs up to a character limit
chunk_text_by_length_limit(text, max_chars=MAX_CHUNK_SIZE): groups sentences up to a character limit
chunk_text_by_token_limit(text, max_tokens=128): splits text by a fixed token limit
semantic_chunking(text, max_chunk_size=1_000_000, max_tokens=128): groups sentences by semantic similarity
"""

def split_into_sentences(text: str) -> List[str]:
    """Return the sentences spaCy detects in ``text``, each stripped of surrounding whitespace."""
    parsed = nlp(text)
    result = []
    for span in parsed.sents:
        result.append(span.text.strip())
    return result

def paragraph_chunk(text, max_chars=1000):
    """Group consecutive non-empty lines (paragraphs) into chunks of roughly ``max_chars``.

    Lines are stripped, blank lines are dropped, and paragraphs are joined with a
    single space while the running length stays below ``max_chars``. A paragraph
    that is itself longer than the limit is kept whole as its own chunk.

    Args:
        text: Text whose paragraphs are separated by newlines.
        max_chars: Character budget per chunk.

    Returns:
        A list of non-empty, stripped chunk strings, in document order.
    """
    paragraphs = [line.strip() for line in text.split("\n") if line.strip()]
    chunks = []
    current = ""
    for p in paragraphs:
        if len(current) + len(p) < max_chars:
            current += " " + p
        else:
            # Only flush when something has been collected; otherwise an
            # over-long first paragraph would emit an empty leading chunk.
            if current:
                chunks.append(current.strip())
            current = p
    if current:
        chunks.append(current.strip())
    return chunks

def chunk_text_by_length_limit(text, max_chars=MAX_CHUNK_SIZE):
    """Chunk text into pieces of at most ``max_chars`` characters.

    Sentences (split after ``.``, ``!`` or ``?`` followed by whitespace) are
    packed greedily into chunks. A single sentence longer than ``max_chars`` —
    for example text with no sentence punctuation at all — is hard-split into
    ``max_chars``-sized slices, so the limit also holds when this function is
    used as a size fallback.

    Args:
        text: Text to chunk.
        max_chars: Maximum chunk length in characters. For values <= 0 no hard
            splitting is attempted and each sentence becomes its own chunk.

    Returns:
        A list of stripped, non-empty chunk strings, in document order.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # Hard-split a sentence that could never fit into a single chunk.
        while max_chars > 0 and len(sentence) > max_chars:
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            piece = sentence[:max_chars].strip()
            if piece:
                chunks.append(piece)
            sentence = sentence[max_chars:].lstrip()

        if not sentence:
            continue

        # The +1 accounts for the joining space.
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += " " + sentence if current_chunk else sentence
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def chunk_text_by_token_limit(text, max_tokens=128):
    """Split text into consecutive chunks of at most ``max_tokens`` tokens.

    The text is encoded once with the global ``tokenizer``; the token ids are
    cut into back-to-back windows of ``max_tokens`` and each window is decoded
    to a string. Every token lands in exactly one chunk (the earlier loop
    produced ``max_tokens + 1`` tokens per chunk and repeated the boundary
    token at the start of the next chunk).

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        A list of decoded chunk strings, in order. Empty when the text encodes
        to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")

    tokens = tokenizer.encode(text, truncation=False)

    return [
        tokenizer.decode(tokens[start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, len(tokens), max_tokens)
    ]

# Embedding models already loaded by semantic_chunking, keyed by model name.
_SENTENCE_EMBEDDER_CACHE = {}


def _get_sentence_embedder(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return a cached HuggingFaceEmbeddings instance so the model is loaded once, not per call."""
    if model_name not in _SENTENCE_EMBEDDER_CACHE:
        _SENTENCE_EMBEDDER_CACHE[model_name] = HuggingFaceEmbeddings(model_name=model_name)
    return _SENTENCE_EMBEDDER_CACHE[model_name]


def semantic_chunking(text, max_chunk_size=1_000_000, max_tokens=128):
    """Group the sentences of ``text`` into chunks by embedding similarity.

    Sentences are split with a punctuation regex, embedded, and clustered with
    KMeans (``max(2, n_sentences // 5)`` clusters, fixed random state). The
    sentences of each cluster are joined with spaces, in original order, and
    clusters are emitted in ascending label order. A joined cluster longer than
    ``max_chunk_size`` characters is re-split with
    ``chunk_text_by_length_limit``.

    Args:
        text: Text to chunk.
        max_chunk_size: Maximum characters per chunk before the length-based
            fallback is applied.
        max_tokens: Currently unused; kept so existing callers keep working.

    Returns:
        A list of chunk strings. Empty or whitespace-only text yields ``[]``;
        text with a single sentence is returned as one chunk (subject to the
        size fallback) without clustering, since KMeans cannot form two
        clusters from one sample.
    """
    # Step 1: Split into sentences. re.split returns [''] for empty input, so
    # empty pieces are filtered out to make the emptiness check meaningful.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]
    if not sentences:
        return []

    if len(sentences) == 1:
        sentence_groups = [sentences]
    else:
        # Step 2: Embed sentences
        sentence_embeddings = _get_sentence_embedder().embed_documents(sentences)

        # Step 3: Cluster sentences (for n >= 2 the cluster count never exceeds n)
        num_clusters = max(2, len(sentences) // 5)
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        labels = kmeans.fit_predict(sentence_embeddings)

        clustered_texts = {}
        for label, sentence in zip(labels, sentences):
            clustered_texts.setdefault(label, []).append(sentence)
        sentence_groups = [clustered_texts[label] for label in sorted(clustered_texts.keys())]

    # Step 4: Join each group and conditionally split large chunks
    chunks = []
    for group in sentence_groups:
        chunk = " ".join(group)
        if len(chunk) > max_chunk_size:
            print(f"[!] Large chunk ({len(chunk)} chars), using length-based fallback.")
            sub_chunks = chunk_text_by_length_limit(chunk, max_chars=max_chunk_size)
            chunks.extend(sub_chunks)
        else:
            chunks.append(chunk)

    return chunks

def create_documents(text_chunks, source_url):
    """Wrap every non-blank chunk in a LangChain Document tagged with its source.

    Each Document carries the chunk as ``page_content`` and a metadata dict with
    "source" (the given URL) and "labels" (the comma-joined result of
    ``extract_entities`` for that chunk). Blank or whitespace-only chunks are
    skipped.
    """
    def _to_document(piece):
        # Metadata values are plain strings, so the label list is flattened.
        ner_labels = extract_entities(piece)
        return Document(
            page_content=piece,
            metadata={"source": source_url, "labels": ",".join(ner_labels)},
        )

    return [_to_document(piece) for piece in text_chunks if piece.strip()]

"""
DIFFERENT .JSON PROCESSING FUNCTIONS

To use during our experimentation, we have written multiple .json processing functions.

process_specific_json_files(json_folder_path, specific_files): processes specific files through the pipeline to check the results
process_all_json_files(json_folder_path): processes all files through the pipeline to check the results
"""
def process_specific_json_files(json_folder_path, specific_files):
    """Run the load -> chunk -> document pipeline on a chosen list of JSON files.

    Args:
        json_folder_path: Folder that contains the JSON files.
        specific_files: File names (relative to the folder) to process, in order.

    Returns:
        A flat list with the documents produced for every file that exists.
        Files that are not found are reported with a warning and skipped rather
        than being dropped silently.
    """
    all_documents = []
    total = len(specific_files)

    for position, json_file in enumerate(specific_files, start=1):
        file_path = os.path.join(json_folder_path, json_file)

        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found, skipping ({position}/{total}).")
            continue

        print(f"Processing {json_file} ({position}/{total})...")  # Showing progress

        # Load the content from the JSON file
        source_url, all_text = load_text_from_json(file_path)

        # Split the text into chunks (semantic chunking is the strategy in use)
        text_chunks = semantic_chunking(all_text)

        # Create documents for each chunk, tagged with the source URL
        all_documents.extend(create_documents(text_chunks, source_url))

    return all_documents

def process_all_json_files(json_folder_path):
    """Run the load -> chunk -> document pipeline on every .json file in a folder.

    Each file is loaded with ``load_text_from_json``, chunked with
    ``semantic_chunking`` and turned into documents with ``create_documents``
    (which needs both the chunks and the source URL; passing only the file path
    raised a TypeError).

    Args:
        json_folder_path: Folder whose ``.json`` files are processed.

    Returns:
        A flat list with the documents of all files, processed in sorted
        file-name order so repeated runs give the same ordering.
    """
    json_files = sorted(f for f in os.listdir(json_folder_path) if f.endswith('.json'))

    all_documents = []

    for json_file in json_files:
        file_path = os.path.join(json_folder_path, json_file)
        print(f"Processing {json_file}...")

        source_url, all_text = load_text_from_json(file_path)
        text_chunks = semantic_chunking(all_text)
        all_documents.extend(create_documents(text_chunks, source_url))

    return all_documents

def retrieve_relevant_documents(query, vectorstore, top_k=5):
    """Retrieve the documents most similar to a query from the vector store.

    The query is embedded with the global ``embedding_model``. That global is a
    SentenceTransformer (``.encode``) right after setup and a LangChain
    embeddings object (``.embed_query``) after the index-building cell, so both
    interfaces are supported. The vector is converted to a plain list of floats
    before the search instead of being passed as a tensor.

    Args:
        query: The question text.
        vectorstore: Object exposing ``similarity_search_by_vector(vector, k=...)``.
        top_k: Number of documents to return.

    Returns:
        Whatever ``vectorstore.similarity_search_by_vector`` returns for the
        query vector and ``k=top_k``.
    """
    # Step 1: Embed the query with whichever interface the model offers
    if hasattr(embedding_model, "embed_query"):
        query_vector = embedding_model.embed_query(query)
    else:
        query_vector = embedding_model.encode(query, convert_to_tensor=False)

    # Arrays/tensors are flattened to a list of floats for the vector store
    if hasattr(query_vector, "tolist"):
        query_vector = query_vector.tolist()

    # Step 2: Semantic search; Step 3: return the top-k relevant documents
    return vectorstore.similarity_search_by_vector(query_vector, k=top_k)

# Function to generate an answer from the LLM using retrieved documents
def generate_answer_from_documents(query, relevant_docs):
    """Ask the global ``llm`` to answer ``query`` from the retrieved documents.

    The ``page_content`` of every document is placed, one per line, in the
    context section of a fixed prompt; the model's reply is returned with
    surrounding whitespace removed.
    """
    # Gather the retrieved passages into the context block of the prompt.
    passages = []
    for doc in relevant_docs:
        passages.append(doc.page_content)
    document_content = "\n".join(passages)

    # Instruction prompt: context first, then the question, then the answer cue.
    prompt = f"""
    You are an AI assistant that answers questions based on the context below. For each answer, please provide your reasoning or explain how you arrived at the conclusion.

    Context:
    {document_content}

    Question: {query}

    Answer: Based on the context, please explain your reasoning for the answer, and then provide the final answer. If you can't find enough information to answer the question, say "I don't know."
    """

    return llm(prompt).strip()

In [None]:
# def create_documents(json_path):
#     # Load and combine all URLs' content into one block
#     source, raw_text = load_text_from_json(json_path)

#     # Deduplicate BEFORE chunking
#     # deduped_text = deduplicate_text(raw_text)

#     # Chunk the deduplicated text
#     # chunks = paragraph_chunk(raw_text)
#     chunks = semantic_chunking(raw_text)

#     # Just for debugging
#     print(f"Total Chunks from {json_path}: {len(chunks)}")


#     # Create LangChain Document objects
#     docs = []
#     for chunk in chunks:
#         if not chunk or not isinstance(chunk, str) or not chunk.strip():
#             continue
#         labels = extract_entities(chunk)
#         labels_string = ",".join(labels)
#         doc = Document(
#             page_content=chunk,
#             metadata={
#                 "source": source,
#                 "labels": labels_string
#             }
#         )
#         docs.append(doc)
#     return docs

# def extract_entities_from_query(query):
#     doc = nlp(query)
#     entities = [ent.label_ for ent in doc.ents]
#     return entities

# def embed_sentences(text):
#     """Embeds sentences using a pre-trained Sentence-BERT model."""
#     # Split the text into sentences using spaCy
#     doc = nlp(text)
#     sentences = [sent.text.strip() for sent in doc.sents]

#     # Embed each sentence
#     embeddings = embedding_model.encode(sentences, convert_to_tensor=True)
#     return sentences, embeddings

# # Function to filter documents based on query entities
# def filter_documents_by_entities(documents, query_entities):
#     relevant_documents = []

#     for doc in documents:
#         metadata = doc.metadata.get("labels", "").split(",")  # Assuming metadata contains labels as a comma-separated string
#         if any(entity in metadata for entity in query_entities):
#             relevant_documents.append(doc)

#     return relevant_documents

# # Function to generate an answer based on relevant documents
# def generate_answer_from_relevant_docs(query, relevant_docs):
#     # Access the page_content attribute of each Document object
#     documents_text = "\n".join([doc.page_content for doc in relevant_docs])  # Fixed this line

#     prompt = f"""
#     You are a helpful assistant. Please provide a detailed, formal response to the questions you are asked.
#     Below are some documents related to the question. Your answers must be based on these documents.
#     Please read them and provide the best possible answer to the question. If you don't know the answer, just say that you don't know, don't try to make up an answer.:
#     Documents:
#     {documents_text}

#     Question: {query}
#     Answer:
#     """

#     response = llm(prompt, max_length=150, num_return_sequences=1)
#     return response

In [None]:
# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# wall-clock adjustments during this multi-minute run.
start_time = time.perf_counter()

# Sample of site files used for the experiment.
specific_files = [
    'ippathways.com.json',
    'auroraarizona.com.json',
    'covenantwoods.com.json',
    'amsfulfillment.com.json',
    'starmark.com.json',
    'cariloha.com.json',
    '12stone.com.json',
    'beautifuldestinations.com.json',
    'act-on.com.json',
    '1-act.com.json',
]
all_documents = process_specific_json_files(folder_path, specific_files)

# NOTE(review): this rebinds the global `embedding_model` from the
# SentenceTransformer created in the setup cell to a LangChain embeddings
# object; helpers that call `embedding_model.encode` see this object from here on.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed every document and index it in a Chroma store under persist_directory.
vectorstore = Chroma.from_documents(
    documents=all_documents,
    embedding=embedding_model,
    persist_directory=persist_directory
    )

end_time = time.perf_counter()
processing_time = end_time - start_time
print(f"All the .json files processed in {processing_time:.2f} seconds.")

Processing ippathways.com.json (1/10)...
[!] Large chunk (1158610 chars), using length-based fallback.
Processing auroraarizona.com.json (2/10)...
Processing covenantwoods.com.json (3/10)...
Processing amsfulfillment.com.json (4/10)...
Processing starmark.com.json (5/10)...
Processing cariloha.com.json (6/10)...
Processing 12stone.com.json (7/10)...
Processing beautifuldestinations.com.json (8/10)...
Processing act-on.com.json (9/10)...
Processing 1-act.com.json (10/10)...
All the .json files processed in 276.01 seconds.


In [None]:
# Question to answer and how many chunks to pull from the index for it.
query = "Who uses Agile Methodologies to deal with Marketing in Fort Lauderdale, FL?"
top_k = 3

# Embed the question, then fetch the closest chunks from the Chroma store.
query_embedding = embedding_model.embed_query(query)
relevant_docs = vectorstore.similarity_search_by_vector(query_embedding, k=top_k)

# Let the LLM answer from the retrieved context and show the result.
answer = generate_answer_from_documents(query, relevant_docs)
print(f"Answer to the query '{query}':", answer, sep="\n")

Answer to the query 'Who uses Agile Methodologies to deal with Marketing in Fort Lauderdale, FL?':
LAB Agency Services Subscription Culture People Careers Work Trending Articles eTips


In [None]:
# # vectorstore = Chroma(collection_name="biz_web_chunks", embedding_function=embedding_model, client=chroma_client)
# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# pipe = pipeline("text2text-generation", model="t5-small")
# llm = HuggingFacePipeline(pipeline=pipe)

# qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# # Ask question
# query = "What company provides assisted living near Richmond, Virginia?"
# retrieved_docs = retriever.get_relevant_documents(query)

# # Extract entities from the query and filter documents by these entities
# entities_in_query = extract_entities_from_query(query)
# relevant_docs = filter_documents_by_entities(retrieved_docs, entities_in_query)

# # Generate the answer from relevant documents
# answer = generate_answer_from_relevant_docs(query, relevant_docs)
# print(f"Answer: {answer}")

# # answer = generate_answer_from_relevant_docs(query, relevant_docs)
# # print(f"Answer: {answer}")