In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import openai
import tiktoken
import time
from typing import List



In [18]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Returns the number of tokens in a text string using the specified encoding.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def perform_semantic_chunking(text: str, max_chunk_size: int = 1000) -> List[str]:
    """
    Uses OpenAI's API to perform semantic chunking on the provided text.
    Ensures each chunk does not exceed the max_chunk_size in tokens.
    """
    prompt = (
        f"Please divide the following text into semantically coherent sections, "
        f"ensuring each section does not exceed {max_chunk_size} tokens:\n\n{text}"
    )
    max_retries = 5
    retry_delay = 1  # Start with a 1-second delay

    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_chunk_size
            )
            chunks = response.choices[0].message['content'].strip().split('\n\n')
            return chunks
        except openai.error.RateLimitError:
            print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= 2  # Exponential backoff
        except openai.error.OpenAIError as e:
            print(f"OpenAI API error: {e}")
            break
        except Exception as e:
            print(f"Unexpected error: {e}")
            break
    return []

def process_file_in_batches(input_file_path: str, output_file_path: str, batch_size: int = 10):
    """
    Processes a large JSONL file in batches, performs semantic chunking on each document,
    and writes the resulting chunks to an output file.
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
         open(output_file_path, 'w', encoding='utf-8') as outfile:
        batch = []
        for line_number, line in enumerate(infile, start=1):
            try:
                data = json.loads(line)
                batch.append(data)
                if len(batch) == batch_size:
                    process_batch(batch, outfile)
                    batch = []
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
                continue
        if batch:
            process_batch(batch, outfile)

def process_batch(batch: List[dict], outfile):
    """
    Processes each batch: performs semantic chunking on each document's text
    and writes the chunks to the output file in JSONL format.
    """
    for document in batch:
        text = document.get('text', '')
        if text:
            # Ensure the text does not exceed the model's token limit
            if num_tokens_from_string(text, 'cl100k_base') > 4096:
                print("Document exceeds token limit and will be skipped.")
                continue
            chunks = perform_semantic_chunking(text)
            for chunk in chunks:
                json.dump({"chunk": chunk}, outfile)
                outfile.write('\n')


In [19]:
input_file_path = '/content/drive/MyDrive/DATATHON2025/chunkedalr.jsonl'
output_file_path = '/content/drive/MyDrive/DATATHON2025/output_chunks.jsonl'
batch_size = 100  # Adjust based on your memory capacity and API rate limits
process_file_in_batches(input_file_path, output_file_path, batch_size)



In [17]:
# Step 1: Load grouped sentence data
import json
from tqdm import tqdm

def load_grouped_sentences(path, limit=None):
    documents = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if limit and i >= limit:
                break
            record = json.loads(line)
            documents.append(record)  # { "file": ..., "sentences": [...] }
    return documents

grouped_docs = load_grouped_sentences("/content/drive/MyDrive/DATATHON2025/chunkedalr.jsonl")
print(f"Loaded {len(grouped_docs)} documents")


Loaded 13122 documents


In [33]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain-community)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain-community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [1]:
import json
from tqdm import tqdm
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from sklearn.cluster import MiniBatchKMeans
from langchain.schema import Document
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

# Step 1: Load grouped sentence data
def load_grouped_sentences(path, limit=None):
    documents = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if limit and i >= limit:
                break
            record = json.loads(line)
            documents.append(record)  # { "file": ..., "sentences": [...] }
    return documents

grouped_docs = load_grouped_sentences("/content/drive/MyDrive/DATATHON2025/chunkedalr.jsonl")
print(f"Loaded {len(grouped_docs)} documents")

# Step 2: Set up embedding model (GPU)
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 0}
)

# Step 3: Parallelized Semantic chunking (chunk sentences first, then embed)
def process_document(doc):
    file_id = doc["file"]
    sentences = doc["sentences"]

    if len(sentences) < 2:
        return []

    # Embed the sentences first
    sentence_embeddings = embedding_model.embed_documents(sentences)

    # Chunk the sentences into clusters using embeddings
    num_clusters = max(2, len(sentences) // 5)
    kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, batch_size=100)

    # Fit and predict on sentence embeddings, not raw sentences
    labels = kmeans.fit_predict(sentence_embeddings)

    # Group sentences into clusters
    clusters = defaultdict(list)
    for label, sentence in zip(labels, sentences):
        clusters[label].append(sentence)

    # Create semantic chunks as plain text (before embedding)
    chunk_texts = []
    for cluster_id, cluster_sentences in clusters.items():
        chunk_text = " ".join(cluster_sentences)
        chunk_texts.append(chunk_text)

    # Now embed the chunks (you might want to reuse sentence embeddings here for efficiency)
    chunk_embeddings = embedding_model.embed_documents(chunk_texts)

    # Convert the chunk text and embeddings into semantic chunk objects
    semantic_chunks = []
    for i, chunk_text in enumerate(chunk_texts):
        semantic_chunks.append(
            Document(
                page_content=chunk_text,
                metadata={
                    "source": file_id,
                    "cluster": i  # Store cluster index as metadata
                },
                embeddings=chunk_embeddings[i]  # Attach the embeddings
            )
        )

    return semantic_chunks

# Using ThreadPoolExecutor for parallel processing of documents
semantic_chunks = []
with ThreadPoolExecutor(max_workers=4) as executor:
    # Create a tqdm progress bar for the document processing
    results = list(tqdm(executor.map(process_document, grouped_docs), total=len(grouped_docs), desc="Processing Documents"))

    # Flatten results and add them to semantic_chunks
    for result in results:
        semantic_chunks.extend(result)

print(f"Created {len(semantic_chunks)} semantic chunks")


Loaded 13122 documents


  embedding_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Processing Documents: 100%|██████████| 13122/13122 [1:35:17<00:00,  2.29it/s]

Created 1506599 semantic chunks





In [3]:
import json

# Step 4: Save the chunks to a local file (JSON Lines format)
output_path = "/content/semantic_chunks_2.jsonl"  # Specify the path to save the file

with open(output_path, 'w', encoding='utf-8') as f:
    for chunk in tqdm(semantic_chunks, desc="Saving chunks to file"):
        # Explicitly attach embeddings if they are stored elsewhere
        if hasattr(chunk, 'embeddings') and chunk.embeddings is not None:
            embeddings = chunk.embeddings.tolist()  # Convert embeddings to list for serialization
        else:
            embeddings = None  # If no embeddings available, set to None

        chunk_data = {
            "text": chunk.page_content,
            "metadata": chunk.metadata,
            "embeddings": embeddings  # Attach embeddings
        }
        f.write(json.dumps(chunk_data) + '\n')

print(f"Semantic chunks saved to {output_path}")


Saving chunks to file: 100%|██████████| 1506599/1506599 [00:17<00:00, 85626.58it/s]

Semantic chunks saved to /content/semantic_chunks_2.jsonl





In [2]:
!pip install spacy sentence-transformers faiss-cpu openai



Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [6]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.3.0-py2.py3-none-any.whl (236 kB)
[2K   [90m━━━━

#Main solution, but running time was too long;
More options below

In [12]:
import json

import spacy
import openai
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize spaCy NER model
nlp = spacy.load("en_core_web_trf")

# Initialize Sentence-Transformer for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')


# Function to load your JSONL file
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
    return data

# Function to extract named entities
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Function to get embedding for the content
def get_embedding(text):
    return model.encode(text)

# Function to query OpenAI with page content and a question
def query_openai(page_content, question):
    prompt = f"Here is some text:\n\n{page_content}\n\nAnswer the following question based on the text: {question}"

    response = openai.Completion.create(
        model="gpt-4",  # Or "gpt-3.5-turbo"
        prompt=prompt,
        max_tokens=150,  # You can adjust this as needed
        temperature=0.7,  # Controls creativity; 0 for deterministic, 1 for more creative
    )

    return response.choices[0].text.strip()

# Function to perform vector search in FAISS
def vector_search(query, index, k=5):
    query_embedding = get_embedding(query)
    D, I = index.search(np.array([query_embedding]), k)  # Get top k results
    return I

# Load JSONL data and process the content
file_path = '/content/semantic_chunks_2.jsonl'  # Replace with your actual file path
data = load_jsonl(file_path)

# Extract NER and embeddings for each page content
embeddings = []
page_contents = []
entities = []

for entry in data:
    page_content = entry['text']
    page_contents.append(page_content)

    # Extract named entities
    page_entities = extract_entities(page_content)
    entities.append(page_entities)

    # Get the embedding of the page content
    embedding = get_embedding(page_content)
    embeddings.append(embedding)

# Convert embeddings list to numpy array for FAISS indexing
embeddings = np.array(embeddings)

# Create the FAISS index
embedding_dimension = embeddings.shape[1]  # Get dimension of embeddings
index = faiss.IndexFlatL2(embedding_dimension)  # Using L2 distance metric
index.add(embeddings)  # Add embeddings to the index

# Function to handle the full workflow: vector search + NER + LLM query
def process_query(query):
    # Step 1: Perform vector search in FAISS to find the most relevant content
    indices = vector_search(query, index, k=3)  # Top 3 results

    # Step 2: Collect the most relevant page content and their entities
    relevant_content = []
    for idx in indices[0]:
        relevant_content.append({
            'content': page_contents[idx],
            'entities': entities[idx]
        })

    # Step 3: Pass the relevant content to OpenAI's LLM for the final answer
    answers = []
    for content in relevant_content:
        answer = query_openai(content['content'], query)
        answers.append({
            'content': content['content'],
            'answer': answer,
            'entities': content['entities']
        })

    return answers

# Example query to process
query = "What company provides assisted living near Richmond, Virginia?"
answers = process_query(query)

# Display the results
for answer in answers:
    print(f"Page Content: {answer['content']}")
    print(f"Entities: {answer['entities']}")
    print(f"Generated Answer: {answer['answer']}")
    print("-" * 50)


KeyboardInterrupt: 

In [14]:
!pip install -q langchain chromadb sentence-transformers transformers scikit-learn


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[

In [None]:

from langchain.vectorstores import Chroma
import tempfile

persist_directory = tempfile.mkdtemp()

vector_db = Chroma.from_documents(
    documents=semantic_chunks,
    embedding=embedding_model,
    persist_directory=persist_directory
)
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

pipe = pipeline("text2text-generation", model="google/flan-t5-base", device=0)
llm = HuggingFacePipeline(pipeline=pipe)
from langchain.chains import RetrievalQA

retriever = vector_db.as_retriever(search_kwargs={"k": 2})

qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)


In [None]:
query = "1. What company provides assisted living near Richmond, Virginia?"
response = qa.invoke(query)
print("Answer 1:", response['result'])

query2 = "What companies uses packaging materials in Valencia, California?"
response2 = qa.invoke(query2)
print("Answer 2:", response2['result'])


In [19]:
!python -m spacy download en_core_web_trf


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [21]:
!pip install transformers faiss-cpu sentence-transformers




In [10]:
pip install openai transformers




In [4]:
import json
import openai
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


# Initialize Hugging Face NER model
ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Initialize Sentence-Transformer model for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS index for vector search
index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())  # Use L2 distance for similarity

# Function to extract named entities from text
def extract_entities(text):
    return ner_model(text)

# Function to clean text (e.g., remove extra newlines, etc.)
def clean_text(text):
    return text.replace("\n", " ").strip()

# Function to convert text into embeddings
def embed_text(text):
    return embedding_model.encode(text)

# Function to load the JSONL file
def load_jsonl(file_path):
    with open(file_path, 'r') as infile:
        return [json.loads(line) for line in infile]

# Function to process and index data from JSONL file
def process_and_index_data(input_file):
    data = load_jsonl(input_file)  # Load data from the input JSONL file

    embeddings = []

    # Process each line (each text)
    for entry in data:
        text = entry['text']
        cleaned_text = clean_text(text)  # Clean the text

        # Extract named entities using NER
        entities = extract_entities(cleaned_text)
        entry['entities'] = entities  # Add entities to the entry

        # Generate embeddings for the text
        embedding = embed_text(cleaned_text)
        embeddings.append(embedding)  # Store embeddings for later use in FAISS index

    # Convert embeddings list to numpy array
    embeddings = np.array(embeddings).astype('float32')
    index.add(embeddings)  # Add embeddings to the FAISS index

    return data

# Function to perform vector search based on the query
def vector_search(query, k=3):
    query_embedding = embed_text(query)  # Convert query to embedding
    D, I = index.search(np.array([query_embedding]).astype('float32'), k)  # Top k results
    return I

# Function to query OpenAI with a specific question based on the retrieved text
def query_openai(text, question):
    prompt = f"Here is some text:\n\n{text}\n\nAnswer the following question based on the text: {question}"

    response = openai.Completion.create(
        model="gpt-4",  # or "gpt-3.5-turbo"
        prompt=prompt,
        max_tokens=150,
        temperature=0.7,
    )

    return response.choices[0].text.strip()

# Function to process JSONL file and output to another file
def process_jsonl(input_file, output_file, question):
    data = process_and_index_data(input_file)  # Process the data and index

    # Open the output file to write processed data
    with open(output_file, 'w') as outfile:
        for entry in data:
            # Use vector search to get top relevant documents based on the question
            indices = vector_search(question)

            # Retrieve the top relevant document(s) from the FAISS index
            best_text = data[indices[0][0]]['text']  # Get the most relevant document

            # Query OpenAI to generate a refined answer
            answer = query_openai(best_text, question)
            entry['openai_answer'] = answer  # Add OpenAI's answer to the data

            # Write the processed entry back to the output file
            outfile.write(json.dumps(entry) + "\n")

# Example usage
input_file = '/content/semantic_chunks_2-out.jsonl'  # Replace with your actual input file path
output_file = '/content/semantic_chunks_2.jsonl'  # Replace with your desired output file path
question = "What company provides assisted living near Richmond, Virginia?"

# Process the JSONL file
process_jsonl(input_file, output_file, question)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


ValueError: not enough values to unpack (expected 2, got 1)