In [4]:
import logging
import httpx
import openai
from pathlib import Path
from tqdm import tqdm
import weaviate
from weaviate.auth import AuthApiKey
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.data import DataObject
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import uuid
import json



In [2]:
# Configure logging

logging_dir="logs"
os.makedirs(logging_dir, exist_ok=True)
logging.basicConfig(filename=os.path.join(logging_dir,"embedding.log"), level=logging.DEBUG, format='%(asctime)s %(message)s', force=True)
logging.info("Logging is configured.")

In [3]:
try:
    # fetch all files from scraped_data_course_content
    logging.info("Fetching course files ...")
    course_folder = Path("scraped_data_course_content/")
    course_content_files = list(course_folder.rglob("*"))
    print(course_content_files)

    logging.info("Fetching discourse files ...")
    # fetch all files from scraped_data_discourse_content
    discourse_folder = Path("scraped_data_discourse_page/")
    discourse_content_files = list(discourse_folder.rglob("*"))
    print(discourse_content_files)
    logging.info("Files fetched successfully.")

except Exception as e:
    logging.error(f"Error while fetching files: {e}")



[PosixPath('scraped_data_course_content/Scraping_with_Excel.md'), PosixPath('scraped_data_course_content/LLM_Video_Screen-Scraping.md'), PosixPath('scraped_data_course_content/TDS_TA_Instructions.md'), PosixPath('scraped_data_course_content/5._Data_Preparation.md'), PosixPath('scraped_data_course_content/Local_LLMs__Ollama.md'), PosixPath('scraped_data_course_content/RAG_with_the_CLI).md'), PosixPath('scraped_data_course_content/Correlation_with_Excel.md'), PosixPath('scraped_data_course_content/Images__Compression.md'), PosixPath('scraped_data_course_content/Markdown.md'), PosixPath('scraped_data_course_content/LLM_Sentiment_Analysis.md'), PosixPath('scraped_data_course_content/Spreadsheet__Excel,_Google_Sheets.md'), PosixPath('scraped_data_course_content/Prompt_engineering.md'), PosixPath('scraped_data_course_content/Interactive_Notebooks__Marimo.md'), PosixPath('scraped_data_course_content/Scraping_PDFs_with_Tabula.md'), PosixPath('scraped_data_course_content/JavaScript_tools__npx.m

In [4]:
# function to convert the content files into chunks 
def chunk_files(file_path, chunk_size=300):
    if file_path.suffix == ".md":
        logging.info(f"chunking course file: {file_path}")
        with open(file_path, 'r') as f:
            content = f.read()

        # Split by double newlines (paragraphs)
        paragraphs = content.split('\n\n')
        
        chunks = []
        chunk = ""
        for para in paragraphs:
            if len(chunk) + len(para) < chunk_size:
                chunk += para + '\n\n'
            else:
                chunks.append(chunk.strip())
                chunk = para + '\n\n'
        if chunk:
            chunks.append(chunk.strip())
        
    elif file_path.suffix == ".json":
        logging.info(f"chunking discourse file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Convert JSON content to a string for chunking
        if isinstance(data, dict):
            json_text = json.dumps(data, indent=2)
        elif isinstance(data, list):
            json_text = "\n".join(json.dumps(item, indent=2) for item in data)
        else:
            json_text = str(data)

        # Initialize chunks list if not already
        if 'chunks' not in locals():
            chunks = []
        # Simple chunking by character length
        for i in range(0, len(json_text), chunk_size):
            chunks.append(json_text[i:i+chunk_size].strip())

    return chunks

In [5]:
# convert the course content files into chunks
try :
    logging.info("splitting course_content files into chunks")
    folder = Path("scraped_data_course_content/")
    course_chunks = []

    for file_path in course_content_files:
        chunks = chunk_files(file_path)
        for chunk in chunks:
            course_chunks.append({
                "text": chunk,
                "source": str(file_path)
            })
    print(course_chunks[0])
    logging.info(f"Total chunks created: {len(course_chunks)}")
except Exception as e:
    logging.error(f"Error in splitting course content into chunks: {e}")


{'text': '---\ntitle: "Scraping with Excel"\noriginal_url: "https://tds.s-anand.net/#/scraping-with-excel?id=scraping-with-excel"\ndownloaded_at: "2025-06-13T14:55:37.760467"\n---', 'source': 'scraped_data_course_content/Scraping_with_Excel.md'}


In [6]:
# convert the discourse content files into chunks
try :
    logging.info("splitting discourse_content files into chunks")
    folder = Path("scraped_data_discourse_content/")
    discourse_chunks = []

    for file_path in discourse_content_files:
        chunks = chunk_files(file_path)
        for chunk in chunks:
            discourse_chunks.append({
                "text": chunk,
                "source": str(file_path)
            })
    print(discourse_chunks[0])
    logging.info(f"Total chunks created: {len(discourse_chunks)}")
except Exception as e:
    logging.error(f"Error in splitting course content into chunks: {e}")


{'text': '{\n  "post_stream": {\n    "posts": [\n      {\n        "id": 593909,\n        "name": "HARISH. S",\n        "username": "HARISH.S",\n        "avatar_template": "/user_avatar/discourse.onlinedegree.iitm.ac.in/harish.s/{size}/67995_2.png",\n        "created_at": "2025-02-11T17:03:21.205Z",\n        "cooked":', 'source': 'scraped_data_discourse_page/topic_166816.json'}


In [7]:
# combine all chunks 
all_chunks = course_chunks + discourse_chunks
if not all_chunks:
    raise ValueError("No chunks to embed. Exiting.")
print(f"Total chunks to embed: {len(all_chunks)}")

Total chunks to embed: 52636


In [None]:

OPENAI_API_KEY="eyJhbGciOiJIUzI1NiJ9.eyJlbWFpbCI6IjI0ZjIwMDQ5MjJAZHMuc3R1ZHkuaWl0bS5hYy5pbiJ9.CfwJHvL6a0adtb7_Xu9in2i4Kg7BLXJTP6h3Qr_96D4"
print(OPENAI_API_KEY)
url = "https://aiproxy.sanand.workers.dev/openai/v1/embeddings"

# Function to call the embedding API
def get_embeddings(batch_texts):
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "text-embedding-3-small", 
        "input": batch_texts
    }

    response = httpx.post(url, headers=headers, json=payload, timeout=30.0)
    response.raise_for_status()
    return [item["embedding"] for item in response.json()["data"]]

eyJhbGciOiJIUzI1NiJ9.eyJlbWFpbCI6IjI0ZjIwMDQ5MjJAZHMuc3R1ZHkuaWl0bS5hYy5pbiJ9.CfwJHvL6a0adtb7_Xu9in2i4Kg7BLXJTP6h3Qr_96D4


In [9]:
# get sample embeddings 
embed = get_embeddings([all_chunks[0]["text"]])
print(embed)


[[-0.016427342, -0.023920728, 0.020143772, -0.031256743, 0.0095513435, -0.005320425, -0.0058712317, -0.005480825, -0.020845897, 0.0061375555, 0.031135686, -0.03447684, -0.040868614, -0.030239869, 0.06323982, 0.051231034, 0.014768871, -0.014635709, -0.014865716, 0.07965506, 0.05980182, -0.022843327, 0.004569876, 0.039440148, 0.016003646, -0.021584341, -0.0056412243, 0.030724093, 0.002241055, 0.0046848794, -0.05156999, -0.043798175, -0.013812526, 0.019296378, 0.018921103, 0.00028940025, 0.059317596, 0.019998504, 0.0007278511, -0.015567843, 0.012305375, -0.05079523, 0.028302968, 0.07854134, -0.004645536, -0.019526385, -0.07089058, -0.03793905, 0.05578275, 0.008116826, 0.0070575834, -0.011615354, -0.04791409, 0.014308857, -0.04830147, -0.023206497, -0.033968404, 0.07331171, -0.027842954, -0.05423323, 0.0067367842, -0.030603038, -0.011681935, 0.022116989, -0.002254674, -0.004690932, -0.058930214, -0.008183408, -0.0059317597, 0.010265576, -0.004585008, -0.015216779, 0.025397616, -0.01551942,

In [10]:
# embed all chunks
batch_size = 2000  # Change if needed
embedded_chunks = []

for i in tqdm(range(0, len(all_chunks), batch_size)):
    batch = all_chunks[i:i + batch_size]
    batch_texts = [chunk["text"] for chunk in batch]
    embeddings = get_embeddings(batch_texts)

    for j, chunk in enumerate(batch):
        embedded_chunks.append({
            "text": chunk["text"],
            "source": chunk["source"],
            "embedding": embeddings[j]
        })

100%|██████████| 27/27 [08:36<00:00, 19.13s/it]


In [25]:
print(embedded_chunks[0])

{'text': '---\ntitle: "Scraping with Excel"\noriginal_url: "https://tds.s-anand.net/#/scraping-with-excel?id=scraping-with-excel"\ndownloaded_at: "2025-06-13T14:55:37.760467"\n---', 'source': 'scraped_data_course_content/Scraping_with_Excel.md', 'embedding': [-0.016427342, -0.023920728, 0.020143772, -0.031256743, 0.0095513435, -0.005320425, -0.0058712317, -0.005480825, -0.020845897, 0.0061375555, 0.031135686, -0.03447684, -0.040868614, -0.030239869, 0.06323982, 0.051231034, 0.014768871, -0.014635709, -0.014865716, 0.07965506, 0.05980182, -0.022843327, 0.004569876, 0.039440148, 0.016003646, -0.021584341, -0.0056412243, 0.030724093, 0.002241055, 0.0046848794, -0.05156999, -0.043798175, -0.013812526, 0.019296378, 0.018921103, 0.00028940025, 0.059317596, 0.019998504, 0.0007278511, -0.015567843, 0.012305375, -0.05079523, 0.028302968, 0.07854134, -0.004645536, -0.019526385, -0.07089058, -0.03793905, 0.05578275, 0.008116826, 0.0070575834, -0.011615354, -0.04791409, 0.014308857, -0.04830147, -

In [19]:
import weaviate
import os
import weaviate
from weaviate.classes.init import Auth

WEAVIATE_URL="zu1ijfg3rlyvlghm1kmzca.c0.asia-southeast1.gcp.weaviate.cloud"
WEAVIATE_API_KEY="TVVoZjVjc0NranZVeEV2VV9UOC9ieTlUbEsxZmhOQiszd0xHczJrVW4xdkYzR28xdllRWmpaN3VRVEt3PV92MjAw"

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

print(client.is_ready()) # check the connection
if (client.is_ready()==False):
    logging.ERROR("Error in connecting")


True


In [13]:
'''import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.data import DataObject
from weaviate.classes.query import Filter
from weaviate.classes.config import Configure
import uuid

# Connect to Weaviate Cloud
WEAVIATE_URL = "zu1ijfg3rlyvlghm1kmzca.c0.asia-southeast1.gcp.weaviate.cloud"
WEAVIATE_API_KEY = "TVVoZjVjc0NranZVeEV2VV9UOC9ieTlUbEsxZmhOQiszd0xHczJrVW4xdkYzR28xdllRWmpaN3VRVEt3PV92MjAw"

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

print("Connected:", client.is_ready())

# === Step 1: Create a class (like a table) ===
class_name = "TextEmbedding"

if class_name not in client.collections.list_all():
    client.collections.create(
        name=class_name,
        vectorizer_config=Configure.Vectorizer.none(),  # you're adding your own embeddings
        vector_index_config=Configure.VectorIndex.hnsw()
    )
    print(f"Created class '{class_name}'")

# === Step 2: Example data ===
texts = [
    "Weaviate is a vector database",
    "You can store embeddings in Weaviate",
    "It supports hybrid search",
]
embeddings = [
    [0.1, 0.2, 0.3, 0.4],  # dummy 4D vector, replace with real ones
    [0.2, 0.1, 0.4, 0.3],
    [0.3, 0.4, 0.1, 0.2],
]

# === Step 3: Push data ===
collection = client.collections.get(class_name)
for i in range(len(texts)):
    collection.data.insert(
        properties={"text": texts[i]},
        vector=embeddings[i],
        uuid=uuid.uuid4()  # unique ID
    )

print("Embeddings pushed successfully.")
'''

'import weaviate\nfrom weaviate.classes.init import Auth\nfrom weaviate.classes.data import DataObject\nfrom weaviate.classes.query import Filter\nfrom weaviate.classes.config import Configure\nimport uuid\n\n# Connect to Weaviate Cloud\nWEAVIATE_URL = "zu1ijfg3rlyvlghm1kmzca.c0.asia-southeast1.gcp.weaviate.cloud"\nWEAVIATE_API_KEY = "TVVoZjVjc0NranZVeEV2VV9UOC9ieTlUbEsxZmhOQiszd0xHczJrVW4xdkYzR28xdllRWmpaN3VRVEt3PV92MjAw"\n\nclient = weaviate.connect_to_weaviate_cloud(\n    cluster_url=WEAVIATE_URL,\n    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),\n)\n\nprint("Connected:", client.is_ready())\n\n# === Step 1: Create a class (like a table) ===\nclass_name = "TextEmbedding"\n\nif class_name not in client.collections.list_all():\n    client.collections.create(\n        name=class_name,\n        vectorizer_config=Configure.Vectorizer.none(),  # you\'re adding your own embeddings\n        vector_index_config=Configure.VectorIndex.hnsw()\n    )\n    print(f"Created class \'{class_name}\

In [30]:
import weaviate
import uuid
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType


# Step 2: Create class if not exists
class_name = "TextEmbedding"

if class_name not in client.collections.list_all():
    client.collections.create(
        name=class_name,
        properties=[
            Property(name="text", data_type=DataType.TEXT),
            Property(name="source", data_type=DataType.TEXT)
        ],
        vectorizer_config=Configure.Vectorizer.none(),
        vector_index_config=Configure.VectorIndex.hnsw()
    )
    print(f"Created class '{class_name}'")

# Step 3: Get the collection
collection = client.collections.get(class_name)

# Step 4: Push in batches
batch_size = 1000
from weaviate.classes.data import DataObject
for i in range(0, len(embedded_chunks), batch_size):
    batch = embedded_chunks[i:i+batch_size]
    print(f"Pushing batch {i} to {i + len(batch)}")

    objects = []
    for chunk in batch:
        obj = DataObject(
            properties={
                "text": chunk["text"],
                "source": chunk["source"]
            },
            vector=chunk["embedding"],
            uuid=uuid.uuid4()
        )
        objects.append(obj)

    collection.data.insert_many(objects)

print("✅ All embedded chunks pushed to Weaviate.")


Created class 'TextEmbedding'
Pushing batch 0 to 1000
Pushing batch 1000 to 2000
Pushing batch 2000 to 3000
Pushing batch 3000 to 4000
Pushing batch 4000 to 5000
Pushing batch 5000 to 6000
Pushing batch 6000 to 7000
Pushing batch 7000 to 8000
Pushing batch 8000 to 9000
Pushing batch 9000 to 10000
Pushing batch 10000 to 11000
Pushing batch 11000 to 12000
Pushing batch 12000 to 13000
Pushing batch 13000 to 14000
Pushing batch 14000 to 15000
Pushing batch 15000 to 16000
Pushing batch 16000 to 17000
Pushing batch 17000 to 18000
Pushing batch 18000 to 19000
Pushing batch 19000 to 20000
Pushing batch 20000 to 21000
Pushing batch 21000 to 22000
Pushing batch 22000 to 23000
Pushing batch 23000 to 24000
Pushing batch 24000 to 25000
Pushing batch 25000 to 26000
Pushing batch 26000 to 27000
Pushing batch 27000 to 28000
Pushing batch 28000 to 29000
Pushing batch 29000 to 30000
Pushing batch 30000 to 31000
Pushing batch 31000 to 32000
Pushing batch 32000 to 33000
Pushing batch 33000 to 34000
Pushin