In [4]:
import os
import pandas as pd
from pandas.errors import EmptyDataError
from tqdm.auto import tqdm
from langchain.text_splitter import TokenTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Build a FAISS index over the CS papers in an arXiv CSV subset:
#   1. read the CSV (tolerating empty / header-only files),
#   2. keep rows whose `categories` mention a "cs." category,
#   3. split "title\nabstract" into token chunks,
#   4. embed the chunks and save the index under ../embeddings/index.
path = "../data/arxiv_cs_subset.csv"
if not os.path.exists(path):
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise FileNotFoundError(f"{path} not found!")

# Step 1: Check if file is really empty or header-only.
# Stream the file and stop as soon as a second non-blank, non-comment line
# is seen, rather than loading every line of the CSV into memory.
content_line_count = 0
with open(path, "r", encoding="utf-8") as f:
    for ln in f:
        if ln.strip() and not ln.startswith("#"):
            content_line_count += 1
            if content_line_count > 1:
                break

if content_line_count <= 1:
    print("CSV has no data rows—skipping processing.")
else:
    try:
        # sep=None with the python engine lets pandas sniff the delimiter.
        df = pd.read_csv(path,
                         sep=None,
                         engine="python",
                         encoding="utf-8",
                         skip_blank_lines=True,
                         comment="#")
    except EmptyDataError:
        print("EmptyDataError: CSV is empty or misformatted—skipping.")
        df = pd.DataFrame()  # fallback
    if df.empty:
        print("DataFrame empty after read—nothing to process.")
    else:
        required = {"title", "abstract", "categories"}
        if not required.issubset(df.columns):
            raise KeyError(f"Missing one of {required} in CSV: got {df.columns.tolist()}")

        df = df.dropna(subset=["title", "abstract"])
        # `na=False` treats a missing `categories` value as "not CS"; without
        # it the mask contains NaN and boolean indexing raises ValueError.
        # The cast keeps `.str` usable when the column is not string-typed
        # (e.g. every value is missing).
        is_cs = df["categories"].astype(str).str.contains(r"cs\.", na=False)
        df = df[is_cs].reset_index(drop=True)
        if df.empty:
            print("Filtered DataFrame has no CS entries—nothing to process.")
        else:
            df["content"] = df["title"] + "\n" + df["abstract"]
            texts = df["content"].tolist()

            splitter = TokenTextSplitter.from_tiktoken_encoder(
                encoding_name="cl100k_base",
                chunk_size=512,
                chunk_overlap=64
            )

            # `source_index` is the row position in the filtered, re-indexed
            # DataFrame, so each chunk can be traced back to its paper.
            docs = []
            for idx, txt in enumerate(tqdm(texts, desc="Chunking docs")):
                chunks = splitter.create_documents([txt])
                for chunk in chunks:
                    chunk.metadata = {"source_index": idx}
                    docs.append(chunk)

            if not docs:
                # Nothing to embed; avoid calling save_local on a None store.
                print("No chunks produced—nothing to index.")
            else:
                embeddings = HuggingFaceEmbeddings(
                    model_name="sentence-transformers/all-MiniLM-L6-v2",
                    model_kwargs={"device": "cpu"}
                )

                # Ingest in batches so the embedding model encodes many chunks
                # per call instead of one call per chunk.
                batch_size = 256
                vectorstore = None
                with tqdm(total=len(docs), desc="Ingesting docs") as pbar:
                    for start in range(0, len(docs), batch_size):
                        batch = docs[start:start + batch_size]
                        if vectorstore is None:
                            vectorstore = FAISS.from_documents(batch, embeddings)
                        else:
                            vectorstore.add_documents(batch)
                        pbar.update(len(batch))

                os.makedirs("../embeddings", exist_ok=True)
                vectorstore.save_local("../embeddings/index")

                print("Index built and saved successfully.")


CSV has no data rows—skipping processing.
