In [1]:
import os
import pickle
import time
from typing import List, Tuple

import faiss
import google.generativeai as genai
from dotenv import load_dotenv
from langchain.docstore import InMemoryDocstore
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [2]:
# Notebook-wide configuration shared by the cells below.

# Input: path of the CSV file to index (a file path, despite the "_dir" name).
data_dir = "./data/Arts.csv"

# Output: directory the FAISS index is saved to.
index_dir = "./faiss_index"

# Marker appended to the end of every flattened CSV row.
separator = "..\n"

# Scratch list; nothing in the visible cells reads it.
df_list = []

In [3]:
def save_object(obj, filename):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    Args:
        obj: Any picklable Python object.
        filename: Destination path; an existing file at this path is replaced.
    """
    # Binary write mode truncates the target, so each call starts from an empty file.
    with open(filename, mode="wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Configure the Google Generative AI SDK with the key found in the environment
# (the value is None when GOOGLE_API_KEY is not set).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Embedding model used to vectorize the text chunks further down.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [5]:

# Function to load documents
def load_documents(verbose: bool = True) -> Tuple[List[str], List[dict]]:
    """Load the CSV at ``data_dir`` and flatten every loaded document to one line.

    Each document returned by ``CSVLoader`` has its newlines replaced by spaces
    and the module-level ``separator`` appended.

    Args:
        verbose: When True (the default, matching the previous behavior) every
            flattened row is printed. Pass False to avoid flooding the cell
            output on large files.

    Returns:
        A ``(texts, metadata)`` tuple of two lists of equal length.
        ``texts[i]`` is the flattened text of document ``i`` and ``metadata[i]``
        is ``{"row_index": i, "source": data_dir}``.
    """
    print("INFO: Loading documents...\n")
    csv_loader = CSVLoader(file_path=data_dir)
    pages = csv_loader.load()

    concatenated_texts = []
    concatenated_metadata = []

    for i, doc in enumerate(pages):
        # Collapse the multi-line page content into a single line and terminate
        # it with the separator.
        row_text = doc.page_content.replace("\n", " ") + separator
        concatenated_texts.append(row_text)
        concatenated_metadata.append(
            {
                "row_index": i,
                "source": data_dir,
            }
        )

        if verbose:
            print(f"Row {i}: {row_text}")

    return concatenated_texts, concatenated_metadata


In [6]:

# Function to split chunks of text
def split_chunks(texts: List[str], splitter: RecursiveCharacterTextSplitter) -> List:
    """Split every text with ``splitter`` and return all pieces as one flat list.

    Args:
        texts: Strings to split, processed in order.
        splitter: Object whose ``split_text(text)`` returns the pieces of one text.

    Returns:
        The pieces of ``texts[0]``, then those of ``texts[1]``, and so on. The
        result does not record which input text a piece came from.
    """
    return [piece for text in texts for piece in splitter.split_text(text)]


# Generate index from the chunks
def generate_index(chunks: List[str], embeddings: GoogleGenerativeAIEmbeddings, metadata: List[dict]) -> FAISS:
    """Build a FAISS vector store from text chunks.

    Args:
        chunks: Texts to embed and index.
        embeddings: Embedding model passed through to ``FAISS.from_texts``.
        metadata: Metadata dicts passed as the ``metadatas`` argument;
            presumably one per chunk, in the same order (verify against caller).

    Returns:
        The vector store returned by ``FAISS.from_texts``.
    """
    vector_index = FAISS.from_texts(chunks, embeddings, metadatas=metadata)
    return vector_index

In [7]:
# Main code: load the CSV rows, chunk them, embed them and persist the FAISS index.
print("INFO: Generating Index...\n")
start = time.time()

# Load documents and get one text plus one metadata dict per row
concatenated_texts, concatenated_metadata = load_documents()

# Split the data into chunks row-wise
chunk_size = 512
chunk_overlap = 64
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Build the metadata list alongside the chunk list so that every chunk carries
# the metadata of the row it was cut from. Slicing the row metadata to
# len(chunks) is only correct when each row yields exactly one chunk; a row
# longer than chunk_size would otherwise shift all later chunks onto the wrong
# row and leave the metadata list shorter than the chunk list.
chunks = []
metadata_chunks = []
for row_text, row_metadata in zip(concatenated_texts, concatenated_metadata):
    row_chunks = split_chunks([row_text], splitter)
    chunks.extend(row_chunks)
    # Copy the dict so chunks of the same row do not share one mutable object.
    metadata_chunks.extend(dict(row_metadata) for _ in row_chunks)

# Generate FAISS index with the chunks and corresponding metadata
vectorstore = generate_index(chunks, embeddings, metadata_chunks)

# Save the vectorstore
vectorstore.save_local(index_dir)

end = time.time()
emb_time = round((end - start), 0)
print("INFO: Index generated.\n")
print("\nEmbedding time : " + str(emb_time) + " sec")


INFO: Generating Index...

INFO: Loading documents...

Row 0: Stream: Arts Course: BA in English Job: Political Analyst Skills Required: Artistic Talent, Fashion Design Techniques Duration: 5 years Post Graduation: MFA..

Row 1: Stream: Arts Course: BA in English Job: Content Writer Skills Required: Artistic Talent, Fashion Design Techniques Duration: 5 years Post Graduation: Fashion Design PG..

Row 2: Stream: Arts Course: BA in History Job: Sociologist Skills Required: Critical Thinking, Understanding of History and Culture Duration: 5 years Post Graduation: Journalism PG..

Row 3: Stream: Arts Course: Fashion Design Job: Historian Skills Required: Writing Skills, Ability to Craft Arguments Duration: 5 years Post Graduation: MA in Psychology..

Row 4: Stream: Arts Course: Fine Arts Job: Political Analyst Skills Required: Communication, Public Speaking Duration: 2 years Post Graduation: MA in Psychology..

Row 5: Stream: Arts Course: Fine Arts Job: Political Analyst Skills Required: W

In [8]:
# Sanity check on the lists produced by the indexing cell above.
# `metadata_chunks` is deliberately not recomputed here: re-slicing the per-row
# metadata would overwrite the list that was actually used to build the index.
assert len(metadata_chunks) == len(chunks), "each chunk needs exactly one metadata entry"
print(len(metadata_chunks))

1000
