In [1]:
# Import Libraries
from dotenv import load_dotenv
import os
from pymongo import MongoClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import CSVLoader

In [2]:
# Load settings from a local .env file (if present) into the process
# environment, then read the two secrets this notebook depends on.
load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail fast on missing settings. os.getenv returns None for an unset
# variable, and MongoClient(None) falls back to its default host instead of
# raising, so without this check the upload could silently target the wrong
# server.
_missing_settings = [
    name
    for name, value in (
        ("MONGODB_URI", MONGODB_URI),
        ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in .env or the shell before running this notebook."
    )

In [None]:
# Load the FAQ dataset from CSV.
# "answer" is passed as the content column and "question" as the metadata
# column; fields are comma-delimited.
loader = CSVLoader(
    csv_args=dict(delimiter=","),
    metadata_columns=["question"],
    content_columns=["answer"],
    file_path="../data/final_dataset_clean.csv",
)
documents = loader.load()

In [4]:
# Split the loaded documents into overlapping chunks.
# Sizes are measured with len(): chunk_size=500 with chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=500,
)
chunks = text_splitter.split_documents(documents)

# Report how many chunks the split produced.
print(f"Chunks dari faq menjadi {len(chunks)} sub-documents.")

Chunks dari faq menjadi 1451 sub-documents.


In [None]:
# Embedding client used to vectorize the chunks; authenticates with the
# GOOGLE_API_KEY read from the environment earlier in the notebook.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=GOOGLE_API_KEY,
    model="models/embedding-001",
)

In [6]:
# Connect to MongoDB with the configured URI and select the collection
# ("faq_tb" inside "finalproject_db") that the vector store is built on.
client = MongoClient(MONGODB_URI)
database = client["finalproject_db"]
collection = database["faq_tb"]

In [7]:
# Vector store handle over the FAQ collection, using the embedding client
# and the search index named "vector_index".
# NOTE(review): the "Vector Store Setup" cell that follows rebinds
# `vector_store`, so this assignment only matters if that cell is skipped.
vector_store = MongoDBAtlasVectorSearch(
    index_name="vector_index",
    embedding=embeddings,
    collection=collection,
)

In [8]:
# Vector Store Setup
# Embed the chunks and upload them to the collection, but only when the
# collection is still empty. from_documents inserts every chunk on each call,
# so re-running this cell (or Restart & Run All) would otherwise duplicate
# the whole dataset in MongoDB.
existing_count = collection.count_documents({})

if existing_count == 0:
    vector_store = MongoDBAtlasVectorSearch.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection=collection,
        index_name="vector_index",
    )
    print("Data berhasil diunggah ke MongoDB Atlas!")
else:
    # Data is already present: attach to it without inserting anything.
    # To force a fresh upload, empty the collection first and re-run.
    vector_store = MongoDBAtlasVectorSearch(
        collection=collection,
        embedding=embeddings,
        index_name="vector_index",
    )
    print(
        f"Koleksi sudah berisi {existing_count} dokumen; "
        "unggahan dilewati untuk menghindari duplikasi."
    )

print(f"Jumlah dokumen: {len(documents)}")
print(f"Jumlah chunks: {len(chunks)}")



Data berhasil diunggah ke MongoDB Atlas!
Jumlah dokumen: 1075
Jumlah chunks: 1451
