In [1]:
# Import Libraries
import pandas as pd
import os
from langchain_ollama import OllamaEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from dotenv import load_dotenv
from langchain_core.documents import Document
from pymongo import MongoClient
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load environment variables
# Read the MongoDB connection string from the environment (optionally
# populated from a local .env file by load_dotenv).
load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")

# Fail fast: os.getenv returns None when the variable is absent, and a
# MongoClient built from None would not point at the intended deployment.
if not MONGODB_URI:
    raise RuntimeError(
        "MONGODB_URI is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )

In [3]:
# Load CSV Dataset
# The "answer" column is passed as the content column and the "question"
# column as a metadata column; fields are comma-separated.
csv_path = '../data/final_dataset_clean.csv'
loader = CSVLoader(
    file_path=csv_path,
    csv_args={"delimiter": ","},
    metadata_columns=["question"],  # question kept as metadata
    content_columns=["answer"],  # main column used for the content
)
documents = loader.load()

In [4]:
# Split Into Chunks
# Sizes are measured with len(), i.e. in characters of the chunk text.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 120

text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
chunks = text_splitter.split_documents(documents)

# Report how many sub-documents the split produced.
chunk_count = len(chunks)
print(f"Chunks dari faq menjadi {chunk_count} sub-documents.")

Chunks dari faq menjadi 1210 sub-documents.


In [5]:
# Initialize embeddings
# The embedding model is referenced by name and served through Ollama.
embedding_model_name = "mxbai-embed-large"
embeddings = OllamaEmbeddings(model=embedding_model_name)

In [6]:
# MongoDB Connection
# Open a client with the configured URI, then select the database and the
# collection that the vector store is built on.
client = MongoClient(MONGODB_URI)
database = client["finalproject_db"]
collection = database["faq_tb"]

In [7]:
# Vector Store Configuration
# Bind the collection, the embedding model and the search index name into a
# single vector store object.
vector_store_options = {
    "collection": collection,
    "embedding": embeddings,
    "index_name": "vector_index",
}
vector_store = MongoDBAtlasVectorSearch(**vector_store_options)

In [8]:
# Vector Store Setup
# Upload the chunks through the vector store configured in the previous cell
# rather than building a second, identical store with `from_documents`.
#
# The upload only runs when the collection is empty: no document ids are
# supplied, so running the insert again would store every chunk (and its
# embedding) a second time and searches would return duplicate hits.
# To rebuild the data, clear the collection first and re-run this cell.
existing_count = collection.count_documents({})

if existing_count == 0:
    vector_store.add_documents(chunks)
    print("Data berhasil diunggah ke MongoDB Atlas!")
else:
    print(
        f"Koleksi sudah berisi {existing_count} dokumen; "
        "unggahan dilewati untuk menghindari duplikasi."
    )

print(f"Jumlah dokumen: {len(documents)}")
print(f"Jumlah chunks: {len(chunks)}")

Data berhasil diunggah ke MongoDB Atlas!
Jumlah dokumen: 1075
Jumlah chunks: 1210
