In [1]:
# import necessary libraries


In [2]:
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter

# Prepare the JSON data for the vector database

In [4]:
# Relative paths to the two JSON input files.
file1 = "dataset/sus_goal_indicator_data/SGD_indicator.json"
file2 = "dataset/sus_goal_indicator_data/sus-ind-goal.json"

In [68]:
# Loader for the first input file. The jq schema '.[]' selects each element of
# the top-level JSON value; text_content=False lets those elements be
# non-string values (the page_content shown further down is a serialized object).
loader = JSONLoader(file_path=file1, jq_schema='.[]', text_content=False)

In [69]:
# Run the loader; the resulting documents are the input to the splitter below.
data = loader.load()

In [74]:
# Character-based splitter: lengths are measured with len(), the target chunk
# size is 650 and consecutive chunks share no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=0,
    chunk_size=650,
)

In [75]:
# Split the documents loaded from `file1` only.
# NOTE: `chunks` is reassigned further down by `load_and_chunk(path)`.
chunks = text_splitter.split_documents(data)

In [77]:
import os
import pandas as pd

def load_and_chunk(path, chunk_size=650, chunk_overlap=0):
    """Load every ``.json`` file directly inside ``path`` and split it into chunks.

    Each file is read with ``JSONLoader`` (jq schema ``.[]``,
    ``text_content=False``) and the loaded documents are split with a
    ``RecursiveCharacterTextSplitter``.

    Files are processed in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, which would otherwise make positions in the
    returned list (e.g. ``chunks[160]``) differ between runs or machines.

    Args:
        path: Directory containing the JSON files. Sub-directories are not
            searched and entries not ending in ``.json`` are ignored.
        chunk_size: Target chunk size passed to the splitter, measured with
            ``len``. Defaults to 650.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 0.

    Returns:
        A flat list with the chunks of all files, or an empty list when the
        directory contains no ``.json`` file.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``path`` cannot be
            listed (for example, it does not exist).
    """
    # The splitter configuration is the same for every file, so build it once
    # instead of once per loop iteration.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    chunks = []
    # sorted() pins the processing order and therefore the chunk indices.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".json"):
            continue
        loader = JSONLoader(
            file_path=os.path.join(path, filename),
            jq_schema='.[]',
            text_content=False,
        )
        chunks.extend(text_splitter.split_documents(loader.load()))

    return chunks

In [78]:
# Directory that load_and_chunk scans for .json files.
path = "dataset/sus_goal_indicator_data/"

In [79]:
# Rebuild `chunks` from every JSON file in `path`; this replaces the
# single-file result assigned to `chunks` earlier.
chunks = load_and_chunk(path)

In [84]:
# Spot-check a single chunk to see its page_content and metadata.
chunks[160]

Document(page_content='{"Indicator number": "8th", "SDG core indicator": "Air quality", "Value 2012": "-", "Value 2013": "-", "Value 2014": "-", "Value 2015": "-", "Value 2016": "-", "Value 2017": "-", "Value 2018": "-", "Definition of the indicator": "Emissions of air pollutants (sulfur dioxide, nitrogen oxides, ammonia, volatile organic compounds and particulate matter)"}', metadata={'source': '/home/jupyter-robelamare2016/experiments/FreiGPT/dataset/sus_goal_indicator_data/SGD_indicator.json', 'seq_num': 51})

# Load an embedding model and save the chunks to the vector database

In [85]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [89]:
path_db = "dataset/db"

In [86]:
# Embedding model; the same object is used when the store is built and when
# it is reopened for querying.
embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

In [90]:
# Give every chunk a deterministic id so that re-running this cell overwrites
# the entries already persisted in `path_db` instead of appending the same
# chunks again under fresh random ids.
# NOTE(review): relies on the LangChain Chroma wrapper upserting by id;
# confirm for the installed version.
# Id layout: "<source>:<seq_num>:<n>", where n counts the chunks produced from
# the same source record, so the id does not depend on file processing order.
chunk_counts = {}
chunk_ids = []
for chunk in chunks:
    record_key = (chunk.metadata.get("source"), chunk.metadata.get("seq_num"))
    position = chunk_counts.get(record_key, 0)
    chunk_counts[record_key] = position + 1
    chunk_ids.append(f"{record_key[0]}:{record_key[1]}:{position}")

# Embed the chunks and persist them under `path_db`.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    ids=chunk_ids,
    persist_directory=path_db,
)

In [91]:
# Open the store persisted under `path_db`, using the same embedding function
# that was used to build it.
db = Chroma(persist_directory=path_db, embedding_function=embedding_function)

In [92]:
# Quick retrieval check with a free-text query.
db.similarity_search("crime in 2014")

[Document(page_content='{"Sustainability Goal": "07.4 Maintaining public safety", "Sustainability Indicator": "Crimes", "Value 2012": "-", "Value 2013": 120, "Value 2014": 123, "Value 2015": 131, "Value 2016": 127, "Value 2017": 121, "Value 2018": 111, "Value 2019": "-", "Value 2020": "-", "Definition of Indicator": "Registered crimes (number per 1,000 inhabitants)", "Data Source": "SDG Core Indicator (43), Police Crime Statistics of the Federal Criminal Police Office"}', metadata={'seq_num': 55, 'source': '/home/jupyter-robelamare2016/experiments/FreiGPT/dataset/sus_goal_indicator_data/sus-ind-goal.json'}),
 Document(page_content='{"Indicator number": 43, "SDG core indicator": "crimes", "Value 2012": "-", "Value 2013": 120.1, "Value 2014": 122.8, "Value 2015": 130.5, "Value 2016": 126.8, "Value 2017": "-", "Value 2018": "-", "Definition of the indicator": "Number of registered crimes per 1,000 inhabitants"}', metadata={'seq_num': 43, 'source': '/home/jupyter-robelamare2016/experiments