# Add all the dataframes to the database


## Setup


In [1]:
from dotenv import load_dotenv

load_dotenv(".env")

import os
import sys
import glob
from pathlib import Path
import pandas as pd
import numpy as np

# Make modules in the current working directory importable (an empty string
# on sys.path stands for the cwd).  Guarded so that re-running this cell does
# not keep appending duplicate entries.
if "" not in sys.path:
    sys.path.append("")

##
# import openai
# Name of the OpenAI embedding model; passed to OpenAIEmbeddings in a later cell.
text_embedding_model = "text-embedding-ada-002"
# from openai.embeddings_utils import get_embedding

##### Langchain Imports


In [2]:
# import langchain
## For generating and persisting Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

# Embedding client configured with the model named in `text_embedding_model`;
# it is handed to the PGVector stores in later cells as `embedding_function`.
embeddings = OpenAIEmbeddings(model=text_embedding_model)

##### Supabase Database Connection


In [15]:
from urllib.parse import quote

# Connection settings are read from the environment (".env" is loaded in the
# setup cell).  Indexing os.environ directly means a missing variable raises
# a KeyError naming it instead of producing a half-built URL.
DB_PASSWORD = os.environ["SUPABASE_PASSWORD"]
DB_DBUSER = os.environ["SUPABASE_DBUSER"]
DB_DATABASE = os.environ["SUPABASE_DATABASE"]
DB_HOST = os.environ["SUPABASE_HOST"]
DB_PORT = os.environ["SUPABASE_PORT"]

# Percent-encode the credentials so that characters such as "@", ":", "/",
# "#" or "%" in the user name or password cannot be mistaken for URL
# delimiters.  safe="" encodes every reserved character; values without
# special characters are left unchanged.
DB_CONN_STRING = (
    f"postgresql://{quote(DB_DBUSER, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_DATABASE}"
)

## Mahabharata Data


In [22]:
# Directory holding the pipe-separated CSV files; defaults to the cwd when
# DATA_DIRECTORY is not set.
data_directory = Path(os.environ.get("DATA_DIRECTORY", "./"))

## Tiny Tales
df_tinytales = pd.read_csv(data_directory / "tiny_tales_summaries.csv", sep="|")
print("TT Dataframe size", df_tinytales.shape)

## Kaggle Tilak dataframe
df_kaggletilak = pd.read_csv(data_directory / "kaggle_tilak_summaries.csv", sep="|")
df_kaggletilak = df_kaggletilak.replace(np.nan, "")
print("KT Dataframe size", df_kaggletilak.shape)

## KM Ganguli
# glob returns matches in arbitrary order; sort so the rows are concatenated
# in the same order on every run and platform.
file_list = sorted(glob.glob(str(data_directory / "km_ganguli_*.csv")))
if not file_list:
    # Fail with a message that names the problem instead of pandas'
    # "No objects to concatenate".
    raise FileNotFoundError(
        f"No km_ganguli_*.csv files found in {data_directory}"
    )

dataframes = []
for file in file_list:
    # print("File: ", file)
    df = pd.read_csv(file, sep="|")
    dataframes.append(df)

kmgt_dataframe = pd.concat(dataframes)
kmgt_dataframe = kmgt_dataframe.replace(np.nan, "")
print("KMGT Dataframe size", kmgt_dataframe.shape)

## Combine Dataframes
# The sources do not share all columns; concat leaves NaN where a column is
# missing, which is then replaced with an empty string.
dataframes = [kmgt_dataframe, df_kaggletilak, df_tinytales]
combined_dataframe = pd.concat(dataframes)
combined_dataframe = combined_dataframe.replace(np.nan, "")
print("Combined Dataframe size", combined_dataframe.shape)
combined_dataframe.head()

TT Dataframe size (200, 8)
KT Dataframe size (2376, 10)
KMGT Dataframe size (4641, 8)
Combined Dataframe size (7217, 14)


Unnamed: 0,book_number,section,section_name,text,para_number,book_name,num_tokens,chunk_id,chapter_name,title,commentary,section_number,source,chapter_number
0,6.0,,,The Mahabharata\n\nof\n\nKrishna-Dwaipayana Vy...,1.0,Bhishma Parva,93,cid_13a1a2c891a146f8a7744f4da9ceb5d5,,,,,,
1,6.0,SECTION I,Jamvu-khanda Nirmana Parva,"\n\nOM! HAVING BOWED down to Narayana, and Nar...",1.0,Bhishma Parva,1121,cid_49e9381b5aec4f428ef43b2f5bfff46c,,,,,,
2,6.0,SECTION I,Jamvu-khanda Nirmana Parva,"should strike another, giving notice. No one s...",2.0,Bhishma Parva,173,cid_4788eb7749de4bc8acd622c987006dc8,,,,,,
3,6.0,SECTION II,,"\nVaisampayana said,--""Seeing then the two arm...",1.0,Bhishma Parva,1115,cid_fcd1ab24b779460fa62532ea21a82485,,,,,,
4,6.0,SECTION II,,"applauded by the righteous, even that (constel...",2.0,Bhishma Parva,107,cid_1eecac1f06e6445eb74afbada0266d33,,,,,,


### Save to Vectorstore


Define a new function to save a dataframe to the vector store.
The default langchain `add_documents` function is not used because it does not accept custom ids.


In [23]:
def addDataframeToVectorStore(
    dataframe, vector_store, text_col="text", id_col="chunk_id"
):
    """Store every row of ``dataframe`` in ``vector_store`` with custom ids.

    Args:
        dataframe: pandas DataFrame holding one text chunk per row.
        vector_store: Object exposing
            ``add_texts(texts=..., ids=..., metadatas=...)``.
        text_col: Name of the column containing the text to store.
        id_col: Name of the column containing the id to use for each row.

    Returns:
        Whatever ``vector_store.add_texts`` returns, or an empty list when
        ``dataframe`` has no rows (the store is not called in that case).

    Raises:
        KeyError: If ``text_col`` or ``id_col`` is not a column of
            ``dataframe``.
    """
    # Every column except the text itself (including the id column) is
    # attached to the row as metadata.
    metadatas = dataframe.drop(columns=[text_col]).to_dict("records")

    # Hand the store plain Python lists rather than pandas Series, so it can
    # index, length-check or truth-test them without pandas semantics
    # getting in the way.
    texts = dataframe[text_col].tolist()
    ids = dataframe[id_col].tolist()

    # Nothing to store: skip the call to the vector store entirely.
    if not texts:
        return []

    return vector_store.add_texts(texts=texts, ids=ids, metadatas=metadatas)

In [28]:
## Set this flag to True to upload the selected dataframe into the
## "mahabharata" collection; set it to False to skip the upload.
## NOTE(review): despite the flag's name, nothing in this cell deletes or
## resets an existing collection -- PGVector is constructed without any such
## option here, so re-running presumably adds the rows again. Confirm.
recreate_combined_collection = True

## Which dataframe to save.
## NOTE(review): only the KM Ganguli frame is selected; `combined_dataframe`
## from the data-loading cell is not what gets uploaded -- confirm intended.
dataframe = kmgt_dataframe

if recreate_combined_collection:
    # Vector store bound to the "mahabharata" collection on the configured
    # database, embedding texts with `embeddings`.
    mahabharata_store = PGVector(
        collection_name="mahabharata",
        connection_string=DB_CONN_STRING,
        embedding_function=embeddings,
    )

    # Uses the helper's defaults: text from "text", ids from "chunk_id".
    addDataframeToVectorStore(dataframe, mahabharata_store)

## Companies Act 2013 Data

In [36]:
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Hard-coded location of the Companies Act 2013 PDF.
# NOTE(review): unlike the CSV inputs, this path does not use DATA_DIRECTORY.
filepath = './data/CompaniesAct2013.pdf'
loader = PyPDFLoader(filepath)
documents = loader.load()

# Splitter settings: chunk_size and chunk_overlap are measured with len(),
# per length_function below (presumably characters rather than tokens).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1100,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

# Despite the name, `pages` holds the splitter's output chunks, not the
# documents returned by the loader.
pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))


Number of chunks =  1154


In [38]:

# Vector store bound to the "legal_docs" collection, using the same
# connection string and embedding client as the Mahabharata store.
ca_store = PGVector(
    collection_name="legal_docs",
    connection_string=DB_CONN_STRING,
    embedding_function=embeddings,
)
# No custom ids are passed here; as the last expression in the cell, the
# call's return value is what the notebook displays as output.
ca_store.add_documents(pages)

['43d2f3ac-9113-11ee-9e9e-bedec676b352',
 '43d2f58c-9113-11ee-9e9e-bedec676b352',
 '43d2f5b4-9113-11ee-9e9e-bedec676b352',
 '43d2f5c8-9113-11ee-9e9e-bedec676b352',
 '43d2f5e6-9113-11ee-9e9e-bedec676b352',
 '43d2f5fa-9113-11ee-9e9e-bedec676b352',
 '43d2f60e-9113-11ee-9e9e-bedec676b352',
 '43d2f686-9113-11ee-9e9e-bedec676b352',
 '43d2f6a4-9113-11ee-9e9e-bedec676b352',
 '43d2f6b8-9113-11ee-9e9e-bedec676b352',
 '43d2f6cc-9113-11ee-9e9e-bedec676b352',
 '43d2f6e0-9113-11ee-9e9e-bedec676b352',
 '43d2f6f4-9113-11ee-9e9e-bedec676b352',
 '43d2f708-9113-11ee-9e9e-bedec676b352',
 '43d2f71c-9113-11ee-9e9e-bedec676b352',
 '43d2f730-9113-11ee-9e9e-bedec676b352',
 '43d2f744-9113-11ee-9e9e-bedec676b352',
 '43d2f758-9113-11ee-9e9e-bedec676b352',
 '43d2f76c-9113-11ee-9e9e-bedec676b352',
 '43d2f780-9113-11ee-9e9e-bedec676b352',
 '43d2f794-9113-11ee-9e9e-bedec676b352',
 '43d2f7a8-9113-11ee-9e9e-bedec676b352',
 '43d2f7bc-9113-11ee-9e9e-bedec676b352',
 '43d2f7d0-9113-11ee-9e9e-bedec676b352',
 '43d2f7e4-9113-

## Done

<div class="alert alert-success"><b>Success! </b> 
    All the documents are saved in the vector store
</div>
