# Embeddings for the Mahabharata Text

## Setup 

In [1]:
from dotenv import load_dotenv
import os
import openai
import sys
import pprint
# An empty entry on sys.path makes modules in the current working directory
# importable.
sys.path.append('')

# Shorthand pretty-printer: pp(obj) prints obj via a PrettyPrinter configured
# with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4).pprint


In [2]:
from langchain.document_loaders import PyPDFLoader
import tiktoken
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.vectorstores import Chroma

from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.chat_models import ChatOpenAI

from langchain.chains import RetrievalQA

from langchain.vectorstores.pgvector import PGVector


In [3]:
# Read settings from the local .env file first, so that the keys looked up
# below are available.
_ = load_dotenv('./.env')

# LangChain tracing configuration, set explicitly after the .env load.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_ENDPOINT": "https://api.langchain.plus",
    "LANGCHAIN_SESSION": "Ankush Project",
})

# Raises KeyError when OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

# Name of the OpenAI model used later to embed the text chunks.
text_embedding_model = "text-embedding-ada-002"

## Load the data

In [4]:
# Folder that holds the source books.
directory_path = "/Users/rahulnayak/TechWork/Python projects/mahabharata/books"

# Loader for the files in that folder matching *.txt. The path is taken from
# directory_path instead of a second copy of the literal, so the two cannot
# drift apart.
loader = DirectoryLoader(directory_path, glob="*.txt", show_progress=True)

In [5]:
# Load the matched files into a list of documents; with show_progress enabled
# on the loader, a progress bar is printed while loading.
docs = loader.load()

100%|██████████| 18/18 [00:23<00:00,  1.28s/it]


## Split Data into chunks

In [7]:
# Chunk size and overlap handed to the splitter
# (presumably measured in characters - confirm against the splitter's docs).
chunk_size, chunk_overlap = 500, 30

# Splitter later used to break the loaded documents into chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


In [8]:
# Break every loaded document into chunks with the configured splitter.
splits = splitter.split_documents(docs)
# Last expression of the cell: shows how many chunks were produced.
len(splits)

39306

### Calculate the cost of embedding the text

In [9]:
# Encoding that tiktoken associates with gpt-3.5-turbo, kept as a module-level
# name. num_tokens_from_string does not read this variable; it looks up its
# own encoding by name on each call.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def num_tokens_from_string(string, encoding_name="cl100k_base") -> int:
    """Count the tokens the named tiktoken encoding produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        Length of the token sequence obtained by encoding ``string``.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Token count of each loaded document, in the order the documents were loaded.
# NOTE(review): this counts the whole documents, whereas the chunks in
# `splits` are what get embedded; because the splitter is configured with a
# non-zero chunk_overlap, the embedded token total may be somewhat higher
# than this estimate - confirm if an exact figure matters.
tokens_l = [num_tokens_from_string(doc.page_content) for doc in docs]


In [10]:
# Price per 1,000 tokens used for the estimate
# (presumably USD for the ada embedding model - confirm against current pricing).
ada_cost_per_k_token = 0.0001

# Sum once and reuse for both the cost and the printed total.
total_tokens = sum(tokens_l)
total_ada_cost = total_tokens*ada_cost_per_k_token/1000

print("Document, number of tokens = ", tokens_l)
print("Total Tokens = ", total_tokens)
print("Total embedding cost: ", total_ada_cost)


Document, number of tokens =  [45757, 321881, 125423, 106103, 11409, 4449, 450887, 365834, 414485, 724577, 237599, 29304, 87564, 266427, 30178, 196033, 131876, 12398]
Total Tokens =  3562184
Total embedding cost:  0.35621840000000005


## Connect to the PGVector database

In [19]:
# Connection settings for the Postgres instance. Driver, host and port are
# fixed here; the database name and credentials are read from the environment
# and raise KeyError when a variable is missing.
_pg_params = {
    "driver": "psycopg2",
    "host": "localhost",
    "port": "5432",
    "database": os.environ["PGVECTOR_DATABASE"],
    "user": os.environ["PGVECTOR_USER"],
    "password": os.environ["PGVECTOR_PASSWORD"],
}

# Connection string later passed to PGVector.from_documents.
CONNECTION_STRING = PGVector.connection_string_from_db_params(**_pg_params)

## Calculate embeddings and persist them to the database

In [20]:
# Embedding client configured with the model named in text_embedding_model.
embedding = OpenAIEmbeddings(model=text_embedding_model)

In [23]:
# Passed as pre_delete_collection below. NOTE(review): presumably this drops
# any existing collection with the same name before inserting - confirm
# before running against data you want to keep.
delete_previous_data = True

# Collection name carries the chunk size. Deriving it from chunk_size (500
# yields "mh_embeddings_500") keeps the name in step with the chunks actually
# stored, instead of relying on a hand-edited literal.
COLLECTION_NAME = f"mh_embeddings_{chunk_size}"

# Embed every chunk in `splits` and store the results in the collection.
db = PGVector.from_documents(
    embedding=embedding,
    documents=splits,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=delete_previous_data,
)