In [1]:
import torch

from dotenv import load_dotenv
import logging

import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings


# from llama_index import GPTChromaIndex, LLMPredictor, LangchainEmbedding, ServiceContext, PromptHelper, TrafilaturaWebReader, SimpleDirectoryReader
from llama_index import GPTVectorStoreIndex, LLMPredictor, LangchainEmbedding, ServiceContext, PromptHelper, load_index_from_storage
from llama_index.prompts.prompts import QuestionAnswerPrompt
from llama_index import download_loader

from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, StorageContext

from llama_index.vector_stores import ChromaVectorStore
from llama_index.readers.chroma import ChromaReader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI




Could not import azure.core python package.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:

CHROMA_COLLECTION_NAME = "demo-collection"

In [3]:
load_dotenv()

True

In [4]:
torch.cuda.empty_cache()

In [5]:
# Check if GPU is present
torch.cuda.is_available()

True

In [6]:
# Text template for question answering over provided context.
# It exposes two placeholders, {context_str} and {query_str}, and tells the
# model to answer from the supplied context only ("no prior knowledge").
# NOTE(review): presumably llama_index fills {context_str} with the retrieved
# chunk text and {query_str} with the user's question - confirm against the
# QuestionAnswerPrompt documentation.
QUESTION_ANSWER_PROMPT_TMPL = (
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this context information and no prior knowledge, please answer the question: {query_str}\n"
)

In [7]:
QUESTION_ANSWER_PROMPT = QuestionAnswerPrompt(QUESTION_ANSWER_PROMPT_TMPL)

In [8]:
EMBEDDINGS_MODEL_NAME = "all-MiniLM-L6-v2"

In [9]:
embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME))

In [10]:
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDINGS_MODEL_NAME)

## Set LLM Predictor

In [11]:
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))

In [12]:
# define prompt helper
# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 2048
# set maximum chunk overlap
max_chunk_overlap = 20

## Set Custom LLM Predictor

In [13]:
from transformers import pipeline
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any

In [14]:
model_name = "lmsys/fastchat-t5-3b-v1.0"

In [15]:
# model_pipeline = pipeline(model=model_name, 
#                          load_in_8bit=False,
#                          max_length = 512, temperature = 0,
#                          device_map="auto")

In [16]:
# Reference - https://gpt-index.readthedocs.io/en/latest/how_to/customization/custom_llms.html

class CustomLLM(LLM):
    """LangChain ``LLM`` wrapper that answers prompts with a local pipeline.

    The wrapper delegates to the module-level ``model_pipeline`` object, which
    must exist before ``_call`` is invoked. In this notebook the cell that
    builds it is currently commented out, so calling this class as-is raises
    ``NameError`` until that cell is restored.
    """

    # Identifier of the underlying model; taken from the notebook-level
    # ``model_name`` variable at class-definition time.
    model_name = model_name

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Run the pipeline on ``prompt`` and return the generated text.

        Args:
            prompt: Full prompt text passed to the pipeline.
            stop: Accepted for interface compatibility; not applied here.

        Returns:
            The generated text as a plain string (empty string if the
            pipeline produced no output).
        """
        outputs = model_pipeline(prompt)

        # The declared return type is ``str``, but Hugging Face generation
        # pipelines return a list of dicts such as
        # ``[{"generated_text": "..."}]``. Unwrap that structure instead of
        # handing the raw list back to the caller.
        if isinstance(outputs, str):
            return outputs
        if not outputs:
            return ""
        first = outputs[0] if isinstance(outputs, (list, tuple)) else outputs
        if isinstance(first, dict):
            return str(first.get("generated_text", ""))
        return str(first)

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Return the parameters that identify this LLM instance."""
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        """Return the LLM type tag."""
        return "custom"

In [17]:

# llm_predictor = LLMPredictor(llm=CustomLLM())

### Service Context

In [18]:
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

In [19]:
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embed_model, chunk_size_limit=512, 
                                               prompt_helper=prompt_helper)


### Generate index - collection of PDFs

In [20]:
from pathlib import Path

In [21]:
# https://github.com/timonmat/ChatObsidian/blob/main/utils/chroma.py#L66

In [22]:
# NOTE(review): the two persist folders are absolute, machine-specific paths;
# the notebook will not run unchanged on another machine.

# Directory handed to Chroma as `persist_directory` (see create_chroma_client).
EMBEDDING_PERSIST_FOLDER = "/home/snexus/software/chroma/chromadb/"
# Directory the llama_index storage context is persisted to and loaded from.
INDEX_PERSIST_FOLDER = "/home/snexus/software/chroma/"
# Folder that is scanned for *.pdf files to ingest.
FOLDER_PATH = '../sample_data/ato'
# Name of the Chroma collection used for this document set.
COLLECTION_NAME = "sample-ato-documents"

In [23]:
def create_chroma_client():
    """Return a Chroma client that persists to ``EMBEDDING_PERSIST_FOLDER``.

    The client uses the ``duckdb+parquet`` implementation with anonymized
    telemetry switched off.
    """
    client_settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=EMBEDDING_PERSIST_FOLDER,
        anonymized_telemetry=False,
    )
    return chromadb.Client(client_settings)

def get_chroma_collection(collection_name, embedding_function):
    """Look up an existing Chroma collection by name.

    Args:
        collection_name: Name of the collection to fetch.
        embedding_function: Embedding function forwarded to Chroma.

    Returns:
        The collection, or ``None`` if the lookup raised; the failure is
        logged at error level rather than propagated.
    """
    chroma = create_chroma_client()
    try:
        found = chroma.get_collection(collection_name, embedding_function=embedding_function)
    except Exception as e:
        logging.error(f"Failed to get collection '{collection_name}': {e}")
        found = None
    return found

In [24]:
def get_collection_index_path(index_persist_folder: str, collection_name: str) -> str:
    """Return the path of the index file for ``collection_name``.

    The file is named ``collection-<collection_name>.json`` and is placed
    inside ``index_persist_folder``.

    Args:
        index_persist_folder: Folder holding persisted indexes, with or
            without a trailing path separator.
        collection_name: Name of the collection the index belongs to.

    Returns:
        The joined path as a string.
    """
    # Local import: `os` is not imported at notebook level.
    import os

    # os.path.join inserts a separator only when one is missing, so a folder
    # given without a trailing slash no longer fuses with the file name.
    return os.path.join(index_persist_folder, f"collection-{collection_name}.json")

def load_chroma_index(index_persist_folder: str, collection_name: str,
                      embedding_function=None, service_context=None):
    """Load a persisted vector index backed by a Chroma collection.

    Args:
        index_persist_folder: Folder the index storage was persisted to.
        collection_name: Name of the Chroma collection holding the vectors.
        embedding_function: Embedding function used to open the collection.
            Defaults to the notebook-level ``sentence_transformer_ef``.
        service_context: Optional service context forwarded to
            ``load_index_from_storage``; omitted from the call when ``None``.

    Returns:
        The loaded index, or ``None`` when the index file is missing or the
        Chroma collection could not be opened.
    """
    collection_index_path = get_collection_index_path(index_persist_folder, collection_name)

    # NOTE(review): this gate looks for `collection-<name>.json`. Confirm that
    # whatever persists the index actually writes that file; otherwise this
    # function always returns None.
    if not Path(collection_index_path).exists():
        return None

    if embedding_function is None:
        embedding_function = sentence_transformer_ef

    # get_chroma_collection requires the embedding function and returns None
    # (after logging) when the lookup fails, so both cases are handled here.
    _chroma_collection = get_chroma_collection(collection_name, embedding_function=embedding_function)
    if _chroma_collection is None:
        return None

    # persist_dir makes the storage context read the persisted docstore and
    # index store; without it the context starts empty and there is no index
    # to load.
    storage_context = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=_chroma_collection),
        persist_dir=index_persist_folder,
    )

    load_kwargs = {} if service_context is None else {"service_context": service_context}
    index = load_index_from_storage(storage_context, **load_kwargs)
    logging.info('Index loaded for collection ' + collection_name)
    return index

In [25]:
# reader = ChromaReader(
#     collection_name=COLLECTION_NAME,
#     persist_directory=INDEX_PERSIST_FOLDER
# )

In [26]:
#collection_index_path = get_collection_index_path(INDEX_PERSIST_FOLDER, COLLECTION_NAME)

In [27]:
# collection_index_path

In [28]:
client = create_chroma_client()

Using embedded DuckDB with persistence: data will be stored in: /home/snexus/software/chroma/chromadb/


In [29]:
client.list_collections()

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


[Collection(name=sample-ato-documents)]

In [30]:
collection = client.get_collection(COLLECTION_NAME, embedding_function=sentence_transformer_ef)

In [31]:
# collection.peek()

In [32]:
_chroma_collection = get_chroma_collection(collection_name = COLLECTION_NAME, embedding_function=sentence_transformer_ef)

Using embedded DuckDB with persistence: data will be stored in: /home/snexus/software/chroma/chromadb/


In [33]:
vector_store = ChromaVectorStore(chroma_collection=_chroma_collection)

In [34]:
# StorageContext(vector_store=ChromaVectorStore(chroma_collection=_chroma_collection), )

In [35]:
storage_context = StorageContext.from_defaults(
                           vector_store = ChromaVectorStore(chroma_collection=_chroma_collection)
                           
                           )


In [36]:
storage_context = StorageContext.from_defaults(
                          vector_store = ChromaVectorStore(chroma_collection=_chroma_collection), persist_dir=INDEX_PERSIST_FOLDER
                           )

In [37]:
index =load_index_from_storage(storage_context, index_id="vector_index", service_context = service_context)

In [38]:
index

<llama_index.indices.vector_store.base.GPTVectorStoreIndex at 0x7f6bf9174b50>

In [41]:


# FOLDER_PATH = '../sample_data/standards'
# COLLENTION_NAME = "sample-standards"

In [25]:
# Ingest every PDF under FOLDER_PATH: add the extracted text to the Chroma
# collection, build (or extend) a vector index on top of that collection, then
# persist the index storage and the Chroma data.
chroma_client = create_chroma_client()
chroma_collection = chroma_client.get_or_create_collection(COLLECTION_NAME, embedding_function=sentence_transformer_ef)

PDFReader = download_loader("PDFReader")
# Scan file folder for .pdf
root = Path(FOLDER_PATH)
paths = list(root.glob("*.pdf"))
print(paths)

if not paths:
    # Without this guard `index` stays None and the persist step below fails
    # with an unhelpful AttributeError.
    raise FileNotFoundError(f"No .pdf files found in '{root}'")

index = None

for path in paths:
    loader = PDFReader()
    print(f"Loading {path}")
    documents = loader.load_data(file=path)
    if not documents:
        print(f"No documents extracted from {path}, skipping.")
        continue

    ids = [f"id_{path.name}_{i}" for i in range(len(documents))]
    print(ids)
    # One text per id: the loader may return several documents for one file,
    # so pass all of them rather than only the first.
    # NOTE(review): the index below also writes into this same collection via
    # ChromaVectorStore - confirm that storing the raw texts here as well is
    # intended.
    chroma_collection.add(documents=[document.text for document in documents], ids=ids)

    if index is None:
        print("Creating a new index.")
        ingest_storage_context = StorageContext.from_defaults(
            vector_store=ChromaVectorStore(chroma_collection=chroma_collection)
        )
        index = GPTVectorStoreIndex.from_documents(
            documents=documents,
            service_context=service_context,
            storage_context=ingest_storage_context,
        )
    else:
        print("Indesring into existing index")
        # Insert every document of the file, not just documents[0].
        for document in documents:
            index.insert(document)

if index is None:
    raise RuntimeError(f"None of the PDF files in '{root}' produced any documents")

index.set_index_id("vector_index")
index.storage_context.persist(INDEX_PERSIST_FOLDER)
chroma_client.persist()

Using embedded DuckDB with persistence: data will be stored in: /home/snexus/software/chroma/chromadb/


[PosixPath('../sample_data/ato/ato-capital-gain-tax.pdf'), PosixPath('../sample_data/ato/ato-dividends.pdf')]
Loading ../sample_data/ato/ato-capital-gain-tax.pdf
['id_ato-capital-gain-tax.pdf_0']
Creating a new index.
Loading ../sample_data/ato/ato-dividends.pdf
['id_ato-dividends.pdf_0']
Indesring into existing index


True

In [80]:
# p = loader.load_data(file=path)

In [42]:
# p[0].text

### Example questions - ATO

In [39]:
query_engine = index.as_query_engine(mode="embedding",similarity_top_k=1,response_mode = "compact",  verbose=True)

# text_qa_template=QUESTION_ANSWER_PROMPT, 
#                 mode="embedding",
#                 similarity_top_k=1, 
#                 response_mode = "compact", # default, compact, tree_summarize, no_text
#                 verbose=True

In [40]:
r = query_engine.query("Who is eligible for CGT discount?")

In [41]:
print(r)

Australian trusts and complying super funds that have owned an asset for at least 12 months are eligible for CGT discount. Additionally, individuals who are Australian residents for tax purposes and provide affordable rental housing may be eligible for an additional CGT discount of up to 10%. However, foreign or temporary residents, those who dispose of certain shares or trust interests in non-widely held companies and trusts, and those who convert an income asset into a capital asset for the purposes of claiming the CGT discount may not be eligible for the full discount. The CGT discount is also not available for a CGT event that creates a new asset and a capital gain. Companies cannot use the CGT discount.


In [49]:
r = query_engine.query("What types of investements are eligible for CGT discount?")

In [50]:
print(r)

Assets owned for at least 12 months by Australian trusts and complying super funds are eligible for CGT discount. There is also an additional CGT discount of up to 10% for individuals who are Australian residents for tax purposes and provide affordable rental housing. However, the discount cannot be used for capital gains made by foreign or temporary residents after 8 May 2012, for a CGT event that creates a new asset, for disposal of interest in a non-widely held entity, or if an income asset is converted into a capital asset for the purposes of claiming the CGT discount. Companies cannot use the CGT discount.


In [51]:
r = query_engine.query("Can I get CGT discount if I sell shares 11 month after buying them?")

In [52]:
print(r)

No, you cannot get the CGT discount if you sell shares 11 months after buying them. The asset must be owned for at least 12 months before the CGT event happens to qualify for the discount.


In [53]:
print(r.source_nodes[0].node.text)

the CGT discount works
When you sell or otherwise dispose of an asset, you can reduce your capital gain
by 50%, if both of the following apply:
you owned the asset for at least 12 months
you are an Australian resident for tax purposes.
This is called the capital gains tax (CGT) discount.
12-month ownership requirement
For an asset to qualify for the CGT discount you must own it for at least 12 months
before the 'CGT event' happens. The CGT event is the point at which you make a
capital gain or loss. You exclude the day of acquisition and the day of the CGT
event when working out if you owned the CGT asset for at least 12 months before
the 'CGT event’ happens. 
If you sell the asset and there is no contract of sale
, the CGT event happens at
the time of sale.
31 of 253
If there is a contract to sell the asset
, the CGT event happens on the date of
the contract, not when you settle. Property sales usually work this way.
If the asset is lost or destroyed
, the CGT event happens when:    


In [54]:
r = query_engine.query("Can I get CGT discount for dividends?")

In [55]:
print(r)

The context information does not provide any information about CGT discount for dividends.


In [56]:
result = query_engine.query("What is a difference between franked and unfranked dividends?")

In [57]:
print(result)

A difference between franked and unfranked dividends is that franked dividends have a franking credit attached to them, while unfranked dividends do not.


In [59]:
result = query_engine.query("What is a difference between dividends and distributions?")

In [60]:
print(result)


The context information does not provide a clear difference between dividends and distributions. The terms are used interchangeably throughout the text.


In [62]:
result = query_engine.query("WHat is exempt from capital gain tax?")

In [63]:
print(result)

Assets acquired before 20 September 1985 and the main residence (home) are exempt from capital gain tax.


In [64]:
print(result.source_nodes[0].node.text)

such as
property.
You report capital gains and capital losses in your income tax return and pay tax on
your capital gains. Although it is referred to as 'capital gains tax,' it is part of your
income tax. It is not a separate tax.
If you have a capital gain, it will increase the tax you need to pay. You may want to
work out how much tax you will owe and set aside funds to cover it.
Example: calculating CGT
Maree buys some shares for $5,000.
She owns the shares for 6 months and sells them for $5,500. She has no
other capital gains or losses.
Maree declares a capital gain of $500 in her tax return. She will pay tax on
2 of 253
this gain at her individual income tax rate.
List of CGT assets and exemptions
https://www.ato.gov.au/Individuals/Capital-gains-tax/List-of-CGT-assets-and-
exemptions/
Last modified: 01 Jul 2022
QC 66014
Check if your assets are subject to CGT, exempt, or pre-date CGT.
On this page
Assets acquired before 20 September 1985
Real estate
Your main residence (your home)