In [1]:
# %pip install --upgrade --quiet  lark chromadb

Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
# from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

In [6]:
# Loaading data
from langchain_community.document_loaders.csv_loader import CSVLoader
loader = CSVLoader('data/FB_data.csv')
documents = loader.load()
print(f'Length of document - {len(documents)}') # Number of documents = No of rows in the csv file
print(documents[0].page_content)

Length of document - 253
Date: 6/20/2019
Open: 190.949997
High: 191.160004
Low: 187.639999
Close: 189.529999
Adj Close: 189.529999
Volume: 14635700


In [13]:
## splitting the document into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

recursive_char_text_splitter=RecursiveCharacterTextSplitter(
                                                chunk_size=500,
                                                chunk_overlap=50)
split_documents=recursive_char_text_splitter.split_documents(documents)
print('documents type - ', type(split_documents))
print('documents length - ', len(split_documents))
print(split_documents[0])

documents type -  <class 'list'>
documents length -  253
page_content='Date: 6/20/2019\nOpen: 190.949997\nHigh: 191.160004\nLow: 187.639999\nClose: 189.529999\nAdj Close: 189.529999\nVolume: 14635700' metadata={'source': 'data/FB_data.csv', 'row': 0}


In [12]:
## Create embeddings for each document

embeddings=HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-MiniLM-L6-v2', 
            model_kwargs={'device':'cpu'}
    )

  from .autonotebook import tqdm as notebook_tqdm


In [62]:
vectorstore = Chroma.from_documents(split_documents, embeddings) # <langchain_community.vectorstores.chroma.Chroma object at 0x0000022559FBBCD0>


# from langchain.vectorstores import FAISS
# vectorstore =FAISS.from_documents(split_documents, embeddings) # <langchain.vectorstores.faiss.FAISS object at 0x0000022559FBC250>


In [53]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

metadata_field_info = [
    AttributeInfo(
        name="source",
        description="the name of the file containing the data",
        type="string",
    ),
    AttributeInfo(
        name="row",
        description="The row number the data corresponding to",
        type="integer",
    )
]

In [54]:

import os
from langchain import HuggingFaceHub
from dotenv import load_dotenv

load_dotenv()

HF_API_KEY = os.getenv('HUGGINGFACE_API_KEY')
MODEL_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceHub(
            repo_id=MODEL_REPO_ID, 
            huggingfacehub_api_token=HF_API_KEY,
            model_kwargs={'max_new_tokens':512,
                    'temperature':0.01
            }
        )

print(llm)



[1mHuggingFaceHub[0m
Params: {'repo_id': 'mistralai/Mistral-7B-Instruct-v0.2', 'task': None, 'model_kwargs': {'max_new_tokens': 512, 'temperature': 0.01}}


In [63]:
document_content_description = "facebook dataset"

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True
)

ValueError: Self query retriever with Vector Store type <class 'langchain_community.vectorstores.chroma.Chroma'> not supported.