In [1]:
!pip install langchain
!pip install torch
!pip install sentence_transformers
!pip install faiss-cpu
!pip install huggingface-hub
!pip install pypdf
!pip -q install accelerate
!pip install llama-cpp-python
!pip -q install git+https://github.com/huggingface/transformers


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFDirectoryLoader

In [3]:
# Load every PDF found in the data directory; the result is a list of
# LangChain Document objects.
pdf_dir = "/content/sample_data/Data/"
loader = PyPDFDirectoryLoader(pdf_dir)
data = loader.load()

In [4]:
# Dump the loaded Document objects (page text plus source/page metadata) to
# confirm the PDFs were read.
print(data)

[Document(page_content='Booklet on\nSECURITIES\nMARKET\nUNDERSTANDING FROM\nINVESTOR’S PERSPECTIVE\nIPFT\n', metadata={'source': '/content/sample_data/Data/SEBI_Booklet_English.pdf', 'page': 0}), Document(page_content='Disclaimer:\nWhile every effort has been made to avoid errors or omission in this publication, publishers are not \nliable for any damage or loss to anyone,of any kind, in any manner from use of this material.\nPrepared Jointly by:\nPrinted by:\nSEBI\nDate: 23rd Nov, 2020\nNote for Reader:\nThis booklet is aimed at providing you in brief, basic information about the securities market. In case \nof further queries, you may visit online material at websites of SEBI, BSE, NSE, MSEI, NSDL and CDSL.\n“The information contained in this material is for only educational and awareness purposes related to \nsecurities market and shall be used for non-profitable, educational and awareness activities for \ngeneral public only.\n \nNo part of this material can be reproduced or copied

In [5]:
# Step 05: split the extracted documents into text chunks.
# NOTE(review): 10000 characters per chunk is far longer than what
# all-MiniLM-L6-v2 encodes (its model card says input is truncated at 256 word
# pieces), so most of each chunk is presumably ignored by the embedding --
# consider a smaller chunk size; confirm before changing.
chunk_size_chars = 10000
chunk_overlap_chars = 20
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size_chars,
    chunk_overlap=chunk_overlap_chars,
)

text_chunks = text_splitter.split_documents(data)


In [6]:
# Number of chunks produced by the splitter.
len(text_chunks)

42

In [7]:
# Inspect the third chunk (index 2) as a sanity check on the split.
text_chunks[2]

Document(page_content='Index Page No.\n Introduction to Securities Market\n 1 Regulatory Framework for Securities Market             02\n 2 What are Securities and Securities Market?             03\n 3 Primary Market and Secondary Market              05\n 4 Who are the Market Infrastructure Institutions and Market           07 \n  Intermediaries in the Securities Market?\n Basics of Investing\n 5 Key Risks in Investing in Securities Market              09\n  6 How to Mitigate the Risk?                10\n Account Opening Process\n 7 Pre-requisites for Investing in Securities Market             10\n 8 Procedure for opening an account: Know Your Client (KYC) Process          11\n 9 Basic Services Demat Account (BSDA)              12\n 10 Power of Attorney                 13\n 11 Nomination                  14\n How do I  Invest in Securities Market?\n 12 Investment through Primary Market               14\n 13 Process of investing in Primary Market              15\n 14 Application Support

In [8]:
# Step 06: Download the embedding model (sentence-transformers/all-MiniLM-L6-v2)
# that is used to embed the text chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [9]:
# Step 08: Embed each text chunk with `embeddings` and build a FAISS vector
# store from the results.
vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)

In [None]:
#connect to google drive
from google.colab import drive


In [None]:
# Mount Google Drive at /content/drive; the model file loaded below lives under
# /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Load the quantized Mistral-7B-Instruct GGUF model from Google Drive through
# the llama.cpp bindings.
llm = LlamaCpp(
    model_path="/content/drive/MyDrive/Model/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    n_ctx=4096,  # context window size
    # Sampling settings.
    temperature=0.75,
    top_p=1,
    # Output behaviour.
    streaming=True,
    verbose=True,
)

In [None]:
# Build the retrieval-QA chain: fetch the 2 best-matching chunks from the FAISS
# store and pass them to the LLM with the "stuff" chain type.
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# Sample question used to try out the chain.
query = "What is stock market?"

In [None]:
# Run the retrieval-QA chain once on the sample query.
qa.run(query)

In [None]:
import sys

# Interactive question/answer loop: type a question to query the chain, or
# 'exit' to stop.
while True:
  # Strip surrounding whitespace so ' exit ' is recognised and blank or
  # whitespace-only lines are not sent to the model.
  user_input = input("Input Prompt: ").strip()
  if user_input == 'exit':
    print('Exiting')
    # Leave the loop instead of calling sys.exit(): raising SystemExit inside a
    # notebook cell is reported by IPython as an exception rather than a clean
    # stop of the cell.
    break
  if user_input == '':
    continue
  # The chain returns a dict; the generated answer is under the 'result' key.
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")