In [1]:
# !pip install -q pypdf
# !pip install -q python-dotenv
# !pip install -q llama-index
# !pip install -q gradio
# !pip install einops
# !pip install accelerate
# !pip install faiss-cpu
# !pip install faiss-gpu

In [2]:
# %pip install langchain
# !pip install sentence_transformers
# !pip uninstall -y transformers
# !pip install git+https://github.com/huggingface/transformers

In [3]:
from llama_index.llms import HuggingFaceLLM
import torch
# Load microsoft/phi-2 (model and tokenizer) through llama-index's Hugging Face wrapper.
llm = HuggingFaceLLM(
    model_name="microsoft/phi-2",
    tokenizer_name="microsoft/phi-2",
    # The model is placed on the CPU in this notebook.
    device_map="cpu",
    # torch_dtype=bfloat16 is forwarded through model_kwargs; note that this
    # setting is active here even though the device is the CPU, not CUDA.
    model_kwargs={"torch_dtype": torch.bfloat16},
    context_window=2048,
    max_new_tokens=256,
    # Sampling is switched off (do_sample=False).
    generate_kwargs={"temperature": 0.0, "do_sample": False},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from llama_index import Document
#from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline, IngestionCache
from llama_index import VectorStoreIndex

# import os
# # os.environ["OPENAI_API_KEY"] = "random"

from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index import ServiceContext
from llama_index.embeddings import LangchainEmbedding
from llama_index import set_global_service_context
from llama_index import (
    ServiceContext
)




# index = VectorStoreIndex.from_documents(
#     [Document.example()]
# )

# Loading data into a Faiss vector store
- Create a Faiss index
- Load documents and build the VectorStoreIndex
- Download and save data to the Faiss store

In [5]:
from llama_index import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from IPython.display import Markdown, display
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
import torch
import faiss

# Dimensionality of the vectors stored in the Faiss index. It has to equal the
# output size of the embedding model used to build the index: BAAI/bge-base-en
# (configured in a later cell) produces 768-dimensional vectors.
# (This is not the size of OpenAI's text-embedding-ada-002, which is 1536.)
d = 768
# Flat index that compares vectors by L2 distance.
faiss_index = faiss.IndexFlatL2(d)



In [6]:
from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline, IngestionCache

# # create the pipeline with transformations
# pipeline = IngestionPipeline(
#     transformations=[
#         SentenceSplitter(chunk_size=25, chunk_overlap=0),
#         TitleExtractor(),
#         OpenAIEmbedding(),
#     ]
# )

# # run the pipeline
# nodes = pipeline.run(documents=[Document.example()])

In [7]:
# Read the files found in ./data/ into llama-index documents.
reader = SimpleDirectoryReader("./data/")
documents = reader.load_data()

# Expose the Faiss index created earlier as a llama-index vector store.
vector_store = FaissVectorStore(faiss_index=faiss_index)

In [8]:
# Embedding model used while building the Faiss index: LangChain's BGE
# embeddings class (model BAAI/bge-base-en), adapted for llama-index.
bge_base = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en")
embed_model = LangchainEmbedding(bge_base)

  return torch._C._cuda_getDeviceCount() > 0


In [9]:
# Service context for the indexing phase: only the embedding model is supplied,
# and the LLM is explicitly disabled by passing llm=None.
service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)

LLM is explicitly disabled. Using MockLLM.


In [10]:
# Register the LLM-less service context as llama-index's global service context.
set_global_service_context(service_context)

In [11]:
# Storage context whose vector store is the Faiss-backed store created earlier.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [12]:
# Build the index from the loaded documents, using the Faiss-backed storage
# context and the embedding-only service context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    storage_context=storage_context,
)

In [13]:
# Save the index to disk. No persist_dir is passed here; the next section
# reloads from "./storage" — presumably the default location, TODO confirm.
index.storage_context.persist()

# Loading data from the Faiss store

In [14]:

# Rebuild the vector store, the storage context and the index from the files
# persisted under ./storage.
persist_dir = "./storage"
vector_store = FaissVectorStore.from_persist_dir(persist_dir)
storage_context = StorageContext.from_defaults(
    persist_dir=persist_dir,
    vector_store=vector_store,
)
index = load_index_from_storage(storage_context=storage_context)

## General QA prompt with phi-2

In [16]:
from llama_index.prompts.prompts import SimpleInputPrompt

# System prompt passed to the LLM wrapper via `system_prompt`.
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# This will wrap the default prompts that are internal to llama-index.
# phi-2 is not trained on <|USER|>/<|ASSISTANT|> markers (that is the StableLM
# chat format); its model card documents the QA format
# "Instruct: <prompt>\nOutput:", so that template is used here.
query_wrapper_prompt = SimpleInputPrompt("Instruct: {query_str}\nOutput:")

# Load microsoft/phi-2 again, this time with the system prompt and the query
# wrapper attached; this rebinds `llm`.
llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    # Sampling is switched off (do_sample=False).
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="microsoft/phi-2",
    model_name="microsoft/phi-2",
    device_map="cpu",
    # torch_dtype=bfloat16 is forwarded through model_kwargs; this setting is
    # active here even though the device is the CPU, not CUDA.
    model_kwargs={"torch_dtype": torch.bfloat16},
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
from llama_index.embeddings import HuggingFaceEmbedding

# Alternative kept for reference: llama-index's own HuggingFaceEmbedding()
# wrapper (per the original note, it loads BAAI/bge-small-en).
# embed_model = HuggingFaceEmbedding()

# Embeddings for the question-answering phase: BAAI/bge-small-en-v1.5 through
# the LangChain BGE wrapper.
# NOTE(review): this is a different model from the BAAI/bge-base-en used when
# the Faiss index was built — confirm the persisted index is never queried
# with these embeddings.
bge_small = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5")
embed_model = LangchainEmbedding(bge_small)

# Service context pairing the phi-2 LLM with the embeddings above.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

# Register it as llama-index's global service context.
set_global_service_context(service_context)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [18]:
# Build a new index over the loaded documents with the LLM-enabled service
# context; this rebinds `index`.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)
# Query engine with default settings over that index.
query_engine = index.as_query_engine()

def predict(input, history=None):
    """Answer a question with the module-level ``query_engine``.

    Args:
        input: The question text passed to ``query_engine.query``.
        history: Earlier chat turns. ``gr.ChatInterface`` calls its function
            as ``fn(message, history)``, so the parameter has to be accepted;
            it is not used, and it is optional so that ``predict(question)``
            keeps working.

    Returns:
        The query engine's response converted to ``str``.
    """
    response = query_engine.query(input)
    return str(response)


In [19]:
# Quick manual check of the query pipeline; the answer is the cell's output.
predict("what is the name of the movie?")



In [None]:

import gradio as gr

# Serve `predict` through a Gradio chat interface.
# NOTE(review): share=True asks Gradio for a publicly reachable share link —
# confirm that exposing this app outside the local machine is intended.
chat_ui = gr.ChatInterface(fn=predict)
chat_ui.launch(share=True)