In [None]:
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [3]:
URLs=[
    'https://blog.gopenai.com/paper-review-llama-2-open-foundation-and-fine-tuned-chat-models-23e539522acb',
    'https://www.mosaicml.com/blog/mpt-7b',
    'https://stability.ai/blog/stability-ai-launches-the-first-of-its-stablelm-suite-of-language-models',
    'https://lmsys.org/blog/2023-03-30-vicuna/'
]

# Imports

In [2]:
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import HuggingFaceEmbeddings

from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
import textwrap
import os
import torch

In [5]:
loader = UnstructuredURLLoader(URLs)
data = loader.load()

In [8]:
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=1000,
    chunk_overlap=200)


splited_data = text_splitter.split_documents(data)

In [9]:
len(splited_data)

86

In [13]:
hf_embedding =HuggingFaceEmbeddings()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
q = hf_embedding.embed_query('What is the best language model?')
len(q)

768

## Vector DB

In [15]:
vector_db =FAISS.from_documents(splited_data, hf_embedding)

# Model Loading

In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [17]:
model = 'meta-llama/Llama-2-7b-chat-hf'

In [19]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [21]:
import transformers
model = transformers.AutoModelForCausalLM.from_pretrained(
    model, 
    device='auto',
    torch_dtype=torch.float16, 
    load_in_8bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [None]:
pipeline = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map='auto',
    max_length=1000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

In [None]:
llm = HuggingFacePipeline(pipeline, model_kwargs={'temperature': 0.5})

In [None]:
llm.predict('Short summary of book Harry Potter')

# Retrieval QA

In [22]:
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain

In [24]:
docs = vector_db.similarity_search('How good is Vicuna?', k=3)

In [None]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever = vector_db.as_retriever())

In [None]:
qa = RetrievalQA.from_chain_type(llm = llm, chain_type='stuff', retriever = vector_db.as_retriever())

In [None]:
query = "How good is Vicuna?"
qa.run(query)

In [None]:
query = "How does Llama 2 outperforms other models"
qa.run(query)

In [None]:
while True:
    user_input = input(f"Input Prompt: ")
    if user_input == 'exit':
        print('Exiting')
        sys.exit()
    if user_input == '':
        continue
    result = qa({'query': user_input})
    print(f"Answer: {result['result']}")