In [1]:
!pip install -q torch langchain bitsandbytes accelerate transformers sentence-transformers faiss-gpu

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 11.0.0 which is incompatible.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.12.1 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.12.1 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7

In [2]:
import os
import torch
import transformers

from glob import glob
from tqdm.notebook import tqdm
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,pipeline
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.schema.output_parser import StrOutputParser

import warnings
warnings.simplefilter('ignore')



In [3]:
# Checking if GPU is available
if torch.cuda.is_available():
    print("GPU is available.")
    print('Using GPU: ', torch.cuda.get_device_name(0))
    print('Memory Usage: ')
    print('Allocated: ', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached: ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

else:
    print("GPU is not available.")

GPU is available.
Using GPU:  Tesla P100-PCIE-16GB
Memory Usage: 
Allocated:  0.0 GB
Cached:  0.0 GB


### Setting bitsandbytes config to improve speed

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

### Loading a Quantized Mistral-7B Model

In [5]:
model_id = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"
model = AutoModelForCausalLM.from_pretrained(model_id,quantization_config=bnb_config,do_sample=True,device_map = "auto")

tokenizer = tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Testing model response without RAG

In [6]:
device = "cuda" # the device to load the model onto

prompt = "Tell me about Transformers."

model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

generated_ids = model.generate(**model_inputs, max_new_tokens=300, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)[0]

print(decoded.replace('\\n', '\n'))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> Tell me about Transformers.

Let’s start here. What do you know about the Transformers?

The Transformers movie.

Transformers. Optimus Prime?

Yeah. And Megatron. Did you like the movie?

It was ok. Not that great. It was really long, so it was a little hard to concentrate. Did you like the movie?

I was really into the Transformers when I was little. There was a Transformers toy I wanted real bad. I saw it at my cousin Steve’s house and I told my parents that I absolutely had to have it. I kept looking at this toy, but I also knew that it must have cost $100 because it was so cool. Then I found out that my dad did not have $100 to give. That was too bad.

When did you see your first Transformers movie or cartoons?

I saw the first 3 Transformers movie in the 80s when I was little. I was hooked on Transformers, I guess because I liked to act the role play on cartoon characters in front of TV. I made myself into this transformer hero which fight with another hero for saving world. 

### Loading supplementary data

In [7]:
paper_paths = glob("/kaggle/input/100-llm-papers-to-explore/*.pdf")
pages = []

# Initialize the progress bar
progress_bar = tqdm(total=len(paper_paths), desc="Processing PDFs")

for path in paper_paths:
    try:
        loader = PyPDFLoader(path)
        doc = loader.load()
        
        # Chunk text
        text_splitter = CharacterTextSplitter(chunk_size=500, 
                                              chunk_overlap=0)
        chunked_documents = text_splitter.split_documents(doc)
        
        pages.extend(chunked_documents)
    except Exception as e:
        print(f'Skipping {path} due to error: {e}')
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.close()

Processing PDFs:   0%|          | 0/100 [00:00<?, ?it/s]

### Creating a RAG Using LangChain and FAISS

In [8]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300
)

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [9]:
prompt_template = """
Instruction: Answer the question based on the following context:
{context}

Question:
{question} 
 """

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [10]:
# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [11]:
# Load chunked documents into the FAISS index
db = FAISS.from_documents(
    pages,
    HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Creating a RAG Chain

In [12]:
# Connect query to FAISS index using a retriever
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)

In [13]:
rag_chain = ( 
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

### Testing model response with RAG

In [14]:
query = "Tell me about Transformers."
response = rag_chain.invoke(query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [15]:
print ("Question:", response["question"])
print (response["text"].replace('\\n', '\n'))

Question: Tell me about Transformers.

Instruction: Answer the question based on the following context:
[Document(page_content='[56]Sainbayar Sukhbaatar, Edouard Grave, Piotr Bojanowski, and Armand Joulin. Adaptive attention span
in transformers. In Proceedings of the Annual Meeting of the Association for Computational Linguistics ,
2019.
[57]Yi Tay, Mostafa Dehghani, Samira Abnar, Yikang Shen, Dara Bahri, Philip Pham, Jinfeng Rao, Liu
Yang, Sebastian Ruder, and Donald Metzler. Long range arena: A benchmark for eﬃcient transformers.
arXiv preprint arXiv:2011.04006 , 2020.
[58]Yi Tay, Mostafa Dehghani, Dara Bahri, and Donald Metzler. Eﬃcient transformers: A survey. arXiv
preprint arXiv:2009.06732 , 2020.
[59]Richard Taylor. Interpretation of the correlation coeﬃcient: a basic review. Journal of diagnostic medical
sonography , 6(1):35–39, 1990.
[60]Reginald P Tewarson and Reginald P Tewarson. Sparse matrices , volume 69. Academic Press New York,
1973.
[61]Anna Thomas, Albert Gu, Tri Dao,