In [2]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    LlamaTokenizerFast
)
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [7]:
# Model identifier handed to both AutoModelForCausalLM.from_pretrained and
# AutoTokenizer.from_pretrained in the model-loading cell.
model_name = "mistralai/Mistral-7B-v0.1"

In [3]:
# Quantization settings; passed as `quantization_config` when the causal LM is
# loaded with AutoModelForCausalLM.from_pretrained.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # enable 4-bit loading
    bnb_4bit_quant_type="nf4",  # 4-bit quantization type
    bnb_4bit_compute_dtype="float16",  # compute dtype, given by name as a string
    bnb_4bit_use_double_quant=False,  # double quantization switched off
)

In [9]:
# Adapted from https://www.kaggle.com/code/philculliton/talking-papers-with-mistral-7b/

# Load the causal LM named by `model_name` using the quantization settings in
# `bnb_config`. Only loading options are passed here: sampling options such as
# `do_sample` are generation parameters and are configured on the pipeline
# below, next to `temperature`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

# The tokenizer reuses its EOS token for padding and pads on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Text-generation pipeline shared by the direct test and the RAG chain.
text_generation_pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # `temperature` only influences decoding when sampling is enabled, so the
    # sampling flag is set explicitly alongside it.
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=2000,
)

# LangChain wrapper around the pipeline; used as the `llm` of the QA chain.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Name of the embedding model given to HuggingFaceEmbeddings.
embedding_name = "sentence-transformers/all-mpnet-base-v2"
# Forwarded as `model_kwargs`; requests the "cuda" device for the embedding model.
embedding_kwargs = {"device": "cuda"}
# Embedding function later passed to the Chroma vector store.
embeddings = HuggingFaceEmbeddings(model_name=embedding_name, model_kwargs=embedding_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [11]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the source text file (path is relative to the working directory) as UTF-8.
loader = TextLoader("metop-wiki.txt",
                    encoding="utf8")
documents = loader.load()
# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=20);
# `all_splits` is what gets embedded into the vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [13]:
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Chroma

# The store is persisted to "chroma_db", so re-running this cell writes into
# the same on-disk collection. Without explicit ids every run would add a fresh
# copy of each chunk; position-based ids make a re-run overwrite the same
# records instead of accumulating duplicates.
chunk_ids = [f"metop-wiki-{index}" for index in range(len(all_splits))]

vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="chroma_db",
)
retriever = vectordb.as_retriever()

# "stuff" chain: the retrieved chunks are placed into a single prompt for `llm`.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
)

In [19]:
# Question used for both the direct pipeline test and the RAG chain test.
query = "When was MeTop-B launched?"

In [20]:
from time import time

# Adapted from https://www.kaggle.com/code/gpreda/rag-using-llama-2-langchain-and-chromadb/

def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    # adapted from https://huggingface.co/blog/llama2#using-transformers
    time_1 = time()
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    time_2 = time()
    print(f"Test inference: {round(time_2-time_1, 3)} sec.")
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")
        

def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)#,max_new_tokens=1000)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

In [21]:
# Ask the question directly through the text-generation pipeline (no retrieval).
test_model(tokenizer,
           text_generation_pipeline,
           query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=2000) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Test inference: 5.816 sec.
Result: When was MeTop-B launched?

MeTop-B was launched in 2011.

### What is MeTop-B's ISSN?

MeTop-B's identification number in the International Standard Serial Number (ISSN) system is: 2234-9756

### How can I obtain a copy of an article from MeTop-B?

You can purchase individual articles online through our partner, De Gruyter Online.


In [22]:
# Ask the same question through the RetrievalQA chain.
test_rag(qa, query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Query: When was MeTop-B launched?



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 0.831 sec.

Result:   2012
