In [1]:
!pip install langchain-community
!pip install sentence-transformers
!pip install faiss-cpu
!pip install llama-cpp-python

Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.0-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [2]:
# Mount Google Drive inside the Colab VM; the paths defined in the following
# cells all live under this mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Root folder of the project on the mounted Drive (note the trailing slash:
# later cells build paths by plain string concatenation).
working_folder = "/content/drive/MyDrive/TransformersCode/04-hotel/"

In [4]:
# Location of the quantized GGUF model file inside the working folder.
model_path = f"{working_folder}model/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"

In [5]:
# Sub-folder of the working folder that holds the data files.
data_folder = f"{working_folder}data/"

In [6]:
# Path of the saved FAISS index that is loaded further down.
db_file_name = "".join((data_folder, "Hotel_faiss_DB"))

In [7]:
from langchain_community.llms import LlamaCpp

# Load the local GGUF model through LangChain's llama.cpp wrapper.
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.01,  # close to deterministic decoding
    max_tokens=50,     # upper bound on generated tokens per answer
    top_p=0.95,
    # The wrapper's default context window is only 512 tokens. The stuffed
    # prompts in this notebook already use roughly 230-280 tokens before
    # generation, so a slightly longer retrieved chunk would overflow it.
    # The model metadata reports a context length of 8192, so 2048 is well
    # within what the model supports.
    n_ctx=2048,
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /content/drive/MyDrive/TransformersCode/04-hotel/model/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = ..
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - 

In [8]:
# Prompt skeleton for the question-answering chain. The {context} and
# {question} placeholders are substituted later; the remaining lines instruct
# the model to answer only from the supplied context.
template = "\n".join([
    "Use the following pieces of information to answer the user's question.",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
    "Context: {context}",
    "Question: {question}",
    "Only return the helpful answer below and nothing else.",
    "Helpful answer:",
    "",  # preserves the newline that follows "Helpful answer:"
])

In [9]:
# Imports used by this cell. The integrations are imported from
# langchain_community / langchain_core directly (as the LlamaCpp import
# already is) instead of the deprecated re-exports under `langchain`.
import torch
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

# Run the embedding model on the GPU when one is present, otherwise fall back
# to the CPU so the cell does not fail on a CPU-only runtime.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sentence-embedding model used to embed queries for the vector search.
# NOTE(review): presumably the same model the saved index was built with --
# confirm against the notebook that created the index.
embeddings = HuggingFaceEmbeddings(
    model_name="distiluse-base-multilingual-cased-v1",
    model_kwargs={'device': embedding_device})

# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle the
# stored index data. Pickle can execute arbitrary code, so only load index
# files that you created yourself or otherwise fully trust.
db_store = FAISS.load_local(db_file_name, embeddings, allow_dangerous_deserialization=True)

# Retrieve the 2 most similar stored chunks for each question.
retriever = db_store.as_retriever(search_kwargs={'k': 2})

# Wrap the raw template string so the chain can fill in context and question.
prompt = PromptTemplate(
    template=template,
    input_variables=['context', 'question'])

# 'stuff' places all retrieved chunks into a single prompt; the source
# documents are returned alongside the answer.
qa_llm = RetrievalQA.from_chain_type(llm=llm,
                                     chain_type='stuff',
                                     retriever=retriever,
                                     return_source_documents=True,
                                     chain_type_kwargs={'prompt': prompt})

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense%2Fconfig.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

In [10]:
def find_line_with_alpha_num(s):
    """Return the first line of ``s`` containing a letter or a digit.

    The text is split with ``str.splitlines()`` and the matching line is
    returned exactly as it appears (no stripping). If no line contains an
    alphanumeric character, an empty string is returned.
    """
    # Lazily scan the lines; stop at the first one with any alphanumeric char.
    candidates = (row for row in s.splitlines() if any(map(str.isalnum, row)))
    return next(candidates, "")

In [11]:
# English query about the hotel's location.
question = "Where is the 4Z Hotel is located?"

# Run the retrieval QA chain and pull the generated text out of the response.
output = qa_llm.invoke(question)
answer_text = output["result"]

# Show only the first line of the answer that has letters or digits.
first_line = find_line_with_alpha_num(answer_text)
print(first_line)

llama_perf_context_print:        load time =   98402.62 ms
llama_perf_context_print: prompt eval time =   98401.03 ms /   232 tokens (  424.14 ms per token,     2.36 tokens per second)
llama_perf_context_print:        eval time =    7456.92 ms /    10 runs   (  745.69 ms per token,     1.34 tokens per second)
llama_perf_context_print:       total time =  105893.71 ms /   242 tokens


The 4Z Hotel is located in Beirut.


In [12]:
# English query about the hotel's phone number.
question = "What is the phone number of the 4Z Hotel?"

# Run the retrieval QA chain and pull the generated text out of the response.
output = qa_llm.invoke(question)
answer_text = output["result"]

# Show only the first line of the answer that has letters or digits.
first_line = find_line_with_alpha_num(answer_text)
print(first_line)

Llama.generate: 47 prefix-match hit, remaining 182 prompt tokens to eval
llama_perf_context_print:        load time =   98402.62 ms
llama_perf_context_print: prompt eval time =   77781.66 ms /   182 tokens (  427.37 ms per token,     2.34 tokens per second)
llama_perf_context_print:        eval time =   33750.96 ms /    49 runs   (  688.80 ms per token,     1.45 tokens per second)
llama_perf_context_print:       total time =  111703.22 ms /   231 tokens


0096122334455


In [14]:
# Arabic query: asks whether the hotel has a swimming pool and requests an
# answer in Arabic.
question = "هل يوجد مسبح في فندق الفورزد؟ أجب بالعربية"

# Run the retrieval QA chain and pull the generated text out of the response.
output = qa_llm.invoke(question)
answer_text = output["result"]

# Show only the first line of the answer that has letters or digits.
first_line = find_line_with_alpha_num(answer_text)
print(first_line)

Llama.generate: 46 prefix-match hit, remaining 233 prompt tokens to eval
llama_perf_context_print:        load time =   98402.62 ms
llama_perf_context_print: prompt eval time =   97156.85 ms /   233 tokens (  416.98 ms per token,     2.40 tokens per second)
llama_perf_context_print:        eval time =   23109.20 ms /    32 runs   (  722.16 ms per token,     1.38 tokens per second)
llama_perf_context_print:       total time =  120375.41 ms /   265 tokens


نعم، يوجد مسبح في فندق الفورزد على شاطئ البحر. 


In [15]:
# Arabic query: asks about the hotel's room prices.
question = "ماأسعار الغرف في فندق الفورزد؟ "

# Run the retrieval QA chain and pull the generated text out of the response.
output = qa_llm.invoke(question)
answer_text = output["result"]

# Show only the first line of the answer that has letters or digits.
first_line = find_line_with_alpha_num(answer_text)
print(first_line)

Llama.generate: 245 prefix-match hit, remaining 32 prompt tokens to eval
llama_perf_context_print:        load time =   98402.62 ms
llama_perf_context_print: prompt eval time =   13524.44 ms /    32 tokens (  422.64 ms per token,     2.37 tokens per second)
llama_perf_context_print:        eval time =   16032.83 ms /    23 runs   (  697.08 ms per token,     1.43 tokens per second)
llama_perf_context_print:       total time =   29632.47 ms /    55 tokens


تتراوح أسعار الغرف بين 100 و 200 دولار في فندق الفورزد. 
