In [6]:

# Run Ollama service in the background using threading
import threading
import subprocess
import time

def run_ollama_serve():
    subprocess.Popen(["ollama", "serve"])

# Start Ollama in a separate thread
thread = threading.Thread(target=run_ollama_serve)
thread.start()

# Wait for the service to initialize
time.sleep(5)


Error: listen tcp 127.0.0.1:11434: bind: address already in use


In [7]:
# !ollama pull hf.co/CompendiumLabs/bge-base-en-v1.5-gguf
# !ollama pull hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF
# !ollama pull deepseek-r1:1.5b

In [None]:
# For the smallest version (best for limited hardware)

# # For the default 7B model
# ollama pull deepseek-r1:7b

# # For other available sizes
# ollama pull deepseek-r1:8b
# ollama pull deepseek-r1:14b
# ollama pull deepseek-r1:32b
# ollama pull deepseek-r1:70b
# ollama pull deepseek-r1:671b


In [None]:
dataset = []
with open('/Users/suchanda/Desktop/workspace_rwth/rag/notebooks/cat/cat-facts.txt', 'r') as file:
  dataset = file.readlines()
  print(f'Loaded {len(dataset)} entries')


Loaded 319 entries


In [9]:
import ollama

EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

LANGUAGE_MODEL = 'deepseek-r1:1.5b'


# Each element in the VECTOR_DB will be a tuple (chunk, embedding)
# The embedding is a list of floats, for example: [0.1, 0.04, -0.34, 0.21, ...]
VECTOR_DB = []

def add_chunk_to_database(chunk):
  embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]
  VECTOR_DB.append((chunk, embedding))


In [10]:
for i, chunk in enumerate(dataset):
  add_chunk_to_database(chunk)
  print(f'Added chunk {i+1}/{len(dataset)} to the database')


Added chunk 1/319 to the database
Added chunk 2/319 to the database
Added chunk 3/319 to the database
Added chunk 4/319 to the database
Added chunk 5/319 to the database
Added chunk 6/319 to the database
Added chunk 7/319 to the database
Added chunk 8/319 to the database
Added chunk 9/319 to the database
Added chunk 10/319 to the database
Added chunk 11/319 to the database
Added chunk 12/319 to the database
Added chunk 13/319 to the database
Added chunk 14/319 to the database
Added chunk 15/319 to the database
Added chunk 16/319 to the database
Added chunk 17/319 to the database
Added chunk 18/319 to the database
Added chunk 19/319 to the database
Added chunk 20/319 to the database
Added chunk 21/319 to the database
Added chunk 22/319 to the database
Added chunk 23/319 to the database
Added chunk 24/319 to the database
Added chunk 25/319 to the database
Added chunk 26/319 to the database
Added chunk 27/319 to the database
Added chunk 28/319 to the database
Added chunk 29/319 to the dat

In [11]:
def cosine_similarity(a, b):
  dot_product = sum([x * y for x, y in zip(a, b)])
  norm_a = sum([x ** 2 for x in a]) ** 0.5
  norm_b = sum([x ** 2 for x in b]) ** 0.5
  return dot_product / (norm_a * norm_b)


In [12]:
def retrieve(query, top_n=3):
  # Fix: Replace 'chunk' with 'query' as the input parameter
  query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]

  # temporary list to store (chunk, similarity) pairs
  similarities = []
  for chunk, embedding in VECTOR_DB:
    similarity = cosine_similarity(query_embedding, embedding)
    similarities.append((chunk, similarity))

  # sort by similarity in descending order
  similarities.sort(key=lambda x: x[1], reverse=True)

  # return the top N most relevant chunks
  return similarities[:top_n]


In [13]:
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
  print(f' - (similarity: {similarity:.2f}) {chunk}')

# Create the chunks list first
chunks_text = '\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])

# Then use it in the main f-string
instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Also mention how confident you are about this. Don't make up any new information:
{chunks_text}
'''


Retrieved knowledge:
 - (similarity: 0.79) Cats speak to humans who speak to them. They also get their security from your voice, so watch your tone.

 - (similarity: 0.76) The more cats are spoken to, the more they will speak back. You will learn a lot from your cat’s wide vocabulary of chirps and meows.

 - (similarity: 0.76) The more cats are spoken to, the more they will speak to you.



In [14]:
stream = ollama.chat(
  model=LANGUAGE_MODEL,
  messages=[
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
  ],
  stream=True,
)

# print the response from the chatbot in real-time
print('Chatbot response:')
for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)


Chatbot response:
<think>
Alright, let's see what I need to do here. The user has given me three context pieces to use when answering their question about cats talking. They want me to tell them about cats talking and mention my confidence level. 

First, looking at the first context: "Cats speak to humans who speak to them. They also get their security from your voice, so watch your tone." That tells me that cats respond to human speech and that they guard their peace through proper voice tones.

The second context says, "The more cats are spoken to, the more they will speak back. You will learn a lot from your cat’s wide vocabulary of chirps and meows." This reinforces that both humans and cats can converse with lots of variety in sounds and words.

The third context is similar to the first: it states, "The more cats are spoken to, the more they will speak to you." So, cats also respond to human interactions, meaning they talk back when people talk. 

Putting this together, I should 