In [1]:

# Run Ollama service in the background using threading
import threading
import subprocess
import time
import json
def run_ollama_serve():
    subprocess.Popen(["ollama", "serve"])


import ollama

EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

LANGUAGE_MODEL = 'deepseek-r1:1.5b'


# Start Ollama in a separate thread
thread = threading.Thread(target=run_ollama_serve)
thread.start()

# Wait for the service to initialize
time.sleep(5)

Error: listen tcp 127.0.0.1:11434: bind: address already in use


In [2]:
# !ollama pull hf.co/CompendiumLabs/bge-base-en-v1.5-gguf
# !ollama pull hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF
# !ollama pull deepseek-r1:1.5b

In [3]:
with open("/Users/suchanda/Desktop/workspace_rwth/rag/data/test_kidney_lr_human_human_pmc_line_dict.json") as f:
    test_kidney_lr_human_human_pmc_line_dict = json.load(f)


In [4]:
test_kidney_lr_human_human_pmc_line_dict.keys()

dict_keys(['40236555', '40196581', '40224016'])

In [5]:
dataset = []

for v in test_kidney_lr_human_human_pmc_line_dict.values():
    dataset.extend(v)

In [6]:
dataset = [i for i in dataset if i]
dataset = list(set(dataset))
print(f"#lines: {len(dataset)}")

#lines: 260


In [7]:
dataset

['When the probe was applied to these nontransfected cells, no receptor labeling was observed.',
 'qPCR analysis demonstrated that there were no significant differences in gene expressions of the tubular injury, tubular integrity, macrophages, and fibrosis markers between WT and CCN1-KO mice (Figure\xa04H).',
 'We further examined the effect of Ccn1 deletion on the severe AKI model, 35\xa0min unilateral IRI (Figure\xa05A).',
 'pFAK+ cells accumulated around KIM1+ injured tubules only in WT mice, not in CCN1-KO mice, and bulk of interstitial pFAK+ cells were positive for PDGFRβ (Figure\xa04E).',
 'Immunofluorescence analysis demonstrated that PTs were labeled with tdTomato in both CCN1-KO and wild-type (WT) mice, and KIM1 expression was detected in both injured kidneys (Figure\xa04B).',
 'The discovery of potent M2R antagonists led to the synthesis of piperidine analogues, including 3, a highly potent and selective antagonist with poor pharmacokinetics.',
 'The 33 green signal is concen

In [8]:

VECTOR_DB = []

def add_chunk_to_database(chunk):
  embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]
  VECTOR_DB.append((chunk, embedding))


In [9]:
for i, chunk in enumerate(dataset):
  add_chunk_to_database(chunk)
  print(f'Added chunk {i+1}/{len(dataset)} to the database')


Added chunk 1/260 to the database
Added chunk 2/260 to the database
Added chunk 3/260 to the database
Added chunk 4/260 to the database
Added chunk 5/260 to the database
Added chunk 6/260 to the database
Added chunk 7/260 to the database
Added chunk 8/260 to the database
Added chunk 9/260 to the database
Added chunk 10/260 to the database
Added chunk 11/260 to the database
Added chunk 12/260 to the database
Added chunk 13/260 to the database
Added chunk 14/260 to the database
Added chunk 15/260 to the database
Added chunk 16/260 to the database
Added chunk 17/260 to the database
Added chunk 18/260 to the database
Added chunk 19/260 to the database
Added chunk 20/260 to the database
Added chunk 21/260 to the database
Added chunk 22/260 to the database
Added chunk 23/260 to the database
Added chunk 24/260 to the database
Added chunk 25/260 to the database
Added chunk 26/260 to the database
Added chunk 27/260 to the database
Added chunk 28/260 to the database
Added chunk 29/260 to the dat

In [10]:
def cosine_similarity(a, b):
  dot_product = sum([x * y for x, y in zip(a, b)])
  norm_a = sum([x ** 2 for x in a]) ** 0.5
  norm_b = sum([x ** 2 for x in b]) ** 0.5
  return dot_product / (norm_a * norm_b)


In [11]:
def retrieve(query, top_n=3):
  # Fix: Replace 'chunk' with 'query' as the input parameter
  query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]

  # temporary list to store (chunk, similarity) pairs
  similarities = []
  for chunk, embedding in VECTOR_DB:
    similarity = cosine_similarity(query_embedding, embedding)
    similarities.append((chunk, similarity))

  # sort by similarity in descending order
  similarities.sort(key=lambda x: x[1], reverse=True)

  # return the top N most relevant chunks
  return similarities[:top_n]


In [13]:
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query, top_n=10)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
  print(f' - (similarity: {similarity:.2f}) {chunk}')

# Create the chunks list first
chunks_text = '\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])

# Then use it in the main f-string
instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Also mention how confident you are about this. Don't make up any new information:
{chunks_text}
'''


Retrieved knowledge:
 - (similarity: 0.67) Design, Synthesis, and Evaluation of a New Fluorescent Ligand for the M<sub>2</sub> Muscarinic Acetylcholine Receptor.
 - (similarity: 0.65) Various fluorescent probes for muscarinic receptors prepared by the conjugation of a ligand with a fluorophore have been reported in the literature.
 - (similarity: 0.65) Competitive ligand binding measurements were performed using unlabeled M2R ligands, specifically atropine and compound 28 (Figure S3).
 - (similarity: 0.64) The ability of the compounds to antagonize the carbachol-induced G protein activation of M2R was assessed using the bioluminescence resonance energy transfer (BRET)-based TRUPATH GoA activation assay (Figure 4A,B).
 - (similarity: 0.64) Confocal microscopy results confirmed the selective labeling of 33 against M2 receptors.
 - (similarity: 0.63) The suitability of the fluorescent compound for BRET-based ligand binding measurements was also evaluated (Figure 4).
 - (similarity: 0.63) 

In [14]:
stream = ollama.chat(
  model=LANGUAGE_MODEL,
  messages=[
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
  ],
  stream=True,
)

# print the response from the chatbot in real-time
print('Chatbot response:')
for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)


Chatbot response:
<think>
Okay, so I need to figure out how to respond to this query. The user has provided several pieces of context about a study on M2 muscarinic acetylcholine receptors. They want me to list all different ligand receptor interactions and their downstream signaling, but without adding any new information or making up anything.

First, I should read through each context piece carefully. Let's see:

1. The study focuses on designing a new fluorescent ligand for the M2 muscarinic acetylcholine receptor.
2. There are various fluorescent probes made by conjugating the ligand with fluorophores and reported in literature.
3. Competitive measurements were done using unlabeled M2R ligands, specifically atropine and compound 28, as shown in Figure S3.
4. The ligands were tested against G protein activation of M2R using BRET-based TRUPATH assay results (Figure 4A,B).
5. Confocal microscopy confirmed labeling on 33 against M2 receptors.
6. Suitability for BRET measurements was a

In [14]:
#what are the downstream signalling for differnet protein interactions and mention the cell types too

In [None]:
#List all different ligand receptor interactions and their downstream signalling