# Outer Feedback loop implementation single Thread

In [7]:
import json
import torch
import os
import random
import openai
import pandas as pd
import numpy as np
from tqdm import tqdm

from pathlib import Path
from openai import OpenAI
from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db
from agatha.construct.semrep_handler import SemRepHandler
# protobuf versioning issue, need to compile and ..

In [8]:
import warnings
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## 1. Ingestion

#### 1.1. Visualizer Documents sub-domain Agatha Knowledge Graph.

In [9]:
with open('hand_agtr1_pmid_sents_dict.json', 'r') as f:
    pmid_sents_dict = json.load(f)

In [10]:
pmid_sents_dict['33682799']

{'s:33682799:1:0': 'Aging delay: of mice and men. ',
 's:33682799:1:1': 'The evaluation of the safety of a drug in rodents that may be used as geroprotectors is a challenge of current times. ',
 's:33682799:1:2': 'In the paper, we discuss approaches to long-term assays for selection of potent aging delay drugs for humans. ',
 's:33682799:1:3': 'Priority is given to methods combining evaluation of carcinogenic safety and life-spanning potential. ',
 's:33682799:1:4': 'The use of such methods will be time-efficient and economically feasible. '}

In [11]:
len(pmid_sents_dict)

2379

### 1.2. Agatha model

In [17]:
workingFolder = \
    '/lustre/acslab/shared/Agatha_shared/2021_11_22'

model_path = f'{workingFolder}/model_epoch5_darwin.pt'
embedding_path = f'{workingFolder}/embeddings/predicate_subset/'
entity_db = f'{workingFolder}/predicate_entities.sqlite3'
graph_db = f'{workingFolder}/predicate_graph.sqlite3'

In [18]:
model = torch.load(model_path)

# Configure auxilary data paths
model.configure_paths(
  embedding_dir=embedding_path,
  entity_db=entity_db,
  graph_db=graph_db,
)

model = model.eval()
#model.preload()

c5203670:COVID 19 Virus Infection
c4025218:Large vessel vasculitis; Vasculitis of large artery (disorder)

In [19]:
model.predict_from_terms([('m:c5203670', 'm:c4025218')])

[0.7810704708099365]

Umls term: C1439284: AGTR1 C4285693: HAND (HIV-1-associated neurocognitive disorders) 
C4285693 C4285693 C5208104

In [22]:
model.predict_from_terms([('m:c1439284', 'm:c4285693')])

[0.5539675354957581]

### 1.3. Semantic-Type Negative Sampling

In [28]:
with open(
    '/lustre/acslab/shared/Agatha_shared/2021_11_22/2021_11_22_semtypes_to_nodelbl_and_vv_dict.json', 'r'
) as f:
    st_nodelbl_dict = json.load(f)

In [29]:
for i, (key, value) in enumerate(st_nodelbl_dict.items()):
    print(f"{key}: {value}")
    if i == 7:
        break

m:c0000005: Pharmacologic Substance
m:c0000039: Organic Chemical
m:c0000052: Amino Acid, Peptide, or Protein
m:c0000084: Amino Acid, Peptide, or Protein
m:c0000096: Organic Chemical
m:c0000097: Organic Chemical
m:c0000098: Organic Chemical
m:c0000102: Organic Chemical


In [30]:
len(st_nodelbl_dict)

414961

#### 1.4. facts_set

In [23]:
nodelist = model.graph.keys()

In [25]:
pred_pairs_list = [
    p for p in tqdm(nodelist) if p[0] == 'p'
]

100%|██████████████████████████████████████████████████| 26953485/26953485 [00:11<00:00, 2384444.03it/s]


In [26]:
pred_cui_pairs_set = set()

for p in tqdm(pred_pairs_list):
    p_split = p.upper().split(':')
    
    s = p_split[1]
    o = p_split[-1]
    
    pred_cui_pairs_set.add(
        tuple(sorted([s,o]))
    )

100%|███████████████████████████████████████████████████| 26535915/26535915 [00:49<00:00, 538332.99it/s]


In [27]:
len(pred_cui_pairs_set)

19086440

### 1.5. Semrep Handler => Identify Predicates

In [31]:
nlm_soft_folder = '/lustre/acslab/users/3281/SemRep'
sr_temp_folder = '/lustre/acslab/users/3281/semrep_temp2'
sr_replace_utf8_path = '/lustre/acslab/users/3281/SemRep/replace_utf8.jar'

In [32]:
t = SemRepHandler(
    nlm_soft_path=nlm_soft_folder,
    temp_folder=sr_temp_folder,
    #restart_mm_services=True,
    replace_utf8_path=sr_replace_utf8_path,
)

In [33]:
t.sr_binary_path = Path('/lustre/acslab/users/3281/SemRep/public_semrep/bin/semrep.v1.9_2021AB')

In [34]:
t.ProcessList_parallel(['HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene'])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


{'s:user_input_0:1': {'sent_text': 'HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene ',
  'terms': [{'CID': 'C0018563',
    'pref_name': 'Hand',
    'extracted_text': 'HAND',
    'label': 'UMLS',
    'sem_types': ['bpoc'],
    'negated': False},
   {'CID': 'C4285693',
    'pref_name': 'HIV-associated neurocognitive disorder',
    'extracted_text': 'HIV-1-associated neurocognitive disorders',
    'label': 'UMLS',
    'sem_types': ['dsyn'],
    'negated': False},
   {'CID': 'C1439284',
    'pref_name': 'AGTR1 gene',
    'extracted_text': 'AGTR1 gene',
    'label': 'UMLS',
    'sem_types': ['gngm'],
    'negated': False}],
  'relations': []}}

In [35]:
pd.DataFrame(t.ProcessList_parallel(['HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene'])['s:user_input_0:1']['terms'])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


Unnamed: 0,CID,pref_name,extracted_text,label,sem_types,negated
0,C0018563,Hand,HAND,UMLS,[bpoc],False
1,C4285693,HIV-associated neurocognitive disorder,HIV-1-associated neurocognitive disorders,UMLS,[dsyn],False
2,C1439284,AGTR1 gene,AGTR1 gene,UMLS,[gngm],False


In [36]:
pd.DataFrame(t.ProcessList_parallel(['HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene'])['s:user_input_0:1']['relations'])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


## 2. Retrieval

#### 2.1. Merging all pmids into single `.txt` files

In [37]:
txt_save_dir = Path('fb_loop_files/pmids_txt/')

In [38]:
for k,v in tqdm(pmid_sents_dict.items()):
    save_fname = f'{k}.txt'
    
    save_fpath = txt_save_dir.joinpath(save_fname)
    
    with open(save_fpath, 'w') as f:
        f.write(
            ''.join(v.values())
        )

100%|██████████████████████████████████████████████████████████████| 2379/2379 [00:12<00:00, 186.33it/s]


In [40]:
# pmid_sents_dict.items()

#### 2.2. AutoGen Configuring chromadb vector retrieval

In [41]:
chromadb_api = create_vector_db_from_dir(
    dir_path='fb_loop_files/pmids_txt/',
    db_path='fb_loop_files/chromadb_temp/',
    collection_name='hand_agtr1',
    embedding_model='all-MiniLM-L6-v2',
    get_or_create=True, # If True, the collection will be returned if it already exists
)

In [19]:
sample_query = 'HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene'

In [43]:
%%time
chromadb_resp = query_vector_db(
    query_texts=['HAND (HIV-1-associated neurocognitive disorders) and AGTR1 gene'],
    n_results=3,
    #client=chromadb_api,
    db_path='fb_loop_files/chromadb_temp/',
    collection_name='hand_agtr1',
)

CPU times: user 447 ms, sys: 5.52 ms, total: 452 ms
Wall time: 22.3 ms


In [44]:
chromadb_resp['documents'][:2]

[['HIV Associated Neurocognitive Disorders. Human immunodeficiency virus type 1 is associated with the development of neurocognitive disorders in many infected individuals, including a broad spectrum of motor impairments and cognitive deficits. Despite extensive research, the pathogenesis of HIV-associated neurocognitive disorders (HAND) is still not clear. This review provides a comprehensive view of HAND, including HIV neuroinvasion, HAND diagnosis and different level of disturbances, influence of highly-active antiretroviral therapy to HIV-associated dementia (HAD), possible pathogenesis of HAD, etc. Together, this review will give a thorough and clear understanding of HAND, especially HAD, which will be vital for future research, diagnosis and treatment. ',
  'A candidate gene study of intermediate histopathological phenotypes in HIV-associated neurocognitive disorders. HIV-associated neurocognitive disorders (HAND) describe a spectrum of neuropsychological impairment caused by HIV

In [45]:
def query_chroma_db(query_str, top_n):
    chromadb_resp = query_vector_db(
        query_texts=[query_str],
        n_results=top_n,
        #client=chromadb_api,
        db_path='fb_loop_files/chromadb_temp/',
        collection_name='hand_agtr1',
    )
    
    return chromadb_resp['documents'][0]

In [63]:
query_result_200 = query_vector_db(
query_texts=['HAND (HIV-1-associated neurocognitive disorders) and AGTR1 gene'],
n_results=200,
#client=chromadb_api,
db_path='fb_loop_files/chromadb_temp/',
collection_name='hand_agtr1',
)

## 3.Feedback loop logic

In [46]:
st_nodelbl_dict['Organic Chemical'][:3]

['m:c0000039', 'm:c0000096', 'm:c0000097']

In [47]:
def generate_negatives(pos_pair, sample_rate=5):
    
    s = pos_pair[0]
    o = pos_pair[1]
    
    if s[0] != 'm':
        s = f'm:{pos_pair[0].lower()}'
    
    if o[0] != 'm':
        o = f'm:{pos_pair[1].lower()}'
    
    if o in st_nodelbl_dict:
        o_st = st_nodelbl_dict[o]
        o_sample_list = random.sample(
            st_nodelbl_dict[o_st],
            sample_rate
        )
    else:
        o_sample_list = []
    
    out_pairs = []
    for o_neg in o_sample_list:
        out_pairs.append((s, o_neg))
        
    return out_pairs

In [48]:
generate_negatives(('m:c0000039', 'm:c0000096'))

[('m:c0000039', 'm:c0764801'),
 ('m:c0000039', 'm:c0720810'),
 ('m:c0000039', 'm:c0393004'),
 ('m:c0000039', 'm:c0720319'),
 ('m:c0000039', 'm:c0007549')]

In [49]:
def eval_pair(pair, sample_rate = 10):
    
    pair_negs = generate_negatives(pair, sample_rate=sample_rate)
    
    agatha_queries = [pair] + pair_negs
    
    pair_labels = [1] + [0]*len(pair_negs)
    
    scores = model.predict_from_terms(agatha_queries)
    
    res_list = sorted(
        list(zip(scores, pair_labels)),
        key=lambda x: x[0],
        reverse=True
    )
    
    rank = None
    for i, (score, lbl) in enumerate(res_list):
        if lbl == 1:
            rank = i
            break
    
    return {
        'scores': res_list,
        'pos_rank': rank + 1
    }

In [50]:
sc_list = eval_pair(('C5203670', 'C0044419'))

### Part 1: process full explanation with oai_get_response LLM (GPT4) semrep

In [51]:
llm_model = "gpt-4-0125-preview"

In [52]:
open('OAI_CONFIG_LIST').read().strip()

'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'

In [53]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=open('OAI_CONFIG_LIST').read().strip(),
)

In [54]:
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Hi, how are you?",
        }
    ],
    model=llm_model,
    temperature=0,
)

In [55]:
chat_completion.choices[0].message.content

"I'm just a computer program, so I don't have feelings, but thanks for asking! How can I assist you today?"

In [56]:
def oai_get_response(msg, temp):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": msg,
            }
        ],
        model=llm_model,
        temperature=temp,
    )
    
    reply_text = chat_completion.choices[0].message.content
    
    return reply_text

#### 2.4. OuterLoop Impl to select top-n closest docs based on(distance/cosine similarity)

In [57]:
import os
from pathlib import Path

def read_text_files(directory):
    files_content = []
    directory_path = Path(directory)

    for file in directory_path.iterdir():
        if file.is_file() and file.suffix == '.txt':
            with open(file, 'r', encoding='utf-8') as f:
                files_content.append(f.read())
    
    return files_content

In [58]:
directory = '/lustre/acslab/users/3281/LLM_Expalainability/Ilya_Feedback_loop/fb_loop_files/pmids_txt/'
content_list = read_text_files(directory)
len(content_list)

2379

In [59]:
content_list[:3]

['HIV-associated neurocognitive disorder in HIV-infected Koreans: the Korean NeuroAIDS Project. HIV-associated neurocognitive disorder (HAND) is an independent predictor of early mortality and is associated with many difficulties in activities of daily living. We sought to determine the prevalence of and risk factors for HAND in HIV-infected Koreans. In addition, we investigated the performance of screening tools and components of neuropsychological (NP) tests for diagnosing HAND. HIV-infected patients were enrolled consecutively from two different urban teaching hospitals in Seoul, South Korea between March 2012 and September 2012. Participants completed a detailed NP assessment of six cognitive domains commonly affected by HIV. The Frascati criteria were used for diagnosing HAND. Four key questions, the International HIV Dementia Scale (IHDS) and Montreal Cognitive Assessment (MoCA)-K were also assessed as potential tools for screening for HAND. Among the 194 participants, the preval

### 2.5. Generate Prompt text + Context

In [60]:
llm_fix_prompt= 'How would you describe a relationship between HAND and AGTR1 gene given the following scientific abstracts as context?'
llm_fix_prompt_str = '\n\n'.join(content_list[:3])
llm_fix_prompt_combo = llm_fix_prompt + '\n' + llm_fix_prompt_str
llm_fix_prompt_combo

"How would you describe a relationship between HAND and AGTR1 gene given the following scientific abstracts as context?\nHIV-associated neurocognitive disorder in HIV-infected Koreans: the Korean NeuroAIDS Project. HIV-associated neurocognitive disorder (HAND) is an independent predictor of early mortality and is associated with many difficulties in activities of daily living. We sought to determine the prevalence of and risk factors for HAND in HIV-infected Koreans. In addition, we investigated the performance of screening tools and components of neuropsychological (NP) tests for diagnosing HAND. HIV-infected patients were enrolled consecutively from two different urban teaching hospitals in Seoul, South Korea between March 2012 and September 2012. Participants completed a detailed NP assessment of six cognitive domains commonly affected by HIV. The Frascati criteria were used for diagnosing HAND. Four key questions, the International HIV Dementia Scale (IHDS) and Montreal Cognitive A

In [65]:
def get_documents_with_scores(content_list):
    results_with_scores = []

    # Iterate over each set of results corresponding to each query text
    for doc_list, score_list in zip(query_result_200['documents'], query_result_200['distances']):
        
        # Pair each document with its corresponding score
        paired_results = zip(doc_list, score_list)
        results_with_scores.extend(paired_results)

    return results_with_scores

In [66]:
documents_scores = get_documents_with_scores(content_list)
# content_list query_results
documents_scores[:6]

[('HIV Associated Neurocognitive Disorders. Human immunodeficiency virus type 1 is associated with the development of neurocognitive disorders in many infected individuals, including a broad spectrum of motor impairments and cognitive deficits. Despite extensive research, the pathogenesis of HIV-associated neurocognitive disorders (HAND) is still not clear. This review provides a comprehensive view of HAND, including HIV neuroinvasion, HAND diagnosis and different level of disturbances, influence of highly-active antiretroviral therapy to HIV-associated dementia (HAD), possible pathogenesis of HAD, etc. Together, this review will give a thorough and clear understanding of HAND, especially HAD, which will be vital for future research, diagnosis and treatment. ',
  0.21835744380950928),
 ('A candidate gene study of intermediate histopathological phenotypes in HIV-associated neurocognitive disorders. HIV-associated neurocognitive disorders (HAND) describe a spectrum of neuropsychological 

In [67]:
# Unpack Separate documents and their scores
documents, scores = zip(*documents_scores)

# Use scores as weights for random selection
selected_documents = random.choices(documents, weights=scores, k=6)

# print(selected_documents)
len(selected_documents)
# len(scores)

6

In [68]:
selected_documents[:3]

['Neurocognitive impairment in HIV-infected naive patients with advanced disease: the role of virus and intrathecal immune activation. To investigate intrathecal immune activation parameters and HIV-RNA in HIV-associated neurocognitive disorders (HAND) of advanced naive HIV-infected patients and to evaluate their dynamics before and after initiation of antiretroviral therapy (ART). Cross-sectional and longitudinal analysis of HIV RNA, proinflammatory cytokines (IL-6, IL-10, INF-gamma, TNF-alpha, TGF-beta1, and TGF-beta2) and chemokines (MIP-1alpha, MIP-1beta, and MCP-1) in plasma and cerebrospinal fluid (CSF) of HIV-infected patients with CD4 <200/muL. HAND was diagnosed at baseline in 6/12 patients. Baseline CSF HIV-RNA was comparable in patients with or without HAND, whereas CSF concentration of IL-6 and MIP-1beta, proinflammatory cytokines, was increased in HAND patients. CSF evaluation at 12 weeks was available in 10/12 cases. ART greatly reduced HIV-RNA in all patients. Neverthele

#### Generate from in 5 different Iteration Explanantion

In [69]:
def generate_explanations(content_list, llm_fix_prompt, oai_get_response, num_iterations=5, first_n_abstracts=3):
    for iteration in range(num_iterations):
        # Select abstracts based on iteration
        if iteration == 0:
            selected_abstracts = content_list[:first_n_abstracts]
        else:
            selected_abstracts = random.sample(content_list, first_n_abstracts)
            query_chroma_db("neurocognitive effects of HIV", 5)
        
        llm_fix_prompt_str = '\n\n'.join(selected_abstracts)
        llm_fix_prompt_combo = llm_fix_prompt + '\nContext:\n' + llm_fix_prompt_str

        response = oai_get_response(llm_fix_prompt_combo, 0)

        print(f"Iteration {iteration} Prompt:\n{llm_fix_prompt_combo}\n")
        print(f" Response Iteration {iteration} Response:\n{response}\n")
        print("----------------------------------------------------\n")

generate_explanations(content_list, llm_fix_prompt, oai_get_response)

Iteration 0 Prompt:
How would you describe a relationship between HAND and AGTR1 gene given the following scientific abstracts as context?
Context:
HIV-associated neurocognitive disorder in HIV-infected Koreans: the Korean NeuroAIDS Project. HIV-associated neurocognitive disorder (HAND) is an independent predictor of early mortality and is associated with many difficulties in activities of daily living. We sought to determine the prevalence of and risk factors for HAND in HIV-infected Koreans. In addition, we investigated the performance of screening tools and components of neuropsychological (NP) tests for diagnosing HAND. HIV-infected patients were enrolled consecutively from two different urban teaching hospitals in Seoul, South Korea between March 2012 and September 2012. Participants completed a detailed NP assessment of six cognitive domains commonly affected by HIV. The Frascati criteria were used for diagnosing HAND. Four key questions, the International HIV Dementia Scale (IHD

In [37]:
with open('hand_agtr1_test_expl.txt', 'r') as f:
    expl_raw = f.read().replace('\n', '')

In [38]:
expl_raw

'Based on the provided abstracts, the relationship between the HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene (which encodes the angiotensin II receptor type 1) appears to be centered around the role of the AGTR1 gene in the neuroinflammatory processes that are implicated in HAND.Here are the key points from the abstracts that pertain to the AGTR1 gene:    Neuroinflammation and HAND: HAND is associated with neuroinflammation, which is a common consequence of chronic HIV infection within the CNS. AGTR1-mediated pathways are implicated in this inflammatory process.    Angiotensin II Receptor Type 1 (AT1R) and Neuroinflammation: AGTR1, through its product, the AT1R, is involved in modulating microglia activation and neuroinflammation. Angiotensin II, the ligand for AT1R, induces microglia activation, affecting the function of neurons.    Modulation of AT1R to Mitigate Neuroinflammation: Strategies that modulate AT1R, such as the use of inhibitors like losartan or cand

### 4. Evaluation: Each sentence according Agatha Pos-Rank

In [70]:
def evaluate_llm_resp(llm_resp_str):

    expl_parts_list = []
    
    expl_sr_out = t.ProcessList_parallel([llm_resp_str], nthreads=1)

    for s_id, sent_data in expl_sr_out.items():
        sent_text = sent_data['sent_text']

        status = 'ok'
        bad_predicates = []
        good_predicates = []
        for rel in sent_data['relations']:

            rel_subj = rel['subj_text']
            v = rel['verb']
            rel_obj = rel['obj_text']

            #pred = [rel_subj, v, rel_obj]
            pred = [rel_subj, rel_obj]

            rel_pair = tuple(
                sorted(
                    [
                        rel['subj_id'],
                        rel['obj_id']
                    ]
                )
            )
            if rel_pair in pred_cui_pairs_set:
                good_predicates.append(pred)
            if rel_pair not in pred_cui_pairs_set:
                #status = 'REWORK'
                subj_id = f"m:{rel['subj_id'].lower()}"
                obj_id = f"m:{rel['obj_id'].lower()}"
                if subj_id in model.graph and obj_id in model.graph:
                    #agatha_score = model.predict_from_terms([[subj_id, obj_id]])[0]
                    agatha_rank = eval_pair([subj_id, obj_id], sample_rate=20)['pos_rank']
                    pred.append(agatha_rank)
                    if agatha_rank > 1:
                        status = 'REWORK'
                        bad_predicates.append(pred)
                    else:
                        good_predicates.append(pred)


        print('Sentence:\n', sent_text)
        print('Status:\n', status)
        if status != 'ok':
            print('Reason:')
            for p in bad_predicates:
                print('\t', p, 'not in AGATHA KB')
        else:
            if len(good_predicates):
                print('Recognized:')
            for p in good_predicates:
                print('\t', p, 'in AGATHA KB')

        print('\n-----\n')

        expl_parts_list.append(
            {
                'sent_text': sent_text,
                'status': status,
                'bad_predicates': bad_predicates,
                'good_predicates': good_predicates
            }
        )
        
    return expl_parts_list

In [40]:
expl_parts_list = evaluate_llm_resp(expl_raw)

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...
Sentence:
 Based on the provided abstracts, the relationship between the HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene (which encodes the angiotensin II receptor type 1) appears to be centered around the role of the AGTR1 gene in the neuroinflammatory processes that are implicated in HAND. 
Status:
 ok

-----

Sentence:
 Here are the key points from the abstracts that pertain to the AGTR1 gene:    Neuroinflammation and HAND: HAND is associated with neuroinflammation, which is a common consequence of chronic HIV infection within the CNS. 
Status:
 ok

-----

Sentence:
 AGTR1-mediated pathways are implicated in this inflammatory process. 
Status:
 ok

-----



NameError: name 'eval_pair' is not defined

In [212]:
expl_parts_enum_dict = dict(enumerate(expl_parts_list))

In [214]:
max(expl_parts_enum_dict)

10

In [297]:
expl_parts_enum_dict

{0: {'sent_text': 'Based on the provided abstracts, the relationship between the HAND (HIV-1-associated neurocognitive disorders) and the AGTR1 gene (which encodes the angiotensin II receptor type 1) appears to be centered around the role of the AGTR1 gene in the neuroinflammatory processes that are implicated in HAND. ',
  'status': 'ok',
  'bad_predicates': []},
 1: {'sent_text': 'Here are the key points from the abstracts that pertain to the AGTR1 gene:    Neuroinflammation and HAND: HAND is associated with neuroinflammation, which is a common consequence of chronic HIV infection within the CNS. ',
  'status': 'ok',
  'bad_predicates': []},
 2: {'sent_text': 'AGTR1-mediated pathways are implicated in this inflammatory process. ',
  'status': 'ok',
  'bad_predicates': []},
 3: {'sent_text': 'Angiotensin II Receptor Type 1 (AT1R) and Neuroinflammation: AGTR1, through its product, the AT1R, is involved in modulating microglia activation and neuroinflammation. ',
  'status': 'ok',
  'ba

In [258]:
expl_parts_df = pd.DataFrame(expl_parts_list).reset_index()
expl_parts_df['reworked_part'] = None
expl_parts_df

Unnamed: 0,index,sent_text,status,bad_predicates,reworked_part
0,0,"Based on the provided abstracts, the relations...",ok,[],
1,1,Here are the key points from the abstracts tha...,ok,[],
2,2,AGTR1-mediated pathways are implicated in this...,ok,[],
3,3,Angiotensin II Receptor Type 1 (AT1R) and Neur...,ok,[],
4,4,"Angiotensin II, the ligand for AT1R, induces m...",REWORK,"[[AT1R, microglia activation, 2]]",
5,5,Modulation of AT1R to Mitigate Neuroinflammati...,ok,[],
6,6,They can reduce neuroinflammation and thereby ...,ok,[],
7,7,Endothelin A2 (EPA2) and AT1R: One study sugge...,ok,[],
8,8,Therapeutic agents that increase EPA2 expressi...,ok,[],
9,9,Therapeutic Potential: The inhibition of AGTR1...,REWORK,"[[HAND, pathogenesis, 9]]",


### 4.3.Part 2: reworking bad parts

In [124]:
source_str = 'HAND (HIV-1-associated neurocognitive disorder)'
target_str = 'AGTR1 gene'
top_n_abstr = 3

In [300]:
t.ProcessList_parallel(['HAND'])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


{'s:user_input_0:1': {'sent_text': 'HAND ',
  'terms': [{'CID': 'C0018563',
    'pref_name': 'Hand',
    'extracted_text': 'HAND',
    'label': 'UMLS',
    'sem_types': ['bpoc'],
    'negated': False}],
  'relations': []}}

In [301]:
t.ProcessList_parallel(['HAND (HIV-1-associated neurocognitive disorder)'])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


{'s:user_input_0:1': {'sent_text': 'HAND (HIV-1-associated neurocognitive disorder) ',
  'terms': [{'CID': 'C0018563',
    'pref_name': 'Hand',
    'extracted_text': 'HAND',
    'label': 'UMLS',
    'sem_types': ['bpoc'],
    'negated': False},
   {'CID': 'C4285693',
    'pref_name': 'HIV-associated neurocognitive disorder',
    'extracted_text': 'HIV-1-associated neurocognitive disorder',
    'label': 'UMLS',
    'sem_types': ['dsyn'],
    'negated': False}],
  'relations': []}}

#### Sentence processing prompt

In [171]:
llm_fix_prompt = (
    'You are a helpful assistant, who summarizes biomedical knowledge from the provided context.'
    'to generate a plausible explanation of potential connection between {source_str} and {target_str}, '
    'which is shown to be promising.'
    'Your explanation is evaluated by a biology expert, who provides you a feedback and the appropriate context'
    'in a form of scientific abstracts.'
    'From your previous response, it seems that the following part you generated: \n "{bad_sentence}" \n is wrong.'
    'Please rework it by summarizing the context provided by the biology expert. \n\nCONTEXT:\n\n {context_abstr_str}'
    'As a response, return only the summary of the provided context as one or two sentences.'
)

In [272]:
llm_fix_inb_prompt = (
    'You are a helpful assistant, who summarizes biomedical knowledge from the provided context.'
    'to generate a plausible explanation of potential connection between {source_str} and {target_str}, '
    'which is shown to be promising.'
    'Your explanation is evaluated by a biology expert, who provides you a feedback and the appropriate context'
    'in a form of scientific abstracts.'
    'From your previous response, it seems that the following part you generated: \n "{bad_sentence}" \n is wrong.'
    'Please rework it by summarizing the context provided by the biology expert. \n\nCONTEXT:\n\n {context_abstr_str}'
    '\n\nAs a response, return only a short summary of the provided context as one or two sentences focusing'
    'on biomedical concepts.'
    'Keep in mind that the part that you are reworking should connect the previous and the next'
    'parts of the explanation.'
    '\nPrevious part: "{prev_sent}"'
    '\n<YOUR SUMMARY IS HERE>'
    '\nNext part: "{next_sent}"'
    '\nReturn only YOUR SUMMARY.'
)

In [33]:
current_sent_to_fix = bad_sents_list[0]
current_sent_to_fix

NameError: name 'bad_sents_list' is not defined

In [32]:
cur_top_n_context_abstr_list = query_chroma_db(bad_sents_list[0], top_n_abstr)

NameError: name 'bad_sents_list' is not defined

In [174]:
cur_top_n_context_abstr_str = '\n\n'.join(cur_top_n_context_abstr_list)

In [175]:
llm_fix_prompt_filled = llm_fix_prompt.format(
    source_str=source_str,
    target_str=target_str,
    bad_sentence=current_sent_to_fix,
    context_abstr_str=cur_top_n_context_abstr_str,
)

In [176]:
print(llm_fix_prompt_filled)

You are a helpful assistant, who summarizes biomedical knowledge from the provided context.to generate a plausible explanation of potential connection between HAND (HIV-1-associated neurocognitive disorder) and AGTR1 gene, which is shown to be promising.Your explanation is evaluated by a biology expert, who provides you a feedback and the appropriate contextin a form of scientific abstracts.From your previous response, it seems that the following part you generated: 
 "Angiotensin II, the ligand for AT1R, induces microglia activation, affecting the function of neurons. " 
 is wrong.Please rework it by summarizing the context provided by the biology expert. 

CONTEXT:

 Biochanin A protects against angiotensin II-induced damage of dopaminergic neurons in rats associated with the increased endophilin A2 expression. The brain renin-angiotensin system plays a vital role in the modulation of the neuroinflammatory responses and the progression of dopaminergic (DA) degeneration. Angiotensin I

In [167]:
%%time
llm_resp = oai_get_response(llm_fix_prompt_filled)

CPU times: user 11.1 ms, sys: 2.03 ms, total: 13.1 ms
Wall time: 2.36 s


In [177]:
llm_resp

'The angiotensin II type 1 receptor (AT1R) is involved in the activation of microglia, which affects the function of dopaminergic neurons. Biochanin A, a phytoestrogen, has been shown to protect against angiotensin II-induced damage to dopaminergic neurons by increasing the expression of endophilin A2 (EPA2) and decreasing the expression of AT1R.'

In [178]:
expl_fixed_sr_out = t.ProcessList_parallel([llm_resp])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


In [187]:
llm_resp_evald_sent_list = evaluate_llm_resp(llm_resp)

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...
Sentence:
 The angiotensin II type 1 receptor (AT1R) is involved in the activation of microglia, which affects the function of dopaminergic neurons. 
Status:
 ok
Recognized:
	 ['angiotensin II type 1 receptor', 'microglia'] in AGATHA KB
	 ['microglia', 'dopaminergic neurons'] in AGATHA KB

-----

Sentence:
 Biochanin A, a phytoestrogen, has been shown to protect against angiotensin II-induced damage to dopaminergic neurons by increasing the expression of endophilin A2 (EPA2) and decreasing the expression of AT1R. 
Status:
 ok

-----



In [188]:
llm_resp_evald_sent_list

[{'sent_text': 'The angiotensin II type 1 receptor (AT1R) is involved in the activation of microglia, which affects the function of dopaminergic neurons. ',
  'status': 'ok',
  'bad_predicates': [],
  'good_predicates': [['angiotensin II type 1 receptor', 'microglia'],
   ['microglia', 'dopaminergic neurons']]},
 {'sent_text': 'Biochanin A, a phytoestrogen, has been shown to protect against angiotensin II-induced damage to dopaminergic neurons by increasing the expression of endophilin A2 (EPA2) and decreasing the expression of AT1R. ',
  'status': 'ok',
  'bad_predicates': [],
  'good_predicates': []}]

In [308]:
def rework_explanation_part(
    wrong_part_str,
    inbetw=False,
    cur_sent_idx=None,
    print_prompt=False,
    temp=0.3,
    top_n_abstr=3,
):
    
    print(f'Reworking the following part:\n---\n{wrong_part_str}\n---')
    
    print(f'Getting top {top_n_abstr} abstracts...')
    
    cur_top_n_context_abstr_list = query_chroma_db(
        wrong_part_str,
        top_n_abstr
    )
    cur_top_n_context_abstr_str = '\n\n'.join(cur_top_n_context_abstr_list)
    
    if inbetw:
        print('Taking into account surrounding sentences...')
        prev_sent = expl_parts_enum_dict[cur_sent_idx-1]['sent_text']
        next_sent = expl_parts_enum_dict[cur_sent_idx+1]['sent_text']
        
        llm_fix_prompt_filled = llm_fix_inb_prompt.format(
            source_str=source_str,
            target_str=target_str,
            bad_sentence=wrong_part_str,
            context_abstr_str=cur_top_n_context_abstr_str,
            prev_sent=prev_sent,
            next_sent=next_sent,
        )
    else:
        llm_fix_prompt_filled = llm_fix_prompt.format(
            source_str=source_str,
            target_str=target_str,
            bad_sentence=wrong_part_str,
            context_abstr_str=cur_top_n_context_abstr_str,
        )
    
    
    if print_prompt:
        print(f'Querying LLM with the prompt:')
        print(llm_fix_prompt_filled)
    
    llm_resp = oai_get_response(llm_fix_prompt_filled, temp=temp)
    
    print(f'Evaluating LLM response...')
    llm_resp_evald_sent_list = evaluate_llm_resp(llm_resp)
    
    return llm_resp_evald_sent_list

### Constructing simple loop

In [314]:
max_while_iter = 3
oai_temp = 0.5
top_n_abstr = 3

In [315]:
bad_part_idxs = [k for k,v in expl_parts_enum_dict.items() if v['status'] != 'ok']
bad_part_idxs

[4, 9]

In [324]:
pd.DataFrame(expl_parts_enum_dict).T

Unnamed: 0,sent_text,status,bad_predicates
0,"Based on the provided abstracts, the relations...",ok,[]
1,Here are the key points from the abstracts tha...,ok,[]
2,AGTR1-mediated pathways are implicated in this...,ok,[]
3,Angiotensin II Receptor Type 1 (AT1R) and Neur...,ok,[]
4,"Angiotensin II, the ligand for AT1R, induces m...",REWORK,"[[AT1R, microglia activation, 2]]"
5,Modulation of AT1R to Mitigate Neuroinflammati...,ok,[]
6,They can reduce neuroinflammation and thereby ...,ok,[]
7,Endothelin A2 (EPA2) and AT1R: One study sugge...,ok,[]
8,Therapeutic agents that increase EPA2 expressi...,ok,[]
9,Therapeutic Potential: The inhibition of AGTR1...,REWORK,"[[HAND, pathogenesis, 9]]"


In [316]:
for bad_idx in bad_part_idxs:
    
    cur_iter = 1
    status_reworked = 'ok'
    rd_list = rework_explanation_part(
        expl_parts_enum_dict[bad_idx]['sent_text'],
        inbetw=True,
        cur_sent_idx=bad_idx,
        top_n_abstr=top_n_abstr,
    )

    for s in rd_list:
        if s['status'] != 'ok':
            status_reworked = 'REWORK'
            
    print(f'\n\n> > > CURRENT STATUS FOR IDX {bad_idx}: ', status_reworked)
    reworked_str = ''.join([s['sent_text'] for s in rd_list])
    #print('LLM GENERATED: ', reworked_str)
    print('Current iter:', cur_iter)
    print('---\n---\n---')
    
    while (status_reworked != 'ok') and cur_iter <= max_while_iter:
        cur_iter += 1
        status_reworked = 'ok'
        rd_list = rework_explanation_part(
            reworked_str,
            inbetw=True,
            cur_sent_idx=bad_idx,
            temp=oai_temp,
            top_n_abstr=top_n_abstr+cur_iter,
        )
        
        for s in rd_list:
            if s['status'] != 'ok':
                status_reworked = 'REWORK'
        print(f'\n\n> > > CURRENT STATUS FOR IDX {bad_idx}: ', status_reworked)
        reworked_str = ''.join([s['sent_text'] for s in rd_list])
        #print('LLM GENERATED: ', reworked_str)
        print('Current iter:', cur_iter)
        print('---\n---\n---')
    
    expl_parts_df['reworked_part'][bad_idx] = reworked_str
    print('\n\n\n----------------------------\n\n\n')

Reworking the following part:
---
Angiotensin II, the ligand for AT1R, induces microglia activation, affecting the function of neurons. 
---
Getting top 3 abstracts...
Taking into account surrounding sentences...
Querying LLM with the prompt:
Evaluating LLM response...
Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...
Sentence:
 Biochanin A, a phytoestrogen, has been shown to protect against angiotensin II-induced damage to dopaminergic neurons in rats by increasing endophilin A2 expression and decreasing AT1R expression. 
Status:
 REWORK
Reason:
	 ['Biochanin A', 'endophilin A2', 2] not in AGATHA KB

-----

Sentence:
 This suggests that modulating AT1R may be a potential strategy to mitigate neuroinflammation in conditions like HIV-1-associated neurocognitive disorder (HAND). 
Status:
 ok

-----



> > > CURRENT STATUS FOR IDX 4:  REWORK
Current iter: 1
---
---
---
Reworking the following part:
---
Biochanin A, a phytoestrogen, has been shown to prote

---
---
---
---
---

In [317]:
expl_parts_df

Unnamed: 0,index,sent_text,status,bad_predicates,reworked_part
0,0,"Based on the provided abstracts, the relations...",ok,[],
1,1,Here are the key points from the abstracts tha...,ok,[],
2,2,AGTR1-mediated pathways are implicated in this...,ok,[],
3,3,Angiotensin II Receptor Type 1 (AT1R) and Neur...,ok,[],
4,4,"Angiotensin II, the ligand for AT1R, induces m...",REWORK,"[[AT1R, microglia activation, 2]]","Oxidative stress, microglia activation, and ne..."
5,5,Modulation of AT1R to Mitigate Neuroinflammati...,ok,[],
6,6,They can reduce neuroinflammation and thereby ...,ok,[],
7,7,Endothelin A2 (EPA2) and AT1R: One study sugge...,ok,[],
8,8,Therapeutic agents that increase EPA2 expressi...,ok,[],
9,9,Therapeutic Potential: The inhibition of AGTR1...,REWORK,"[[HAND, pathogenesis, 9]]",miR-155 post-transcriptionally regulates AGTR1...


In [326]:
expl_parts_df[['sent_text', 'reworked_part']].to_csv('hand_agtr1_1st_iter.csv')