In [19]:
%clear -f
import pandas as pd
from sqlalchemy import create_engine, update, Table, MetaData
from dotenv import load_dotenv
import os, json, threading, time 
import google.generativeai as genai
from labels import get_prompt_A, get_prompt_main
from txt_to_df import txt_to_df
from test import get_test_df

# Read secrets from the local .env file and hand the Gemini key to the client library.
load_dotenv(".env")
GEMINI_KEY = os.getenv("GEMINI_KEY")
genai.configure(api_key=GEMINI_KEY)

# Gemini model handle used for all requests.
# Note from the author: the API currently needs a VPN to outside Europe.
model = genai.GenerativeModel("gemini-pro")

# SQLite file used as persistent storage.
engine = create_engine("sqlite:///publications.db")


In [26]:
# NOTE(review): %autoreload needs the autoreload extension to be loaded first
# (%load_ext autoreload); that call is not visible in this notebook.
%autoreload

# Render the prompt with two empty strings to inspect the bare template text.
print(get_prompt_A("",""))


The following is a publication related to Alzheimer's Disease (AD). Please analyze the following title and abstract of the publication and classify it using the categories and their explanations that are listed afterwards.
TITLE: .
ABSTRACT: .
ABSTRACT END
You can ONLY choose from the following curated categories:
Categoriy shortcuts: [A1a, A1b, A1c, A2, A3, A4, A5, A6, A7, A8, A9, A10, AX]
Explanations: 
Genetics (A1a)
This topic includes new variants, genetic and transcriptomics insights such as genome-wide association studies.
APOE (A1b) 
This topic is about apolipoprotein E (APOE). 
Familial AD mutations (A1c) 
This topic is about familial AD mutations, such as APP, presenilin 1/2, BACE
Autophagy/Proteostasis (A2) 
This topic includes endosomal/lysosomal dysfunction, protein uptake/trafficking, general protein aggregation, post-translational modifications, and proteomics. It does not include tau and amyloid-beta, as it belongs to a different topic.
Metabolism and Mitochondrial Dys

In [9]:
# Load the publications from the text export "tst.txt" (presumably into a DataFrame).
# NOTE(review): the following cell rebinds `publications` to get_test_df(),
# so this result is discarded when the notebook is run top to bottom.
publications = txt_to_df("tst.txt")


(484, 4)

In [4]:
publications = get_test_df()

# Empty result columns for the two-category labelling scheme (None is stored as NULL).
publications["category1"] = None
publications["reasoning1"] = None
publications["category2"] = None
publications["reasoning2"] = None

# get_category() writes to `subcategory` / `subcategory_reasoning`, and the
# "not yet labelled" filter reads `subcategory`. The table below is recreated
# from this frame, so these columns must exist or the UPDATE fails on a fresh
# run. Existing values are kept if get_test_df() already provides them.
for result_column in ("subcategory", "subcategory_reasoning"):
    if result_column not in publications.columns:
        publications[result_column] = None

# Write the whole frame to SQLite (replacing any previous table) and reflect
# the table so it can be targeted by UPDATE statements.
publications.to_sql('publications', con=engine, if_exists='replace', index=False)
publications_table = Table('publications', MetaData(), autoload_with=engine)
publications.shape

(18, 4)

In [5]:
def get_category(i):
    """Classify one publication with Gemini and store the result in SQLite.

    Reads the text at ``publications["doc"][i]``, builds a prompt with
    ``get_prompt_A("", doc)``, sends it to ``model`` and parses the reply text
    as JSON. The reply's ``most_relevant_category`` and ``clear_reasoning``
    values are written to the ``subcategory`` and ``subcategory_reasoning``
    columns of every row of the ``publications`` table whose ``doc`` equals
    that text.

    Args:
        i: Index label of the row in the module-level ``publications`` frame.

    Returns:
        None. Any exception (request failure, non-JSON reply, missing key,
        database error) is caught and reported on stdout rather than raised.
    """
    try:
        print(i)
        # Fetch the text and ask Gemini for a classification.
        doc = publications["doc"][i]
        prompt = get_prompt_A("", doc)
        res = model.generate_content(prompt)

        # The reply is expected to be a JSON object with these two keys.
        json_res = json.loads(res.text)
        category = json_res["most_relevant_category"]
        reasoning = json_res["clear_reasoning"]
        print(i, category)

        # Persist the label; rows are matched on the document text itself.
        stmt = (
            update(publications_table)
            .where(publications_table.c.doc == doc)
            .values(subcategory=category, subcategory_reasoning=reasoning)
        )
        # engine.begin() commits on success and rolls back if execute() raises.
        with engine.begin() as conn:
            conn.execute(stmt)
    except Exception as e:
        # One print call so the index and its error stay together when several
        # worker threads report at the same time.
        print(f"{i} returned with err: {type(e).__name__}: {e}")

#test function
#get_category(1)


In [13]:
# Half-open range [start_idx, end_idx) of row indices to label in this run.
start_idx, end_idx = 0, 2

if __name__ == '__main__':
    workers = []
    for row in range(start_idx, end_idx):
        # One thread per publication; each one calls get_category(row).
        worker = threading.Thread(target=get_category, args=(row,))
        worker.start()
        workers.append(worker)
        # Pause between launches: current rate limit is 60 requests/minute.
        time.sleep(1.01)
        if row % 10 == 0:
            print("# running threads: " + str(threading.active_count()))

    # Block until every request has finished.
    for worker in workers:
        worker.join()


0
# running threads: 8
1
1 A5
0 A5


In [16]:
# Read the complete publications table back from SQLite and write it out as HTML.
query = "SELECT * FROM publications"
publications = pd.read_sql_query(sql=query, con=engine)
publications.to_html("new.html")
#publications.to_csv('new.csv', index=False, sep=",")

In [12]:
# # #remove hallucinated labels
# labels = ["S", "OP", "TAM", "PRM", "PFU", "GAFM", "BAWM", "PED", "MISC"]
# # # Replace values not in the list with null
# proposals['category'] = proposals['category'].apply(lambda x: x if x in labels else None)
# proposals.to_sql('proposals', con=engine, if_exists='replace', index=False)

# Keep only the publications without a subcategory yet. reset_index() keeps the
# old row labels in a new "index" column.
publications = publications.loc[publications["subcategory"].isna()].reset_index()
print(publications.shape)
publications.head()

(2, 5)


Unnamed: 0,index,doc,correct_sublabel,subcategory,subcategory_reasoning
0,6,"The formation of amyloid-ß peptides (Aß), that...",A5,,
1,12,While soluble forms of amyloid-β (Aβ) and Tau ...,A5,,


In [37]:
%autoreload
# Manual spot check: send one document through the prompt and print the raw reply.
# NOTE(review): row label 7 is hard-coded; it only exists when `publications`
# holds more rows than the 2-row filtered subset shown earlier - confirm the
# intended execution order.
doc = publications["doc"][7]
# Disabled alternative inputs (separate title/body columns):
#title = publications["title"][i]
#body = publications["body"][i]
prompt = get_prompt_A("", doc)
#print(prompt)
res = model.generate_content(prompt)
print(res.text)

{
"categories": [
{"category": "A4", "reason": "This publication is clearly related to the cholinergic system, which is part of the Neural Excitability, Synapses, and Brain Connectivity topic."},
{"category": "A2", "reason": "The publication is also related to post-translational modifications of AChE, which is part of the Autophagy/Proteostasis topic."}
]
}


In [29]:
# Print the raw text of the most recent model reply held in `res`.
print(res.text)

{
"categories": [
{"category": "A4", "clear_reasoning": "The abstract investigates dopamine transporter levels in relation to glucose metabolism, which relates to neural excitability, synapses, and brain connectivity."},
{"category": "A5", "clear_reasoning": "The study focuses on the dopamine transporter which is related to amyloid pathology."},
{"category": "A7", "clear_reasoning": "The abstract briefly mentions the impact of striatal dopamine transporter levels which are related to the immune system and glial cells."},
{"category": "A10", "clear_reasoning": "The study focuses on CSF DAT as a biomarker, which is related to risk factors associated with Alzheimer's disease."}
]
}


In [32]:
# Parse the raw reply as JSON and inspect the first entry of its "categories" list.
# NOTE(review): this reply shape (a "categories" list of objects) differs from the
# flat "most_relevant_category" / "clear_reasoning" keys that get_category() reads.
json_res = json.loads(res.text)#.replace("'", '"'))
json_res["categories"][0]
# reasoning = json_res["clear_reasoning"]

{'category': 'A4',
 'clear_reasoning': 'The abstract investigates dopamine transporter levels in relation to glucose metabolism, which relates to neural excitability, synapses, and brain connectivity.'}