In [147]:
%clear -f
import pandas as pd
from sqlalchemy import create_engine, update, Table, MetaData
from dotenv import load_dotenv
import os, json, threading, time 
import google.generativeai as genai
from labels_all import get_prompt
from txt_to_df import txt_to_df
from test import get_test_df

# Load variables from the local .env file, then read the API key from the environment.
load_dotenv(".env")
GEMINI_KEY = os.getenv("GEMINI_KEY")
genai.configure(api_key=GEMINI_KEY)

# Gemini API -- currently needs a VPN to outside Europe
model = genai.GenerativeModel("gemini-pro")
generation_config = genai.types.GenerationConfig(
    temperature=0,  # default: 1.0
)

# SQLite for persistent storage
engine = create_engine("sqlite:///publications.db")


[H[2J

In [148]:
%autoreload
# Parse the input file and keep only the first 20 rows for this run.
publications = txt_to_df("tst.txt")
publications = publications[:20]
publications.to_html("new1.html")
# Preview goes last: only a cell's final expression is rendered, so a
# mid-cell `.head()` would be silently discarded.
publications.head()

In [149]:
# Alternative input for quick experiments: publications = get_test_df()

# Add empty result columns for the two (category, reasoning) pairs that
# get_category() later fills in.
for result_column in ("category1", "reasoning1", "category2", "reasoning2"):
    publications[result_column] = None

# Write whole df to SQLite (replacing any existing table), then load the
# table definition from the database so rows can be updated individually.
publications.to_sql(name='publications', con=engine, if_exists='replace', index=False)
publications_table = Table('publications', MetaData(), autoload_with=engine)
publications.shape

(20, 9)

In [151]:
def get_category(i):
    """Classify publication ``i`` with Gemini and store the result in SQLite.

    Reads the ``Index``, ``Title`` and ``Abstract`` values at row label ``i`` of
    the module-level ``publications`` frame, sends the prompt built by
    ``get_prompt`` to ``model`` and writes up to two (category, reasoning) pairs
    from the JSON reply into the row of ``publications_table`` whose ``Index``
    matches. When the reply holds a single entry, the second pair is stored as
    NULL.

    Every failure (API error, malformed JSON, missing keys, empty category list,
    database error) is caught and reported on stdout so that one bad row does
    not kill its worker thread; the database row is then left unchanged.

    Args:
        i: Row label in ``publications``.

    Returns:
        None.
    """
    try:
        print(i)
        # get data from gemini api
        index = publications["Index"][i]
        title = publications["Title"][i]
        abstract = publications["Abstract"][i]
        prompt = get_prompt(title, abstract)
        res = model.generate_content(prompt, generation_config=generation_config)
        categories = json.loads(res.text)["categories"]
        if not categories:
            # Name the problem instead of surfacing "list index out of range".
            raise ValueError("model reply contains no categories")

        category1 = categories[0]["category"]
        reasoning1 = categories[0]["clear_reasoning"]
        if len(categories) > 1:
            category2 = categories[1]["category"]
            reasoning2 = categories[1]["clear_reasoning"]
        else:
            category2 = None
            reasoning2 = None
        print(i, category1, category2)

        # save in database
        stmt = (
            update(publications_table)
            .where(publications_table.c.Index == index)
            .values(
                category1=category1,
                reasoning1=reasoning1,
                category2=category2,
                reasoning2=reasoning2,
            )
        )
        # engine.begin() commits on success and rolls back on error.
        with engine.begin() as conn:
            conn.execute(stmt)
    except Exception as e:
        # Single print call so the message stays on one line when several
        # threads report at the same time.
        print(f"{i} returned with err: {type(e).__name__}: {e}")

#test function
#get_category(1)


1
1 A7 C3


In [104]:
start_idx = 0
end_idx = 3

if __name__ == '__main__':
    # One worker thread per publication row in [start_idx, end_idx).
    workers = []
    for i in range(start_idx, end_idx):
        worker = threading.Thread(target=get_category, args=(i,))
        workers.append(worker)
        worker.start()
        # Stagger the launches: current rate limit is 60 requests/minute.
        time.sleep(1.01)
        if not i % 10:
            print(f"# running threads: {threading.active_count()}")

    # Wait until every request has finished.
    for worker in workers:
        worker.join()


0
# running threads: 8
1
2
1 A2 A5
Expecting value: line 4 column 1 (char 224)
2returned with err
0 A6 A7


In [119]:
# Read the stored results back from SQLite and export them as HTML.
query = "SELECT * FROM publications"
publications = pd.read_sql_query(query, engine)
#publications.to_csv('new.csv', index=False, sep=",")
publications.to_html("new1.html")
# Preview goes last: only a cell's final expression is rendered, so a bare
# `publications` in the middle of the cell would be silently discarded.
publications.head()

In [103]:
# # #remove hallucinated labels
# labels = ["S", "OP", "TAM", "PRM", "PFU", "GAFM", "BAWM", "PED", "MISC"]
# # # Replace values not in the list with null
# proposals['category'] = proposals['category'].apply(lambda x: x if x in labels else None)
# proposals.to_sql('proposals', con=engine, if_exists='replace', index=False)

# Keep only the publications that have not been assigned a label yet.
# Note: this overwrites `publications`; reset_index() keeps the previous index
# as an extra "index" column.
unlabelled = publications["category1"].isna()
publications = publications[unlabelled].reset_index()
print(publications.shape)
publications.head()

(3, 7)


Unnamed: 0,index,doc,correct_sublabel,category1,reasoning1,category2,reasoning2
0,3,Tau protein aggregates are a major driver of n...,A1b,,,,
1,8,"A Disintegrin and Metalloprotease 10, also kno...",A2,,,,
2,15,BACKGROUND: Exposure to air pollution has been...,A10,,,,


In [115]:
%autoreload
# Ad-hoc check: run the prompt for the document at row label 4 and print the raw reply.
doc = publications["doc"][4]
prompt = get_prompt("", doc)  # empty title: only the document text is supplied
res = model.generate_content(
    prompt,
    generation_config=generation_config,
)
print(res.text)

{
"categories": [
{"category": "A1a", "clear_reasoning": "The publication uses summary statistics from genome-wide association studies to assess genetic covariance and local genetic correlation between Parkinson's and Alzheimer's disease."},
{"category": "A7", "clear_reasoning": "The publication mentions enrichment of heritability in genomic regions relevant to microglia in both Parkinson's and Alzheimer's disease."}
]
}


In [69]:
# Parse the raw reply and show the reasoning text of its first entry.
# Note: despite its name, `category1` holds the "clear_reasoning" string here,
# not a category code.
json_res = json.loads(res.text)
first_entry = json_res["categories"][0]
category1 = first_entry["clear_reasoning"]
category1

'The abstract mentions microglia and the immune system, which is related to the immune system and glial cells.'

In [139]:
import re

def find_doi(text):
    """Return the first DOI that follows a ``doi:`` marker in ``text``.

    The marker is matched case-insensitively and may be followed by optional
    whitespace, so ``doi:10.1000/182`` and ``DOI: 10.1000/182`` both work. The
    DOI is the run of non-whitespace characters after the marker, with trailing
    sentence punctuation (``.``, ``,``, ``;``) removed.

    Args:
        text: String to search.

    Returns:
        The DOI as a string, or None when no marker followed by a value exists.
    """
    # \s* skips a space after the colon; \S+ requires at least one DOI character,
    # so a bare "doi:" no longer yields an empty string.
    match = re.search(r'doi:\s*(\S+)', text, re.IGNORECASE)
    if match is None:
        return None

    # Drop punctuation that belongs to the surrounding sentence, not the DOI.
    doi = match.group(1).rstrip('.,;')
    return doi or None

# Example usage (find_doi returns only the first DOI in the text)
text = "Here is a DOI doi:10.1000/182 and another DOI doi:10.1000/xyz in the same text."
found_doi = find_doi(text)
print(found_doi)

10.1000/182
