# ArXiv keyword Generator with WatsonX

We are interested to generate the keywords used in the search engine of ArXiv to find information


For the following question which keyword is more important to use to search in Arxiv API
question=question = "What are the most effective treatments for cardiovascular diseases, and how can they be made more accessible to patients worldwide?" and save it as variable in python

In [86]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from langchain.vectorstores import FAISS
from langchain.embeddings import TensorflowHubEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain import PromptTemplate
from langchain.chains import LLMChain
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes


load_dotenv()
project_id = os.getenv("PROJECT_ID", None)
credentials = {
        "url": "https://us-south.ml.cloud.ibm.com",
        "apikey": os.getenv("API_KEY", None)
        }    
#this cell should never fail, and will produce no output
import requests

def getBearer(apikey):
    form = {'apikey': apikey, 'grant_type': "urn:ibm:params:oauth:grant-type:apikey"}
    print("About to create bearer")
#    print(form)
    response = requests.post("https://iam.cloud.ibm.com/oidc/token", data = form)
    if response.status_code != 200:
        print("Bad response code retrieving token")
        raise Exception("Failed to get token, invalid status")
    json = response.json()
    if not json:
        print("Invalid/no JSON retrieving token")
        raise Exception("Failed to get token, invalid response")
    print("Bearer retrieved")
    return json.get("access_token")

credentials["token"] = getBearer(credentials["apikey"])
from ibm_watson_machine_learning.foundation_models import Model

About to create bearer
Bearer retrieved


In [87]:
model_ids = [model.name for model in ModelTypes]

In [88]:
parameters = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 50,
    GenParams.MIN_NEW_TOKENS: 1,
    #GenParams.STOP_SEQUENCES: ["\n"],
    
    GenParams.STOP_SEQUENCES: ["<|endoftext|>"],
    GenParams.REPETITION_PENALTY:1,
    
    }

In [89]:
def call_model(model_id):
    # Initialize the Watsonx foundation model
    llm_model= Model(
        model_id=ModelTypes[model_id], 
        params=parameters, 
        credentials=credentials,
        project_id=project_id)
    question= "What are the most effective treatments for cardiovascular diseases, and how can they be made more accessible to patients worldwide?"
    #prompt = "Considering the question '{}' and the topic of interest, please identify the top 5 most relevant keywords for querying the Arxiv API. Provide your response as a Python list.".format(question)
    prompt = f"Considering the following question, generate 3 keywords are most significant to use when searching in the Arxiv API: {question}. Please provide your response as a Python list."
    #prompt = "To effectively retrieve information from the Arxiv API regarding {}, which specific keyword(s) would be most crucial? Please provide your answer as a list in Python format.".format(question)
    #prompt = "For the following question which keyword is more important to use to search in Arxiv API {} , answer only with a python list".format(question)
    result=llm_model.generate(prompt)['results'][0]['generated_text']
    return result

In [33]:
#model_ids

In [34]:
results=call_model('FLAN_T5_XXL')

In [35]:
print(results)

cardiovascular, disease, treatment


In [36]:
for models in model_ids:
    print(models)
    results=call_model(models)
    print(results)
    

FLAN_T5_XXL
cardiovascular, disease, treatment
FLAN_UL2
cardiovascular, disease, treatment
MT0_XXL
cardiovascular, treatment, worldwide
GPT_NEOX


A:

I would use the following:

cardiovascular diseases
cardiovascular
cardiovascular disease

The first one is the most general, the second one is the most specific, and the third one is the most common
MPT_7B_INSTRUCT2

The most effective treatments for cardiovascular diseases are statins, blood pressure medications, and cholesterol-lowering diets. These treatments can be made more accessible to patients worldwide by providing them with more affordable medications, educating patients about the importance of following a
STARCODER


# In[1]:


import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk
LLAMA_2_70B_CHAT


Answer: Here are three keywords that could be used when searching the ArXiv API for articles related to the

In [37]:
def call_model_flan(question):
    # Initialize the Watsonx foundation model
    llm_model= Model(
        model_id=ModelTypes['FLAN_T5_XXL'], 
        params=parameters, 
        credentials=credentials,
        project_id=project_id)
    prompt = f"Considering the following question, generate 3 keywords are most significant to use when searching in the Arxiv API ,provide your response as a Python list: {question}. "
    result=llm_model.generate(prompt)['results'][0]['generated_text']

    # Convert string to a list of individual words
    word_list = result.split(', ')    
    
    return word_list

In [38]:
question= "What are the most effective treatments for cardiovascular diseases, and how can they be made more accessible to patients worldwide?"

In [39]:
result=call_model_flan(question)

In [40]:
result

['cardiovascular', 'disease', 'treatment']

In [41]:
questions = [
    "What are the current therapies with Tinnitus?",
    "What are the most effective treatments for cardiovascular diseases, and how can they be made more accessible to patients worldwide?",
    "How can early detection and prevention strategies for cancer be improved and implemented on a global scale?",
    "What are the key factors contributing to the rise of diabetes, and how can lifestyle interventions be used to combat this epidemic?",
    "How can we develop more effective vaccines and treatments for respiratory infections such as pneumonia and influenza?",
    "What are the most promising advances in Alzheimer's disease research, and how can these findings be translated into clinical practice?",
    "What strategies can be employed to prevent and manage the growing global burden of chronic kidney disease?",
    "How can we improve the understanding of mental health disorders, such as depression and anxiety, to develop more effective therapies?",
    "What are the most significant challenges in eradicating malaria, and how can we overcome them?",
    "How can we improve access to HIV/AIDS treatment and prevention methods in regions with high prevalence rates?",
    "What are the most effective ways to prevent and treat malnutrition in children and adults worldwide?",
    "How can we address the global rise in antibiotic resistance, and what alternative treatments can be developed for bacterial infections?",
    "What are the key factors driving the obesity epidemic, and how can public health interventions help reverse this trend?",
    "How can we improve our understanding of the genetic and environmental factors contributing to autoimmune diseases such as lupus and rheumatoid arthritis?",
    "What are the most promising areas of research for developing new treatments for chronic pain conditions?",
    "How can we better understand and manage the global burden of neurological disorders, such as multiple sclerosis and Parkinson's disease?",
    "What are the most effective strategies for reducing the impact of substance abuse and addiction on individuals and communities?",
    "How can we improve the early detection and treatment of rare genetic disorders, such as cystic fibrosis and muscular dystrophy?",
    "What are the most significant challenges in combating neglected tropical diseases, and how can we address these issues?",
    "How can we develop more effective interventions for preventing and treating age-related diseases, such as osteoporosis and macular degeneration?",
    "What are the most promising areas of research for understanding and treating chronic liver diseases, including hepatitis and cirrhosis?",
]

In [42]:
for question in questions:
    output=call_model_flan(question)
    print(output, " " , question)

['tinnitus', 'therapy', 'tinnitus']   What are the current therapies with Tinnitus?
['cardiovascular', 'disease', 'treatment']   What are the most effective treatments for cardiovascular diseases, and how can they be made more accessible to patients worldwide?
['cancer', 'detection', 'prevention']   How can early detection and prevention strategies for cancer be improved and implemented on a global scale?
['diabetes', 'epidemic', 'lifestyle', 'intervention']   What are the key factors contributing to the rise of diabetes, and how can lifestyle interventions be used to combat this epidemic?
['influenza', 'pneumonia', 'treatment']   How can we develop more effective vaccines and treatments for respiratory infections such as pneumonia and influenza?
["alzheimer's", 'disease', 'practice']   What are the most promising advances in Alzheimer's disease research, and how can these findings be translated into clinical practice?
['chronic', 'disease', 'manage', 'prevent', 'strategy']   What stra

In [43]:
original_list =['tinnitus', 'therapy', 'tinnitus']

In [44]:
unique_list = list(set(original_list))

In [47]:
unique_list

['therapy', 'tinnitus']

In [80]:
# full topic creation
topic = ' '.join(unique_list)

In [81]:
#topic ='tinnitus'

In [82]:
#pip install arxiv

In [83]:
# combinations of single topics
titles = list()
authors = list()
summary = list()
pdf_url = list()

In [92]:
import arxiv
search = arxiv.Search(
  query = topic,
  max_results = 10,
  sort_by = arxiv.SortCriterion.Relevance
   #SubmittedDate #TODO Include it
)

In [93]:
print('Fetching items for token: {}'.format(topic))  
titles = [result.title for result in arxiv.Client().results(search)]
authors = [result.authors for result in arxiv.Client().results(search)]
summary = [result.summary for result in arxiv.Client().results(search)]
entry_id = [result.entry_id for result in arxiv.Client().results(search)]
pdf_url = [result.pdf_url for result in arxiv.Client().results(search)]
categories = [result.categories for result in arxiv.Client().results(search)]
comment = [result.comment for result in arxiv.Client().results(search)]
doi = [result.doi for result in arxiv.Client().results(search)]
published = [result.published for result in arxiv.Client().results(search)]

Fetching items for token: therapy tinnitus


In [94]:
df = pd.DataFrame({
    'title': titles,
    'authors': authors,
    'summary': summary,
    'pdf_url': pdf_url,
    'categories': categories,
    'published': published
})

In [95]:
df

Unnamed: 0,title,authors,summary,pdf_url,categories,published
0,Side-aware Meta-Learning for Cross-Dataset Lis...,"[Yun Li, Zhe Liu, Lina Yao, Molly Lucas, Jessi...","With the development of digital technology, ma...",http://arxiv.org/pdf/2205.03231v1,"[eess.SP, cs.LG]",2022-05-03 03:17:44+00:00
1,Towards a Cognitive Computational Neuroscience...,"[Patrick Krauss, Achim Schilling]",In order to gain a mechanistic understanding o...,http://arxiv.org/pdf/2010.01914v1,[q-bio.NC],2020-10-05 10:55:03+00:00
2,The Complex-Pole Filter Representation (COFRE)...,"[Marco A. Pinto Orellana, Peyman Mirtaheri, Hu...",The complex-pole frequency representation (COF...,http://arxiv.org/pdf/2105.13476v1,"[q-bio.QM, physics.med-ph, stat.ME]",2021-05-13 16:42:00+00:00
3,Disentangled and Side-aware Unsupervised Domai...,"[Yun Li, Zhe Liu, Lina Yao, Jessica J. M. Mona...",EEG-based tinnitus classification is a valuabl...,http://arxiv.org/pdf/2205.03230v2,"[eess.SP, cs.LG]",2022-05-03 05:22:04+00:00
4,Predictive coding and stochastic resonance as ...,"[Achim Schilling, William Sedley, Richard Geru...",How is information processed in the brain duri...,http://arxiv.org/pdf/2204.03354v2,"[q-bio.NC, cs.AI]",2022-04-07 10:47:58+00:00
5,Open(G)PIAS: An open source solution for the c...,"[Richard Gerum, Hinrich Rahlfs, Matthias Streb...",The acoustic startle reflex (ASR) that can be ...,http://arxiv.org/pdf/1804.09667v1,[q-bio.NC],2018-04-25 16:31:53+00:00
6,UNITI Mobile -- EMI-Apps for a Large-Scale Eur...,"[Carsten Vogel, Johannes Schobel, Winfried Sch...",More and more observational studies exploit th...,http://arxiv.org/pdf/2107.14029v1,"[cs.OH, D.2.13; J.3; J.4; H.4.0]",2021-07-22 20:31:00+00:00
7,Challenges in constructing genetic instruments...,"[B. A. Ference, G. Davey Smith, M. V. Holmes, ...",The genes that encode the targets of most ther...,http://arxiv.org/pdf/2007.13115v1,[q-bio.QM],2020-07-26 12:22:16+00:00
8,In Silico Implementation of Evolutionary Parad...,"[Branislav Brutovsky, Denis Horvath]",In here presented in silico study we suggest a...,http://arxiv.org/pdf/1811.06262v3,[q-bio.PE],2018-11-15 10:00:11+00:00
9,Stochastic resonance controlled upregulation o...,"[Patrick Krauss, Konstantin Tziridis, Achim Sc...",Subjective tinnitus (ST) is generally assumed ...,http://arxiv.org/pdf/1603.04721v1,"[q-bio.QM, q-bio.NC]",2016-03-15 15:27:19+00:00


In [99]:
df['pdf_url'].values.tolist()

['http://arxiv.org/pdf/2205.03231v1',
 'http://arxiv.org/pdf/2010.01914v1',
 'http://arxiv.org/pdf/2105.13476v1',
 'http://arxiv.org/pdf/2205.03230v2',
 'http://arxiv.org/pdf/2204.03354v2',
 'http://arxiv.org/pdf/1804.09667v1',
 'http://arxiv.org/pdf/2107.14029v1',
 'http://arxiv.org/pdf/2007.13115v1',
 'http://arxiv.org/pdf/1811.06262v3',
 'http://arxiv.org/pdf/1603.04721v1']

In [105]:
import requests
import os
import tempfile

def download_pdf(url, filename):
    response = requests.get(url)
    with open(filename, 'wb') as file:
        file.write(response.content)

def download_pdf_files(url_list):
    temp_dir = tempfile.gettempdir()  # Get the temporary directory path
    downloaded_files = []  # List to store downloaded file paths
    for i, url in enumerate(url_list):
        filename = os.path.join(temp_dir, f'file_{i+1}.pdf')  # Set the absolute path in the temporary directory
        download_pdf(url, filename)
        downloaded_files.append(f'file_{i+1}.pdf')  # Append the file name to the list without the path
        print(f'Downloaded: {filename}')
    
    return downloaded_files  # Return the list of downloaded file names

def delete_files_in_temp():
    temp_dir = tempfile.gettempdir()  # Get the temporary directory path
    for file in os.listdir(temp_dir):
        file_path = os.path.join(temp_dir, file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
                print(f"Deleted: {file_path}")
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")

# List of PDF URLs
url_list = [
    'http://arxiv.org/pdf/2205.03231v1',
    'http://arxiv.org/pdf/2010.01914v1',
    'http://arxiv.org/pdf/2105.13476v1',
    'http://arxiv.org/pdf/2205.03230v2',
    'http://arxiv.org/pdf/2204.03354v2',
    'http://arxiv.org/pdf/1804.09667v1',
    'http://arxiv.org/pdf/2107.14029v1',
    'http://arxiv.org/pdf/2007.13115v1',
    'http://arxiv.org/pdf/1811.06262v3',
    'http://arxiv.org/pdf/1603.04721v1'
]




In [111]:
downloaded_files = download_pdf_files(url_list)


 

Downloaded: C:\Users\rusla\AppData\Local\Temp\file_1.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_2.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_3.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_4.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_5.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_6.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_7.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_8.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_9.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_10.pdf


In [109]:
print("List of downloaded files:")
for file_name in downloaded_files:
    print(file_name)
    
    
  

List of downloaded files:
C:\Users\rusla\AppData\Local\Temp\file_1.pdf
C:\Users\rusla\AppData\Local\Temp\file_2.pdf
C:\Users\rusla\AppData\Local\Temp\file_3.pdf
C:\Users\rusla\AppData\Local\Temp\file_4.pdf
C:\Users\rusla\AppData\Local\Temp\file_5.pdf
C:\Users\rusla\AppData\Local\Temp\file_6.pdf
C:\Users\rusla\AppData\Local\Temp\file_7.pdf
C:\Users\rusla\AppData\Local\Temp\file_8.pdf
C:\Users\rusla\AppData\Local\Temp\file_9.pdf
C:\Users\rusla\AppData\Local\Temp\file_10.pdf


In [112]:
downloaded_files

['C:\\Users\\rusla\\AppData\\Local\\Temp\\file_1.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_2.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_3.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_4.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_5.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_6.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_7.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_8.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_9.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_10.pdf']

In [113]:
#delete_files_in_temp()
