# Environment test for WatsonX.ai  

In [1]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from langchain.vectorstores import FAISS
from langchain.embeddings import TensorflowHubEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import pandas as pd
parameters = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 200,
    GenParams.MIN_NEW_TOKENS: 0,
    GenParams.STOP_SEQUENCES: ["\n"],
    GenParams.REPETITION_PENALTY:1
    }


load_dotenv()
project_id = os.getenv("PROJECT_ID", None)
credentials = {
        "url": "https://us-south.ml.cloud.ibm.com",
        "apikey": os.getenv("API_KEY", None)
        }    
#this cell should never fail, and will produce no output
import requests

def getBearer(apikey):
    form = {'apikey': apikey, 'grant_type': "urn:ibm:params:oauth:grant-type:apikey"}
    print("About to create bearer")
#    print(form)
    response = requests.post("https://iam.cloud.ibm.com/oidc/token", data = form)
    if response.status_code != 200:
        print("Bad response code retrieving token")
        raise Exception("Failed to get token, invalid status")
    json = response.json()
    if not json:
        print("Invalid/no JSON retrieving token")
        raise Exception("Failed to get token, invalid response")
    print("Bearer retrieved")
    return json.get("access_token")

credentials["token"] = getBearer(credentials["apikey"])
from ibm_watson_machine_learning.foundation_models import Model
model_id = ModelTypes.LLAMA_2_70B_CHAT

# Initialize the Watsonx foundation model
llama_model = Model(
    model_id=model_id, 
    params=parameters, 
    credentials=credentials,
    project_id=project_id)


# Function to get text from PDF documents
def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        text += " ".join(page.extract_text() for page in pdf_reader.pages)
    return text

# Function to split text into chunks
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Function to create a vector store
def get_vectorstore(text_chunks):
    url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
    #embeddings = OpenAIEmbeddings()
    #embeddings = HuggingFaceInstructEmbeddings() 
    embeddings  = TensorflowHubEmbeddings(model_url=url)   
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

# Function to create a conversation chain
def get_conversation_chain(vectorstore):
    #llm = ChatOpenAI()
    #llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
    llm=llama_model.to_langchain()
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain

About to create bearer
Bearer retrieved


In [2]:
def call_model_flan(question):
    
    parameters = {
    GenParams.DECODING_METHOD: "greedy",
    GenParams.MAX_NEW_TOKENS: 50,
    GenParams.MIN_NEW_TOKENS: 1,
    #GenParams.STOP_SEQUENCES: ["\n"],
    
    GenParams.STOP_SEQUENCES: ["<|endoftext|>"],
    GenParams.REPETITION_PENALTY:1,
    
    }    
    
    # Initialize the Watsonx foundation model
    llm_model= Model(
        model_id=ModelTypes['FLAN_T5_XXL'], 
        params=parameters, 
        credentials=credentials,
        project_id=project_id)
    prompt = f"Considering the following question, generate 3 keywords are most significant to use when searching in the Arxiv API ,provide your response as a Python list: {question}. "
    result=llm_model.generate(prompt)['results'][0]['generated_text']

    # Convert string to a list of individual words
    word_list = result.split(', ')    
    
    return word_list

In [3]:
question= "What are the current therapies with Tinnitus?"

In [4]:
original_list=call_model_flan(question)

In [5]:
unique_list = list(set(original_list))

In [6]:
unique_list

['tinnitus', 'therapy']

In [7]:
# full topic creation
topic = ' '.join(unique_list)

In [8]:
print("The topic to search is: '{}' ".format(topic))

The topic to search is: 'tinnitus therapy' 


In [9]:
def arxiv_search(topic):
    print("Searching on Arxiv: '{}' ".format(topic))
    # combinations of single topics
    titles = list()
    authors = list()
    summary = list()
    pdf_url = list()
    import arxiv
    search = arxiv.Search(
      query = topic,
      max_results = 10,
      sort_by = arxiv.SortCriterion.Relevance
       #SubmittedDate #TODO Include it
    )
    print('Fetching items for token: {}'.format(topic))  
    titles = [result.title for result in arxiv.Client().results(search)]
    authors = [result.authors for result in arxiv.Client().results(search)]
    summary = [result.summary for result in arxiv.Client().results(search)]
    entry_id = [result.entry_id for result in arxiv.Client().results(search)]
    pdf_url = [result.pdf_url for result in arxiv.Client().results(search)]
    categories = [result.categories for result in arxiv.Client().results(search)]
    comment = [result.comment for result in arxiv.Client().results(search)]
    doi = [result.doi for result in arxiv.Client().results(search)]
    published = [result.published for result in arxiv.Client().results(search)]
    df = pd.DataFrame({
        'title': titles,
        'authors': authors,
        'summary': summary,
        'pdf_url': pdf_url,
        'categories': categories,
        'published': published
    })
    url_list =df['pdf_url'].values.tolist()

    import requests
    import os
    import tempfile

    def download_pdf(url, filename):
        response = requests.get(url)
        with open(filename, 'wb') as file:
            file.write(response.content)

    def download_pdf_files(url_list):
        temp_dir = tempfile.gettempdir()  # Get the temporary directory path
        downloaded_files = []  # List to store downloaded file paths
        for i, url in enumerate(url_list):
            filename = os.path.join(temp_dir, f'file_{i+1}.pdf')  # Set the absolute path in the temporary directory
            download_pdf(url, filename)
            downloaded_files.append(filename)  # Append the file name to the list with the path
            print(f'Downloaded: {filename}')

        return downloaded_files  # Return the list of downloaded file names

    def delete_files_in_temp():
        temp_dir = tempfile.gettempdir()  # Get the temporary directory path
        for file in os.listdir(temp_dir):
            file_path = os.path.join(temp_dir, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                    print(f"Deleted: {file_path}")
            except Exception as e:
                print(f"Failed to delete {file_path}: {e}")


    downloaded_files = download_pdf_files(url_list)
    return downloaded_files

In [10]:
downloaded_files =arxiv_search(topic)

Searching on Arxiv: 'tinnitus therapy' 
Fetching items for token: tinnitus therapy
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_1.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_2.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_3.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_4.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_5.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_6.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_7.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_8.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_9.pdf
Downloaded: C:\Users\rusla\AppData\Local\Temp\file_10.pdf


In [11]:
downloaded_files

['C:\\Users\\rusla\\AppData\\Local\\Temp\\file_1.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_2.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_3.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_4.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_5.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_6.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_7.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_8.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_9.pdf',
 'C:\\Users\\rusla\\AppData\\Local\\Temp\\file_10.pdf']

In [14]:
# get the current working directory
#current_working_directory = os.getcwd()# Path
# Join various path components 
#pdf_path=[os.path.join(current_working_directory, "documents", "example.pdf")]

In [15]:
# Adding the downloaded files to path
pdf_path= downloaded_files

In [16]:
# Get PDF text and split into chunks
raw_text = get_pdf_text(pdf_path)

In [17]:
text_chunks = get_text_chunks(raw_text)

Created a chunk of size 1144, which is longer than the specified 1000
Created a chunk of size 1212, which is longer than the specified 1000
Created a chunk of size 1211, which is longer than the specified 1000


In [18]:
# Create vector store and conversation chain
vectorstore = get_vectorstore(text_chunks)

In [19]:
llm=llama_model.to_langchain()
memory = ConversationBufferMemory(memory_key='chat_history',
                                    return_messages=True)

In [20]:
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    memory=memory)

In [21]:
query = question

In [22]:
print(query)

What are the current therapies with Tinnitus?


In [23]:
prompt={"question": query}

In [24]:
result = conversation_chain(prompt)

In [25]:
result

{'question': 'What are the current therapies with Tinnitus?',
 'chat_history': [HumanMessage(content='What are the current therapies with Tinnitus?', additional_kwargs={}, example=False),
  AIMessage(content='  Unfortunately, there are currently no universally effective clinical methods for the diagnosis and treatment of tinnitus. Researchers have proposed that assessing abnormal neural activity through EEG signals may aid in the diagnosis of tinnitus. In the early stages, evaluation of EEG signals and further diagnosis were usually done by clinical specialists. Some researchers have attempted to use neurofeedback to assist in tinnitus therapy, and all patients claimed tinnitus relief with decreased EEG activity observed.', additional_kwargs={}, example=False)],
 'answer': '  Unfortunately, there are currently no universally effective clinical methods for the diagnosis and treatment of tinnitus. Researchers have proposed that assessing abnormal neural activity through EEG signals may a


To include instructions to the large language model in the python code above, you can use the **parameters** argument when calling the conversation_chain.generate() method. This argument is a dictionary that can contain any additional parameters that you want to pass to the model.

In [26]:
parameters = {"instruction": "Answer the following question using only information from the article. If there is no good answer in the article, say I don't know"}

In [27]:
query = question

In [28]:
prompt={"question": query}

In [29]:
result = conversation_chain(prompt, parameters)


In [30]:
result

{'answer': '  The current therapies for tinnitus include neurofeedback, which was first attempted by Gosepath et al. [26] to assist tinnitus therapy. With the decreased activity of EEG observed, all patients claimed tinnitus relief. The researchers emphasize that the assessment of abnormal neural activity as assessed by EEG signals may aid in the diagnosis of tinnitus. However, there is currently no universally effective clinical method for subjective tinnitus diagnosis and treatment [25].'}

In [76]:
query = "What is the temperature in Genova"
prompt={"question": query}

In [77]:
result = conversation_chain(prompt,parameters)

In [78]:
result

{'answer': "  I don't know.\n"}

In [79]:
memory

ConversationBufferMemory(chat_memory=ChatMessageHistory(messages=[HumanMessage(content='What is the topic about', additional_kwargs={}, example=False), AIMessage(content=' The topic is about a research study on tinnitus analysis using EEG signals, specifically proposing a new method called Side-aware Meta-Siamese-AutoEncoder (SMeta-SAE) for tinnitus diagnosis and side information prediction.', additional_kwargs={}, example=False), HumanMessage(content='What are the current therapies with Tinnitus?', additional_kwargs={}, example=False), AIMessage(content='  According to the provided text, current therapies for tinnitus include neurofeedback, tinnitus retraining therapy, and cognitive behavioral therapy. Additionally, researchers have proposed using the assessment of abnormal neural activity as assessed by EEG signals to aid in the diagnosis of tinnitus.', additional_kwargs={}, example=False), HumanMessage(content='What are the current therapies with Tinnitus?', additional_kwargs={}, ex