In [1]:
import os
import pandas as pd
from tqdm import tqdm
import glob  # Import glob module

# Define the project directory

data_dir = os.path.join('data', 'PubMed_Format')

# Define articles dictionary keys
articles_dict_keys = ['PMID', 'OWN', 'STAT', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'DP', 'TI', 'PG', 'LID', 'AB', 'FAU', 'AU', 'AD', 'LA', 'GR', 'PT', 'DEP', 'PL', 'TA', 'JT', 'JID', 'SB', 'MH', 'PMC', 'MID', 'COIS', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST', 'SO', 'AUID', 'CIN', 'CI', 'OTO', 'OT']
articles_dict = {key: [] for key in articles_dict_keys}

# Pattern to match all relevant files
file_pattern = os.path.join(data_dir, 'raw_pubmed_intelligence_abstracts_*.txt')

# Use glob to find all files matching the pattern
for file_path in glob.glob(file_pattern):
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read()
    
    articles = text.split("\n\n")

    for article in tqdm(articles, desc=f"Processing {os.path.basename(file_path)}"):
        lines = article.split("\n")
        dictionary = {key: [] for key in articles_dict_keys}  # Use lists to handle multiple entries
        current_key = None  # Track the current key for lines without a new key

        for line in lines:
            if len(line) > 4 and line[4] == '-':
                key, value = line.split("-", 1)
                key = key.strip()
                value = value.strip()
                if key in dictionary:
                    dictionary[key].append(value)  # Append new values to the list
                    current_key = key
            elif current_key:  # Append continuation lines to the last value of the current key
                dictionary[current_key][-1] += ' ' + line.strip()

        # Flatten lists with a single value for compatibility
        for key in articles_dict_keys:
            articles_dict[key].append(' '.join(dictionary[key]) if dictionary[key] else None)

# Create DataFrame from the aggregated data
df = pd.DataFrame(articles_dict, columns=articles_dict_keys)

Processing raw_pubmed_intelligence_abstracts_2013.txt: 100%|██████████| 1660/1660 [00:00<00:00, 4429.23it/s]
Processing raw_pubmed_intelligence_abstracts_2014.txt: 100%|██████████| 1798/1798 [00:00<00:00, 3562.40it/s]
Processing raw_pubmed_intelligence_abstracts_2015.txt: 100%|██████████| 1974/1974 [00:00<00:00, 4316.69it/s]
Processing raw_pubmed_intelligence_abstracts_2016.txt: 100%|██████████| 2011/2011 [00:00<00:00, 3779.49it/s]
Processing raw_pubmed_intelligence_abstracts_2017.txt: 100%|██████████| 2359/2359 [00:00<00:00, 3869.60it/s]


In [5]:
project_dir

'c:\\Users\\ozgeb\\OneDrive\\Masaüstü'

In [4]:
df.head(5)

Unnamed: 0,PMID,OWN,STAT,DCOM,LR,IS,VI,IP,DP,TI,...,CRDT,PHST,AID,PST,SO,AUID,CIN,CI,OTO,OT
0,24278995,,Publisher,,,,,,1993,CASK Disorders.,...,2020/05/21 00:00,,NBK169825 [bookaccession],,,,,"Copyright (c) 1993-2024, University of Washing...",NLM,Intellectual Disability and Microcephaly with ...
1,25529590,NLM,MEDLINE,20151224.0,20141222.0,2327-9109 (Electronic) 2327-9095 (Linking),22.0,1.0,2015,WAIS-IV administration errors: effects of alte...,...,2014/12/23 06:00,2014/12/23 06:00 [entrez] 2014/12/23 06:00 [pu...,10.1080/23279095.2013.828726 [doi],ppublish,Appl Neuropsychol Adult. 2015;22(1):42-5. doi:...,,,,NOTNLM,Block Design Symbol Search WAIS-IV intelligence
2,25529585,NLM,MEDLINE,20151224.0,20141222.0,2327-9109 (Electronic) 2327-9095 (Linking),22.0,1.0,2015,Apparently abnormal Wechsler Memory Scale inde...,...,2014/12/23 06:00,2014/12/23 06:00 [entrez] 2014/12/23 06:00 [pu...,10.1080/23279095.2013.816702 [doi],ppublish,Appl Neuropsychol Adult. 2015;22(1):1-6. doi: ...,,,,NOTNLM,General Ability Index WAIS-IV WMS-IV
3,25284715,NLM,MEDLINE,20150610.0,20211021.0,2044-835X (Electronic) 0261-510X (Print) 0261-...,32.0,1.0,2014 Mar,A longitudinal intergenerational analysis of e...,...,2014/10/07 06:00,2013/05/09 00:00 [received] 2013/07/23 00:00 [...,10.1111/bjdp.12021 [doi],ppublish,Br J Dev Psychol. 2014 Mar;32(1):50-64. doi: 1...,,,(c) 2013 The British Psychological Society.,NOTNLM,early childhood executive function individual ...
4,25265311,NLM,MEDLINE,20150715.0,20211021.0,2327-9109 (Electronic) 2327-9095 (Print) 2327-...,21.0,4.0,2014,The design organization test: further demonstr...,...,2014/09/30 06:00,2014/09/30 06:00 [entrez] 2014/09/30 06:00 [pu...,811671 [pii] 10.1080/23279095.2013.811671 [doi],ppublish,Appl Neuropsychol Adult. 2014;21(4):297-309. d...,,,,NOTNLM,IQ neuropsychology reliability validity visuos...


In [5]:
# print the total number of rows and columns
print(f"Total number of rows: {df.shape[0]} x {df.shape[1]}")

Total number of rows: 9802 x 41


In [2]:
# Convert PMID to string
df['PMID'] = df['PMID'].astype(str)


# convert Titles, Abstracts to string
df['TI'] = df['TI'].astype(str)
df['AB'] = df['AB'].astype(str)

# Full Author Names and Abbreviated Author Names to string
df['FAU'] = df['FAU'].astype(str)
df['AU'] = df['AU'].astype(str)

# MeSH Terms and Other Terms (OT) to string
df['MH'] = df['MH'].astype(str)
df['OT'] = df['OT'].astype(str)

# Place of Publication to string
df['PL'] = df['PL'].astype(str) 

# Journal Title to string
df['JT'] = df['JT'].astype(str)

# PubMed Central ID
df['PMC'] = df['PMC'].astype(str)

# Convert DP, EDAT, and MHDA to datetime64
df['DP'] = pd.to_datetime(df['DP'], errors='coerce')  # Coerce errors in case of invalid dates
df['EDAT'] = pd.to_datetime(df['EDAT'], format='%Y/%m/%d', errors='coerce')
df['MHDA'] = pd.to_datetime(df['MHDA'], format='%Y/%m/%d', errors='coerce')

In [3]:
# List of important keys based on the explanation provided
important_keys = [
    'PMID', 'TI', 'AB', 'FAU', 'AU', 'DP', 'MH','OT','PL', 'JT', 'PMC', 
    'EDAT', 'MHDA', 'STAT'
]
# Select only the columns with the important keys
df_important = df[important_keys]
df_important.head(5)

Unnamed: 0,PMID,TI,AB,FAU,AU,DP,MH,OT,PL,JT,PMC,EDAT,MHDA,STAT
0,24278995,CASK Disorders.,CLINICAL CHARACTERISTICS: CASK disorders inclu...,"Moog, Ute Kutsche, Kerstin",Moog U Kutsche K,1993-01-01,,Intellectual Disability and Microcephaly with ...,Seattle (WA),,,NaT,NaT,Publisher
1,25529590,WAIS-IV administration errors: effects of alte...,This study utilized a sample of 50 college stu...,"Ryan, Joseph J Swopes-Willhite, Nicole Frankli...",Ryan JJ Swopes-Willhite N Franklin C Kreiner DS,2015-01-01,Adolescent Adult Female Humans *Intelligence M...,Block Design Symbol Search WAIS-IV intelligence,United States,Applied neuropsychology. Adult,,NaT,NaT,MEDLINE
2,25529585,Apparently abnormal Wechsler Memory Scale inde...,Interpretation of the Wechsler Memory Scale-Fo...,"Carrasco, Roman Marcus Grups, Josefine Evans, ...",Carrasco RM Grups J Evans B Simco E Mittenberg W,2015-01-01,Humans Memory/*physiology Monte Carlo Method *...,General Ability Index WAIS-IV WMS-IV,United States,Applied neuropsychology. Adult,,NaT,NaT,MEDLINE
3,25284715,A longitudinal intergenerational analysis of e...,Despite the importance of executive function (...,"Cuevas, Kimberly Deater-Deckard, Kirby Kim-Spo...",Cuevas K Deater-Deckard K Kim-Spoon J Wang Z M...,NaT,"Adult *Child Development Child, Preschool *Exe...",early childhood executive function individual ...,England,The British journal of developmental psychology,PMC4187223,NaT,NaT,MEDLINE
4,25265311,The design organization test: further demonstr...,Neuropsychological assessments are frequently ...,"Killgore, William D S Gogel, Hannah",Killgore WD Gogel H,2014-01-01,Adolescent Adult Female Humans Intelligence/*p...,IQ neuropsychology reliability validity visuos...,United States,Applied neuropsychology. Adult,PMC4235486,NaT,NaT,MEDLINE


In [4]:
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import OllamaEmbeddings, GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
loader = DataFrameLoader(df_important, page_content_column="AB")
docs = loader.load()
docs

[Document(page_content='CLINICAL CHARACTERISTICS: CASK disorders include a spectrum of phenotypes in both females and males. Two main types of clinical presentation are seen: Microcephaly with pontine and cerebellar hypoplasia (MICPCH), generally associated with pathogenic loss-of-function variants in CASK. X-linked intellectual disability (XLID) with or without nystagmus, generally associated with hypomorphic CASK pathogenic variants. MICPCH is typically seen in females with moderate-to-severe intellectual disability, progressive microcephaly with or without ophthalmologic anomalies, and sensorineural hearing loss. Most are able to sit independently; 20%-25% attain the ability to walk; language is nearly absent in most. Neurologic features may include axial hypotonia, hypertonia/spasticity of the extremities, and dystonia or other movement disorders. Nearly 40% have seizures by age ten years. Behaviors may include sleep disturbances, hand stereotypies, and self biting. MICPCH in males

In [5]:
embeddings = GPT4AllEmbeddings()

In [6]:
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
print(f"Number of documents: {len(documents)}")
# Number of documents: 62158

Number of documents: 9846


In [7]:
# Save the index to disk
vector = FAISS.from_documents(documents, embeddings)

In [8]:
vector.save_local(os.path.join('vectorStore'))

In [9]:
db = FAISS.load_local(folder_path = "vectorStore", embeddings = embeddings)

In [10]:
llm = Ollama(model="llama2")

In [11]:
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context and metadata:

<context>
{context}
</context>
<metadata>
{metadata}
<metadata>
Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)

In [12]:
from langchain.chains import create_retrieval_chain

retriever = vector.as_retriever()

In [14]:
query = "what are the symptoms of covid?"
document = retriever.get_relevant_documents(query)
context = document[0]
metadata = document[0].metadata
print(context)
metadata



{'PMID': '26928307',
 'TI': 'Forecasting Chikungunya spread in the Americas via data-driven empirical approaches.',
 'FAU': 'Escobar, Luis E Qiao, Huijie Peterson, A Townsend',
 'AU': 'Escobar LE Qiao H Peterson AT',
 'DP': NaT,
 'MH': 'Americas/epidemiology Chikungunya Fever/*epidemiology/*transmission *Epidemiologic Methods *Forecasting Humans Models, Statistical Prevalence Socioeconomic Factors',
 'OT': 'None',
 'PL': 'England',
 'JT': 'Parasites & vectors',
 'PMC': 'PMC4772319',
 'EDAT': NaT,
 'MHDA': NaT,
 'STAT': 'MEDLINE'}

In [15]:
from langchain_core.documents import Document

document_chain.invoke({
    "input": query,
    "context": [context],
    "metadata": [metadata]
})

'Based on the provided context and metadata, there is no information available about the symptoms of COVID-19. The article focuses on forecasting the spread of Chikungunya virus in the Americas, and does not provide information on COVID-19 symptoms. Therefore, I cannot answer your question based on the provided data.'

In [16]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# First we need a prompt that we can pass into an LLM to generate this search query

prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation")
])
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

In [17]:
from langchain_core.messages import HumanMessage, AIMessage

chat_history = [HumanMessage(content="Can LangSmith help test my LLM applications?"), AIMessage(content="Yes!")]
retriever_chain.invoke({
    "chat_history": chat_history,
    "input": "Tell me who?"
})

[Document(page_content='Intelligent Patent Analysis Tool (IPAT) is an online data retrieval tool, operated based on text mining algorithm to extract specific patent information in a predetermined pattern into an Excel sheet. The software is designed and developed to retrieve and analyze technology information from multiple patent documents and generate various patent landscape graphs and charts. The software is C# coded in visual studio 2010, which extracts the publicly available patent information from the web pages like Google Patent and simultaneously study the various technology trends based on user-defined parameters. In other words, IPAT combined with the manual categorization will act as an excellent technology assessment tool in competitive intelligence and due diligence for predicting the future R&D forecast.', metadata={'PMID': '26452016', 'TI': 'IPAT: a freely accessible software tool for analyzing multiple patent documents with inbuilt landscape visualizer.', 'FAU': 'Ajay, 

In [128]:
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=Ollama(), chain_type="stuff", retriever=vector.as_retriever())
result = qa.invoke(query)
print(result)

{'query': 'what are the symptoms of covid?', 'result': "Based on the provided context, the symptoms of COVID-19 can vary from person to person and can include:\n\n* Fever\n* Cough\n* Shortness of breath or difficulty breathing\n* Fatigue or weakness\n* Headache\n* Sore throat\n* Runny nose or stuffy nose\n* Body aches or muscle pains\n* Diarrhea or nausea and vomiting (in some cases)\n\nIt's important to note that not everyone who is infected with COVID-19 will experience all of these symptoms, and some people may experience different symptoms altogether. Additionally, the severity of the symptoms can vary from person to person, with some individuals experiencing mild symptoms while others may experience more severe symptoms.\n\nIt's also worth noting that COVID-19 can cause complications, such as pneumonia or acute respiratory distress syndrome (ARDS), which can be serious and even life-threatening in some cases. If you suspect that you or someone else may have COVID-19, it's importan

In [19]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.memory import ConversationSummaryMemory
from langchain.memory import CombinedMemory
from langchain import PromptTemplate
from langchain.chains import ConversationChain
import Tools
from langchain import VectorDBQA

# create memory
memory = ConversationBufferMemory(
    return_messages=True, memory_key="chat_history")

chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory = memory)

In [None]:
query = ""
while query != 'done':
    query = input("Enter Your Query: ")
    print(chain({"question": query})["answer"])
    print('#'*100)

In [20]:
conv_memory = ConversationBufferMemory(memory_key="chat_history_lines", input_key="input")
summary_memory = ConversationSummaryMemory(llm = llm, input_key="input")
memory = CombinedMemory(memories=[conv_memory, summary_memory])

In [21]:
TEMPLATE = """ The AI provides lots of specific details for its context. If the AI does not know the answer. 
                Summary of comversation:
                {history}
                Current conversation:
                {chat_history_lines}
                Human:{input}
                AI: """

PROMPT = PromptTemplate(input_variables=["history", "input", "chat_history_lines"], template=TEMPLATE)

In [22]:
conversation = ConversationChain(llm=llm, verbose=True, memory=memory, prompt=PROMPT)
vectorDatabase = VectorDBQA.from_chain_type(llm = llm, chain_type="stuff", vectorstore = db, verbose = True)



In [23]:
import huggingface_hub
from langchain.agents import initialize_agent
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.agents import AgentType
from langchain import PromptTemplate, HuggingFaceHub

In [24]:
from langchain_core.tools import Tool

tools = [
    Tool(name ='FAISS', func=vectorDatabase.run, description="Use this to search from FAISS if answer is not available from the summary of conversation"), 
    Tool(name ='ConversationHistory', func=conversation.run, description="Use this when answer is available from the summary of conversation"),
    llm
]

In [None]:

def main():
    agent = initialize_agent(
        tools, 
        llm, 
        agent="zero-shot-react-description", 
        verbose=True
    )

    while True:
        agent.run(input = input())

In [26]:

from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key='chat_history')

def main():
    agent = initialize_agent(
        tools, 
        llm, 
        agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION, 
        verbose=True,
        memory=memory
    )

    agent.run('When was google created?')