In [None]:
'''
pip install -U ipywidgets
pip install pyarrow
'''

# Loading PubMed text files

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import glob  # Import glob module

# Define the project directory
project_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_dir, 'data', 'PubMed_Format')

# Define articles dictionary keys
articles_dict_keys = ['PMID', 'OWN', 'STAT', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'DP', 'TI', 'PG', 'LID', 'AB', 'FAU', 'AU', 'AD', 'LA', 'GR', 'PT', 'DEP', 'PL', 'TA', 'JT', 'JID', 'SB', 'MH', 'PMC', 'MID', 'COIS', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST', 'SO', 'AUID', 'CIN', 'CI', 'OTO', 'OT']
articles_dict = {key: [] for key in articles_dict_keys}

# Pattern to match all relevant files
file_pattern = os.path.join(data_dir, 'raw_pubmed_intelligence_abstracts_*.txt')

# Use glob to find all files matching the pattern
for file_path in glob.glob(file_pattern):
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read()
    
    articles = text.split("\n\n")

    for article in tqdm(articles, desc=f"Processing {os.path.basename(file_path)}"):
        lines = article.split("\n")
        dictionary = {key: [] for key in articles_dict_keys}  # Use lists to handle multiple entries
        current_key = None  # Track the current key for lines without a new key

        for line in lines:
            if len(line) > 4 and line[4] == '-':
                key, value = line.split("-", 1)
                key = key.strip()
                value = value.strip()
                if key in dictionary:
                    dictionary[key].append(value)  # Append new values to the list
                    current_key = key
            elif current_key:  # Append continuation lines to the last value of the current key
                dictionary[current_key][-1] += ' ' + line.strip()

        # Flatten lists with a single value for compatibility
        for key in articles_dict_keys:
            articles_dict[key].append(' '.join(dictionary[key]) if dictionary[key] else None)

# Create DataFrame from the aggregated data
df = pd.DataFrame(articles_dict, columns=articles_dict_keys)

Processing raw_pubmed_intelligence_abstracts_2024.txt: 100%|██████████| 4240/4240 [00:00<00:00, 18103.00it/s]
Processing raw_pubmed_intelligence_abstracts_2018.txt: 100%|██████████| 3341/3341 [00:00<00:00, 18878.47it/s]
Processing raw_pubmed_intelligence_abstracts_2019.txt: 100%|██████████| 4951/4951 [00:00<00:00, 18784.53it/s]
Processing raw_pubmed_intelligence_abstracts_2021.txt: 100%|██████████| 15221/15221 [00:00<00:00, 18181.32it/s]
Processing raw_pubmed_intelligence_abstracts_2020.txt: 100%|██████████| 7875/7875 [00:00<00:00, 18199.60it/s]
Processing raw_pubmed_intelligence_abstracts_2022.txt: 100%|██████████| 20748/20748 [00:01<00:00, 18192.72it/s]
Processing raw_pubmed_intelligence_abstracts_2023.txt: 100%|██████████| 25276/25276 [00:01<00:00, 18434.16it/s]
Processing raw_pubmed_intelligence_abstracts_2013.txt: 100%|██████████| 1660/1660 [00:00<00:00, 19909.54it/s]
Processing raw_pubmed_intelligence_abstracts_2014.txt: 100%|██████████| 1798/1798 [00:00<00:00, 19529.00it/s]
Proc

# Data Exploration

In [2]:
df.head(5)

Unnamed: 0,PMID,OWN,STAT,DCOM,LR,IS,VI,IP,DP,TI,...,CRDT,PHST,AID,PST,SO,AUID,CIN,CI,OTO,OT
0,38421690,NLM,Publisher,,20240229,1438-8871 (Electronic) 1438-8871 (Linking),26.0,,2024 Feb 29,"Promises, Pitfalls, and Clinical Applications ...",...,2024/02/29 11:53,2024/02/29 12:46 [medline] 2024/02/29 12:46 [p...,v26i1e49022 [pii] 10.2196/49022 [doi],epublish,J Med Internet Res. 2024 Feb 29;26:e49022. doi...,ORCID: 0009-0001-2785-3930 ORCID: 0000-0003-26...,,"(c)Hansa Bhargava, Carmela Salomon, Srinivasan...",NOTNLM,ASD CME artificial intelligence autism autism ...
1,38421670,NLM,Publisher,,20240229,2168-6173 (Electronic) 2168-6165 (Linking),,,2024 Feb 29,Accuracy of an Artificial Intelligence Chatbot...,...,2024/02/29 11:33,2024/02/29 12:43 [medline] 2024/02/29 12:43 [p...,2815697 [pii] 10.1001/jamaophthalmol.2024.0017...,aheadofprint,JAMA Ophthalmol. 2024 Feb 29. doi: 10.1001/jam...,,,,,
2,38421439,NLM,Publisher,,20240229,1436-2813 (Electronic) 0941-1291 (Linking),,,2024 Feb 29,ChatGPT in surgery: a revolutionary innovation?,...,2024/02/29 11:07,2024/02/29 12:46 [medline] 2024/02/29 12:46 [p...,10.1007/s00595-024-02800-6 [pii] 10.1007/s0059...,aheadofprint,Surg Today. 2024 Feb 29. doi: 10.1007/s00595-0...,,,(c) 2024. The Author(s).,NOTNLM,Artificial intelligence ChatGPT Surgery
3,38421392,NLM,Publisher,,20240229,1434-4726 (Electronic) 0937-4477 (Linking),,,2024 Feb 29,Exploring the landscape of AI-assisted decisio...,...,2024/02/29 11:05,2024/02/29 12:42 [medline] 2024/02/29 12:42 [p...,10.1007/s00405-024-08525-z [pii] 10.1007/s0040...,aheadofprint,Eur Arch Otorhinolaryngol. 2024 Feb 29. doi: 1...,ORCID: 0000-0003-0957-8354,,"(c) 2024. The Author(s), under exclusive licen...",NOTNLM,Artificial intelligence (AI) models Cancer car...
4,38421272,NLM,In-Process,,20240229,1661-4917 (Electronic) 0004-069X (Linking),72.0,1.0,2024 Jan 1,"Artificial Intelligence, Big Data, and Regulat...",...,2024/02/29 09:53,2024/02/29 12:45 [medline] 2024/02/29 12:45 [p...,aite-2024-0006 [pii] 10.2478/aite-2024-0006 [doi],epublish,Arch Immunol Ther Exp (Warsz). 2024 Feb 29;72(...,ORCID: 0000-0002-9792-8694 ORCID: 0000-0002-33...,,"(c) 2024 Bhagirath Singh et al., published by ...",NOTNLM,Artificial intelligence Big data Ethics of imm...


In [3]:
# print the total number of rows and columns
print(f"Total number of rows: {df.shape[0]} x {df.shape[1]}")

Total number of rows: 91454 x 41


In [4]:
# display columns data types
df.dtypes

PMID    object
OWN     object
STAT    object
DCOM    object
LR      object
IS      object
VI      object
IP      object
DP      object
TI      object
PG      object
LID     object
AB      object
FAU     object
AU      object
AD      object
LA      object
GR      object
PT      object
DEP     object
PL      object
TA      object
JT      object
JID     object
SB      object
MH      object
PMC     object
MID     object
COIS    object
EDAT    object
MHDA    object
CRDT    object
PHST    object
AID     object
PST     object
SO      object
AUID    object
CIN     object
CI      object
OTO     object
OT      object
dtype: object

In [5]:
# check for missing values in AB column
print(f"Number of missing values in AB column: {df['AB'].isnull().sum()}")

# drop rows with missing values in AB column
df = df.dropna(subset=['AB'])
print(f"Number of articles after dropping missing values in AB column: {df.shape[0]}")

# find total number of duplicates in PMID column
print(f"Number of duplicate PMIDs: {df['PMID'].duplicated().sum()}")

# drop duplicates
df = df.drop_duplicates(subset=['PMID'])
print(f"Number of articles after dropping duplicates: {df.shape[0]}")

# df = df.reset_index(drop=True)

Number of missing values in AB column: 6371
Number of articles after dropping missing values in AB column: 85083
Number of duplicate PMIDs: 23074
Number of articles after dropping duplicates: 62009


# Convert columns data type

In [6]:
# Convert PMID to string
df['PMID'] = df['PMID'].astype(str)


# convert Titles, Abstracts to string
df['TI'] = df['TI'].astype(str)
df['AB'] = df['AB'].astype(str)

# Full Author Names and Abbreviated Author Names to string
df['FAU'] = df['FAU'].astype(str)
df['AU'] = df['AU'].astype(str)

# MeSH Terms and Other Terms (OT) to string
df['MH'] = df['MH'].astype(str)
df['OT'] = df['OT'].astype(str)

# Place of Publication to string
df['PL'] = df['PL'].astype(str) 

# Journal Title to string
df['JT'] = df['JT'].astype(str)

# PubMed Central ID
df['PMC'] = df['PMC'].astype(str)

# Convert DP, EDAT, and MHDA to datetime64
df['DP'] = pd.to_datetime(df['DP'], errors='coerce')  # Coerce errors in case of invalid dates
df['EDAT'] = pd.to_datetime(df['EDAT'], format='%Y/%m/%d', errors='coerce')
df['MHDA'] = pd.to_datetime(df['MHDA'], format='%Y/%m/%d', errors='coerce')

In [7]:
df['PMID'].value_counts()

PMID
38421690    1
37255522    1
37274299    1
37274276    1
37274142    1
           ..
34773485    1
34773465    1
34773361    1
34773156    1
25308198    1
Name: count, Length: 62009, dtype: int64

In [8]:
# List of important keys based on the explanation provided
important_keys = [
    'PMID', 'TI', 'AB', 'FAU', 'AU', 'DP', 'MH','OT','PL', 'JT', 'PMC', 
    'EDAT', 'MHDA', 'STAT'
]
# Select only the columns with the important keys
df_important = df[important_keys]
df_important.head(5)

Unnamed: 0,PMID,TI,AB,FAU,AU,DP,MH,OT,PL,JT,PMC,EDAT,MHDA,STAT
0,38421690,"Promises, Pitfalls, and Clinical Applications ...",Artificial intelligence (AI) broadly describes...,"Bhargava, Hansa Salomon, Carmela Suresh, Srini...",Bhargava H Salomon C Suresh S Chang A Kilian R...,2024-02-29,,ASD CME artificial intelligence autism autism ...,Canada,Journal of medical Internet research,,NaT,NaT,Publisher
1,38421670,Accuracy of an Artificial Intelligence Chatbot...,IMPORTANCE: Ophthalmology is reliant on effect...,"Mihalache, Andrew Huang, Ryan S Popovic, Marko...",Mihalache A Huang RS Popovic MM Patil NS Pandy...,2024-02-29,,,United States,JAMA ophthalmology,,NaT,NaT,Publisher
2,38421439,ChatGPT in surgery: a revolutionary innovation?,ChatGPT has brought about a new era of digital...,"Bektas, Mustafa Pereira, Jaime Ken Daams, Free...",Bektas M Pereira JK Daams F van der Peet DL,2024-02-29,,Artificial intelligence ChatGPT Surgery,Japan,Surgery today,,NaT,NaT,Publisher
3,38421392,Exploring the landscape of AI-assisted decisio...,PURPOSE: Recent breakthroughs in natural langu...,"Marchi, Filippo Bellini, Elisa Iandelli, Andre...",Marchi F Bellini E Iandelli A Sampieri C Peret...,2024-02-29,,Artificial intelligence (AI) models Cancer car...,Germany,European archives of oto-rhino-laryngology : o...,,NaT,NaT,Publisher
4,38421272,"Artificial Intelligence, Big Data, and Regulat...",The immune system is regulated by a complex se...,"Singh, Bhagirath Jevnikar, Anthony M Desjardin...",Singh B Jevnikar AM Desjardins E,2024-01-01,,Artificial intelligence Big data Ethics of imm...,Switzerland,Archivum immunologiae et therapiae experimentalis,,NaT,NaT,In-Process


## Parquet File (.parquet) <br>Use Case: Excellent for storing large datasets. <br> Parquet is a columnar storage file format optimized for fast retrieval of columns and efficient compression.

In [9]:
import pyarrow as pa
file_path = os.path.join(project_dir,'data','cleanData','pubmed_intelligence_abstracts_cleaned.parquet')
df_important.to_parquet(file_path, index=False)

## Data loader

In [14]:
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
loader = DataFrameLoader(df_important, page_content_column="AB")
docs = loader.load()
docs[0:5]

[Document(page_content='Artificial intelligence (AI) broadly describes a branch of computer science focused on developing machines capable of performing tasks typically associated with human intelligence. Those who connect AI with the world of science fiction may meet its growing rise with hesitancy or outright skepticism. However, AI is becoming increasingly pervasive in our society, from algorithms helping to sift through airline fares to substituting words in emails and SMS text messages based on user choices. Data collection is ongoing and is being leveraged by software platforms to analyze patterns and make predictions across multiple industries. Health care is gradually becoming part of this technological transformation, as advancements in computational power and storage converge with the rapid expansion of digitized medical information. Given the growing and inevitable integration of AI into health care systems, it is our viewpoint that pediatricians urgently require training an

In [11]:
#%pip install --upgrade --quiet  gpt4all > /dev/null

## Generating Embeddings

In [15]:
embeddings = GPT4AllEmbeddings()
print(embeddings)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522
client=<gpt4all.gpt4all.Embed4All object at 0x2d31c78e0>


## Create a vector store

In [16]:
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
print(f"Number of documents: {len(documents)}")
# Number of documents: 62158

Number of documents: 62158


In [None]:
vector = FAISS.from_documents(documents, embeddings)

In [None]:
# Save the index to disk
vector.save_local(os.path.join(project_dir,'vectorStore'))

In [17]:
vector = FAISS.load_local(os.path.join(project_dir,'vectorStore'), embeddings)

In [18]:
import faiss
# Load the FAISS index
index = faiss.read_index(os.path.join(project_dir,'vectorStore','index.faiss'))

# Print the number of vectors and their dimensionality
print(f"Number of vectors in the index: {index.ntotal}")
print(f"Vector dimensionality: {index.d}")

Number of vectors in the index: 62158
Vector dimensionality: 384


## Load LLM Model

In [19]:
llm = Ollama(model="llama2")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain_core.documents import Document

retriever = vector.as_retriever()

In [21]:
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context and metadata:

<context>
{context}
</context>
<metadata>
{metadata}
<metadata>
Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)

query = "how artificial intelligence can predict the occurrence of progressive immunoglobulin A nephropathy?"
documents = retriever.get_relevant_documents(query)

In [24]:
documents

[Document(page_content='BACKGROUND: Artificial intelligence (AI) now plays a critical role in almost every area of our daily lives and academic disciplines due to the growth of computing power, advances in methods and techniques, and the explosion of the amount of data; medicine is not an exception. Rather than replacing clinicians, AI is augmenting the intelligence of clinicians in diagnosis, prognosis, and treatment decisions. SUMMARY: Kidney disease is a substantial medical and public health burden globally, with both acute kidney injury and chronic kidney disease bringing about high morbidity and mortality as well as a huge economic burden. Even though the existing research and applied works have made certain contributions to more accurate prediction and better understanding of histologic pathology, there is a lot more work to be done and problems to solve. KEY MESSAGES: AI applications of diagnostics and prognostics for high-prevalence and high-morbidity types of nephropathy in me

In [25]:
context = documents[0]
metadata = documents[0].metadata
print(context)
metadata

page_content='BACKGROUND: Artificial intelligence (AI) now plays a critical role in almost every area of our daily lives and academic disciplines due to the growth of computing power, advances in methods and techniques, and the explosion of the amount of data; medicine is not an exception. Rather than replacing clinicians, AI is augmenting the intelligence of clinicians in diagnosis, prognosis, and treatment decisions. SUMMARY: Kidney disease is a substantial medical and public health burden globally, with both acute kidney injury and chronic kidney disease bringing about high morbidity and mortality as well as a huge economic burden. Even though the existing research and applied works have made certain contributions to more accurate prediction and better understanding of histologic pathology, there is a lot more work to be done and problems to solve. KEY MESSAGES: AI applications of diagnostics and prognostics for high-prevalence and high-morbidity types of nephropathy in medical-reso

{'PMID': '32021868',
 'TI': "Artificial Intelligence in Nephrology: How Can Artificial Intelligence Augment Nephrologists' Intelligence?",
 'FAU': 'Xie, Guotong Chen, Tiange Li, Yingxue Chen, Tingyu Li, Xiang Liu, Zhihong',
 'AU': 'Xie G Chen T Li Y Chen T Li X Liu Z',
 'DP': NaT,
 'MH': 'None',
 'OT': 'Artificial intelligence Big data Diagnostics and prognostics Kidney disease Treatment',
 'PL': 'Switzerland',
 'JT': 'Kidney diseases (Basel, Switzerland)',
 'PMC': 'PMC6995978',
 'EDAT': NaT,
 'MHDA': NaT,
 'STAT': 'PubMed-not-MEDLINE'}

In [26]:
document_chain.invoke({
    "input": query,
    "context": [context],
    "metadata": [metadata]
})

'Based on the provided context and metadata, the answer to the question is not explicitly mentioned. However, we can infer from the information provided that AI has the potential to predict the occurrence of progressive immunoglobulin A nephropathy through various means:\n\n1. Analysis of large datasets: With the growth of computing power and the explosion of data, AI algorithms can analyze vast amounts of data related to kidney disease, including demographic information, medical history, laboratory results, and imaging studies. By identifying patterns and correlations within these datasets, AI can help predict the likelihood of developing progressive immunoglobulin A nephropathy.\n2. Machine learning techniques: Machine learning algorithms, such as decision trees, random forests, and neural networks, can learn from historical data and identify potential risk factors for progressive immunoglobulin A nephropathy. These algorithms can then be used to predict the likelihood of developing 

In [46]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

# First we need a prompt that we can pass into an LLM to generate this search query

prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("system", "Act as Expert in Medical field by providing refrences and scientifc conclusions for user's questions based on the below context:\n\n{context} and metadata"),
    ("user", "{input}"),
    ("user", '''Given the above conversation, generate a search query to look up
      in order to get information relevant to the conversation''')
])
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

document_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

In [47]:
query = "where was the articles published (place of publication PL)?"
retriever.get_relevant_documents(query)
chat_history = [HumanMessage(content=""), AIMessage(content="Yes!")]
retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": query,
    "context": context,
})

{'chat_history': [HumanMessage(content=''), AIMessage(content='Yes!')],
 'input': 'where was the articles published (place of publication PL)?',
 'context': [Document(page_content="INTRODUCTION: The rapid advancement of artificial intelligence and big data analytics, including descriptive, diagnostic, predictive, and prescriptive analytics, has the potential to revolutionize many areas of medicine, including nephrology and dialysis. Artificial intelligence and big data analytics can be used to analyze large amounts of patient medical records, including laboratory results and imaging studies, to improve the accuracy of diagnosis, enhance early detection, identify patterns and trends, and personalize treatment plans for patients with kidney disease. Additionally, artificial intelligence and big data analytics can be used to identify patients' treatment who are not receiving adequate care, highlighting care inefficiencies in the dialysis provider, optimizing patient outcomes, reducing hea