In [226]:
'''
pip install -U ipywidgets
pip install pyarrow
'''

'\npip install -U ipywidgets\npip install pyarrow\n'

# Loading PubMed text files

In [227]:
import os
import pandas as pd
from tqdm import tqdm
import glob  # Import glob module

# Define the project directory
project_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_dir, 'data', 'PubMed_Format')

# Define articles dictionary keys
articles_dict_keys = ['PMID', 'OWN', 'STAT', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'DP', 'TI', 'PG', 'LID', 'AB', 'FAU', 'AU', 'AD', 'LA', 'GR', 'PT', 'DEP', 'PL', 'TA', 'JT', 'JID', 'SB', 'MH', 'PMC', 'MID', 'COIS', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST', 'SO', 'AUID', 'CIN', 'CI', 'OTO', 'OT']
articles_dict = {key: [] for key in articles_dict_keys}

# Pattern to match all relevant files
file_pattern = os.path.join(data_dir, 'raw_pubmed_intelligence_abstracts_*.txt')

# Use glob to find all files matching the pattern
for file_path in glob.glob(file_pattern):
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read()
    
    articles = text.split("\n\n")

    for article in tqdm(articles, desc=f"Processing {os.path.basename(file_path)}"):
        lines = article.split("\n")
        dictionary = {key: [] for key in articles_dict_keys}  # Use lists to handle multiple entries
        current_key = None  # Track the current key for lines without a new key

        for line in lines:
            if len(line) > 4 and line[4] == '-':
                key, value = line.split("-", 1)
                key = key.strip()
                value = value.strip()
                if key in dictionary:
                    dictionary[key].append(value)  # Append new values to the list
                    current_key = key
            elif current_key:  # Append continuation lines to the last value of the current key
                dictionary[current_key][-1] += ' ' + line.strip()

        # Flatten lists with a single value for compatibility
        for key in articles_dict_keys:
            articles_dict[key].append(' '.join(dictionary[key]) if dictionary[key] else None)

# Create DataFrame from the aggregated data
df = pd.DataFrame(articles_dict, columns=articles_dict_keys)

Processing raw_pubmed_intelligence_abstracts_2024.txt: 100%|██████████| 4240/4240 [00:00<00:00, 14925.32it/s]
Processing raw_pubmed_intelligence_abstracts_2018.txt: 100%|██████████| 3341/3341 [00:00<00:00, 15828.42it/s]
Processing raw_pubmed_intelligence_abstracts_2019.txt: 100%|██████████| 4951/4951 [00:00<00:00, 17918.28it/s]
Processing raw_pubmed_intelligence_abstracts_2021.txt: 100%|██████████| 15221/15221 [00:00<00:00, 16855.62it/s]
Processing raw_pubmed_intelligence_abstracts_2020.txt: 100%|██████████| 7875/7875 [00:00<00:00, 17558.73it/s]
Processing raw_pubmed_intelligence_abstracts_2022.txt: 100%|██████████| 20748/20748 [00:01<00:00, 17218.53it/s]
Processing raw_pubmed_intelligence_abstracts_2023.txt: 100%|██████████| 25276/25276 [00:01<00:00, 15728.33it/s]
Processing raw_pubmed_intelligence_abstracts_2013.txt: 100%|██████████| 1660/1660 [00:00<00:00, 19122.88it/s]
Processing raw_pubmed_intelligence_abstracts_2014.txt: 100%|██████████| 1798/1798 [00:00<00:00, 18694.49it/s]
Proc

# Data Exploration

In [229]:
df.head(5)

Unnamed: 0,PMID,OWN,STAT,DCOM,LR,IS,VI,IP,DP,TI,...,CRDT,PHST,AID,PST,SO,AUID,CIN,CI,OTO,OT
0,38421690,NLM,Publisher,,20240229,1438-8871 (Electronic) 1438-8871 (Linking),26.0,,2024 Feb 29,"Promises, Pitfalls, and Clinical Applications ...",...,2024/02/29 11:53,2024/02/29 12:46 [medline] 2024/02/29 12:46 [p...,v26i1e49022 [pii] 10.2196/49022 [doi],epublish,J Med Internet Res. 2024 Feb 29;26:e49022. doi...,ORCID: 0009-0001-2785-3930 ORCID: 0000-0003-26...,,"(c)Hansa Bhargava, Carmela Salomon, Srinivasan...",NOTNLM,ASD CME artificial intelligence autism autism ...
1,38421670,NLM,Publisher,,20240229,2168-6173 (Electronic) 2168-6165 (Linking),,,2024 Feb 29,Accuracy of an Artificial Intelligence Chatbot...,...,2024/02/29 11:33,2024/02/29 12:43 [medline] 2024/02/29 12:43 [p...,2815697 [pii] 10.1001/jamaophthalmol.2024.0017...,aheadofprint,JAMA Ophthalmol. 2024 Feb 29. doi: 10.1001/jam...,,,,,
2,38421439,NLM,Publisher,,20240229,1436-2813 (Electronic) 0941-1291 (Linking),,,2024 Feb 29,ChatGPT in surgery: a revolutionary innovation?,...,2024/02/29 11:07,2024/02/29 12:46 [medline] 2024/02/29 12:46 [p...,10.1007/s00595-024-02800-6 [pii] 10.1007/s0059...,aheadofprint,Surg Today. 2024 Feb 29. doi: 10.1007/s00595-0...,,,(c) 2024. The Author(s).,NOTNLM,Artificial intelligence ChatGPT Surgery
3,38421392,NLM,Publisher,,20240229,1434-4726 (Electronic) 0937-4477 (Linking),,,2024 Feb 29,Exploring the landscape of AI-assisted decisio...,...,2024/02/29 11:05,2024/02/29 12:42 [medline] 2024/02/29 12:42 [p...,10.1007/s00405-024-08525-z [pii] 10.1007/s0040...,aheadofprint,Eur Arch Otorhinolaryngol. 2024 Feb 29. doi: 1...,ORCID: 0000-0003-0957-8354,,"(c) 2024. The Author(s), under exclusive licen...",NOTNLM,Artificial intelligence (AI) models Cancer car...
4,38421272,NLM,In-Process,,20240229,1661-4917 (Electronic) 0004-069X (Linking),72.0,1.0,2024 Jan 1,"Artificial Intelligence, Big Data, and Regulat...",...,2024/02/29 09:53,2024/02/29 12:45 [medline] 2024/02/29 12:45 [p...,aite-2024-0006 [pii] 10.2478/aite-2024-0006 [doi],epublish,Arch Immunol Ther Exp (Warsz). 2024 Feb 29;72(...,ORCID: 0000-0002-9792-8694 ORCID: 0000-0002-33...,,"(c) 2024 Bhagirath Singh et al., published by ...",NOTNLM,Artificial intelligence Big data Ethics of imm...


In [232]:
# print the total number of rows and columns
print(f"Total number of rows: {df.shape[0]} x {df.shape[1]}")

Total number of rows: 91454 x 41


In [233]:
# display columns data types
df.dtypes

PMID    object
OWN     object
STAT    object
DCOM    object
LR      object
IS      object
VI      object
IP      object
DP      object
TI      object
PG      object
LID     object
AB      object
FAU     object
AU      object
AD      object
LA      object
GR      object
PT      object
DEP     object
PL      object
TA      object
JT      object
JID     object
SB      object
MH      object
PMC     object
MID     object
COIS    object
EDAT    object
MHDA    object
CRDT    object
PHST    object
AID     object
PST     object
SO      object
AUID    object
CIN     object
CI      object
OTO     object
OT      object
dtype: object

In [234]:
# check for missing values in AB column
print(f"Number of missing values in AB column: {df['AB'].isnull().sum()}")

# drop rows with missing values in AB column
df = df.dropna(subset=['AB'])
print(f"Number of articles after dropping missing values in AB column: {df.shape[0]}")

# find total number of duplicates in PMID column
print(f"Number of duplicate PMIDs: {df['PMID'].duplicated().sum()}")

# drop duplicates
df = df.drop_duplicates(subset=['PMID'])
print(f"Number of articles after dropping duplicates: {df.shape[0]}")

# df = df.reset_index(drop=True)

Number of missing values in AB column: 6371
Number of articles after dropping missing values in AB column: 85083
Number of duplicate PMIDs: 23074
Number of articles after dropping duplicates: 62009


# Convert columns data type

In [235]:
# Convert PMID to string
df['PMID'] = df['PMID'].astype(str)


# convert Titles, Abstracts to string
df['TI'] = df['TI'].astype(str)
df['AB'] = df['AB'].astype(str)

# Full Author Names and Abbreviated Author Names to string
df['FAU'] = df['FAU'].astype(str)
df['AU'] = df['AU'].astype(str)

# MeSH Terms and Other Terms (OT) to string
df['MH'] = df['MH'].astype(str)
df['OT'] = df['OT'].astype(str)

# Place of Publication to string
df['PL'] = df['PL'].astype(str) 

# Journal Title to string
df['JT'] = df['JT'].astype(str)

# PubMed Central ID
df['PMC'] = df['PMC'].astype(str)

# Convert DP, EDAT, and MHDA to datetime64
df['DP'] = pd.to_datetime(df['DP'], errors='coerce')  # Coerce errors in case of invalid dates
df['EDAT'] = pd.to_datetime(df['EDAT'], format='%Y/%m/%d', errors='coerce')
df['MHDA'] = pd.to_datetime(df['MHDA'], format='%Y/%m/%d', errors='coerce')

In [236]:
df['PMID'].value_counts()

PMID
38421690    1
37255522    1
37274299    1
37274276    1
37274142    1
           ..
34773485    1
34773465    1
34773361    1
34773156    1
25308198    1
Name: count, Length: 62009, dtype: int64

In [237]:
# List of important keys based on the explanation provided
important_keys = [
    'PMID', 'TI', 'AB', 'FAU', 'AU', 'DP', 'MH','OT','PL', 'JT', 'PMC', 
    'EDAT', 'MHDA', 'STAT'
]
# Select only the columns with the important keys
df_important = df[important_keys]
df_important.head(5)

Unnamed: 0,PMID,TI,AB,FAU,AU,DP,MH,OT,PL,JT,PMC,EDAT,MHDA,STAT
0,38421690,"Promises, Pitfalls, and Clinical Applications ...",Artificial intelligence (AI) broadly describes...,"Bhargava, Hansa Salomon, Carmela Suresh, Srini...",Bhargava H Salomon C Suresh S Chang A Kilian R...,2024-02-29,,ASD CME artificial intelligence autism autism ...,Canada,Journal of medical Internet research,,NaT,NaT,Publisher
1,38421670,Accuracy of an Artificial Intelligence Chatbot...,IMPORTANCE: Ophthalmology is reliant on effect...,"Mihalache, Andrew Huang, Ryan S Popovic, Marko...",Mihalache A Huang RS Popovic MM Patil NS Pandy...,2024-02-29,,,United States,JAMA ophthalmology,,NaT,NaT,Publisher
2,38421439,ChatGPT in surgery: a revolutionary innovation?,ChatGPT has brought about a new era of digital...,"Bektas, Mustafa Pereira, Jaime Ken Daams, Free...",Bektas M Pereira JK Daams F van der Peet DL,2024-02-29,,Artificial intelligence ChatGPT Surgery,Japan,Surgery today,,NaT,NaT,Publisher
3,38421392,Exploring the landscape of AI-assisted decisio...,PURPOSE: Recent breakthroughs in natural langu...,"Marchi, Filippo Bellini, Elisa Iandelli, Andre...",Marchi F Bellini E Iandelli A Sampieri C Peret...,2024-02-29,,Artificial intelligence (AI) models Cancer car...,Germany,European archives of oto-rhino-laryngology : o...,,NaT,NaT,Publisher
4,38421272,"Artificial Intelligence, Big Data, and Regulat...",The immune system is regulated by a complex se...,"Singh, Bhagirath Jevnikar, Anthony M Desjardin...",Singh B Jevnikar AM Desjardins E,2024-01-01,,Artificial intelligence Big data Ethics of imm...,Switzerland,Archivum immunologiae et therapiae experimentalis,,NaT,NaT,In-Process


In [None]:
# show only rows
# df_important[df_important['LA'].str.contains('Tur', case=False)]

In [238]:
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import OllamaEmbeddings, GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
loader = DataFrameLoader(df_important, page_content_column="AB")
docs = loader.load()
docs

[Document(page_content='Artificial intelligence (AI) broadly describes a branch of computer science focused on developing machines capable of performing tasks typically associated with human intelligence. Those who connect AI with the world of science fiction may meet its growing rise with hesitancy or outright skepticism. However, AI is becoming increasingly pervasive in our society, from algorithms helping to sift through airline fares to substituting words in emails and SMS text messages based on user choices. Data collection is ongoing and is being leveraged by software platforms to analyze patterns and make predictions across multiple industries. Health care is gradually becoming part of this technological transformation, as advancements in computational power and storage converge with the rapid expansion of digitized medical information. Given the growing and inevitable integration of AI into health care systems, it is our viewpoint that pediatricians urgently require training an

In [None]:
#%pip install --upgrade --quiet  gpt4all > /dev/null

In [239]:
embeddings = GPT4AllEmbeddings()

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [None]:
# pip install faiss-cpu

In [240]:
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
print(f"Number of documents: {len(documents)}")
# Number of documents: 62158

Number of documents: 62158


In [None]:
# Save the index to disk
vector = FAISS.from_documents(documents, embeddings)

In [None]:
vector.save_local(os.path.join(project_dir,'vectorStore'))

In [241]:
db = FAISS.load_local(os.path.join(project_dir,'vectorStore'), embeddings)

In [242]:
llm = Ollama(model="llama2")

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context and metadata:

<context>
{context}
</context>
<metadata>
{metadata}
<metadata>
Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
from langchain.chains import create_retrieval_chain

retriever = vector.as_retriever()

In [245]:
query = "what are the symptoms of covid?"
document = retriever.get_relevant_documents(query)
context = document[0]
metadata = document[0].metadata
print(context)
metadata

page_content='OBJECTIVE: This work aims to study the profiles of Long COVID from the perspective of the patients spontaneously sharing their experiences and symptoms on Reddit. METHODS: We collected 27,216 posts shared between July 2020 and July 2022 on Long COVID-related Reddit forums. Natural language processing, clustering techniques and a Long COVID symptoms lexicon were used to extract the different symptoms and categories of symptoms and to study the co-occurrences and correlation between them. RESULTS: More than 78% of the posts mentioned at least one Long COVID symptom. Fatigue (29.4%), pain (22%), clouded consciousness (19.1%), anxiety (17.7%) and headaches (15.6%) were the most prevalent symptoms. They also highly co-occurred with a variety of other symptoms (e.g., fever, sinonasal congestion). Different categories of symptoms were found: general (45.5%), neurological/ocular (42.9%), mental health/psychological/behavioral (35.2%), body pain/mobility (35.1%) and cardiorespirat

{'PMID': '37663849',
 'TI': "The Long COVID experience from a patient's perspective: a clustering analysis of 27,216 Reddit posts.",
 'FAU': 'Ayadi, Hanin Bour, Charline Fischer, Aurelie Ghoniem, Mohammad Fagherazzi, Guy',
 'AU': 'Ayadi H Bour C Fischer A Ghoniem M Fagherazzi G',
 'DP': NaT,
 'MH': 'Humans *COVID-19/epidemiology Post-Acute COVID-19 Syndrome Cluster Analysis Fatigue Pain',
 'OT': 'Long COVID artificial intelligence digital health machine learning natural language processing patient-reported outcomes public health social media',
 'PL': 'Switzerland',
 'JT': 'Frontiers in public health',
 'PMC': 'PMC10470115',
 'EDAT': NaT,
 'MHDA': NaT,
 'STAT': 'MEDLINE'}

In [246]:
from langchain_core.documents import Document

document_chain.invoke({
    "input": query,
    "context": [context],
    "metadata": [metadata]
})

'Based on the provided context and metadata, the most prevalent symptoms of Long COVID are:\n\n1. Fatigue (29.4%)\n2. Pain (22%)\n3. Clouded consciousness (19.1%)\n4. Anxiety (17.7%)\n5. Headaches (15.6%)\n\nThese symptoms were found to highly co-occur with a variety of other symptoms, including fever, sinonasal congestion. The study also identified different categories of symptoms, such as:\n\n1. General symptoms (45.5%)\n2. Neurological/ocular symptoms (42.9%)\n3. Mental health/psychological/behavioral symptoms (35.2%)\n4. Body pain/mobility symptoms (35.1%)\n5. Cardiorespiratory symptoms (31.2%)\n\nIn addition, the study addressed other concerns of the community, such as vaccine, recovery and relapse following recovery.'

In [None]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# First we need a prompt that we can pass into an LLM to generate this search query

prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation")
])
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

In [None]:
from langchain_core.messages import HumanMessage, AIMessage

chat_history = [HumanMessage(content="Can LangSmith help test my LLM applications?"), AIMessage(content="Yes!")]
retriever_chain.invoke({
    "chat_history": chat_history,
    "input": "Tell me who?"
})