## Load PDF Document

In [24]:
import fitz  # PyMuPDF
import spacy  # spaCy for token counting and NLP
from tqdm.auto import tqdm  # Progress bar

In [30]:
def text_formatter(text: str) -> str: 
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()

    # Potentially more text formatting functions can go here
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = [] 
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number -0,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_setence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                "text": text})
    return pages_and_texts


pdf_path = "/Users/saifmohammed/Desktop/CSE299/LLM-1/PDFs/Diabetes_Care_BADAS_guideline2019.pdf"
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 217,
  'page_word_count': 64,
  'page_setence_count_raw': 1,
  'page_token_count': 54.25,
  'text': 'DIABETES CARE  BADAS Guideline 2019                             P|) DAS GUELINE ON Man  DELIT  IGEMEN     A Joint Initiative of  Diabetic Association of Bangladesh  NCDC Program, Directorate General of Health Services'},
 {'page_number': 1,
  'page_char_count': 199,
  'page_word_count': 39,
  'page_setence_count_raw': 1,
  'page_token_count': 49.75,
  'text': 'DIABETES CARE  BADAS Guideline 2019     A Joint Initiative of  Diabetic Association of Bangladesh  NCDC Program, Directorate General of Health Services     Diabetes Care: BADAS Guideline 2019 HEI!  1'}]

In [31]:
import random

random.sample(pages_and_texts, k=2)

[{'page_number': 47,
  'page_char_count': 1689,
  'page_word_count': 313,
  'page_setence_count_raw': 10,
  'page_token_count': 422.25,
  'text': 'Treatment  ¢  With known atherosclerotic cardiovascular disease:  ¢  ACE inhibitor or angiotensin receptor blocker.  *  In patients with prior myocardial infarction, b-blockers should be continued  for at least 2 years after the event.  *  In patients with type 2 diabetes with stable congestive heart failure, metformin  may be used  if estimated glomerular filtration rate remains >30 mL/min but  should be avoided  in unstable or hospitalized patients with congestive heart  failure with hypoxia.  *  Among persons with type  2 diabetes who have established atherosclerotic  cardiovascular  disease,  sodium-glucose  cotransporter  2  inhibitors  or  glucagon-like peptide  1  receptor agonists are recommended  as part of the  antihyperglycemic regimen.  *  Among persons with atherosclerotic cardiovascular disease at high risk of heart  failure  o

In [32]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,text
0,0,217,64,1,54.25,DIABETES CARE BADAS Guideline 2019 ...
1,1,199,39,1,49.75,DIABETES CARE BADAS Guideline 2019 A Join...
2,2,1144,225,1,286.0,DIABETES CARE: BADAS GUIDELINE 2019 Convener:...
3,3,1306,233,10,326.5,President Diabetic Association of Banglade...
4,4,898,162,4,224.5,Secretary General Diabetic Association of ...


In [33]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count
count,79.0,79.0,79.0,79.0,79.0
mean,39.0,1508.29,277.24,10.81,377.07
std,22.95,611.95,107.05,7.65,152.99
min,0.0,43.0,8.0,1.0,10.75
25%,19.5,1148.5,224.0,4.0,287.12
50%,39.0,1494.0,271.0,10.0,373.5
75%,58.5,1866.5,341.0,15.5,466.62
max,78.0,2960.0,542.0,30.0,740.0


## Further text processing (splitting pages into sentences)

In [34]:
from spacy.lang.en import English

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer 
nlp.add_pipe("sentencizer")

# Create document instance as an example
doc = nlp("This is a sentence. This another sentence. I like elephants.")
assert len(list(doc.sents)) == 3

# Print out our sentences split
list(doc.sents)

[This is a sentence., This another sentence., I like elephants.]

In [36]:
pages_and_texts[50]

{'page_number': 50,
 'page_char_count': 1754,
 'page_word_count': 285,
 'page_setence_count_raw': 17,
 'page_token_count': 438.5,
 'text': 'Pharmacological therapy based on age, ASCVD or ASCVD risk factors  Age | ASCVD or 10-tear ASCVD risk >20%  Along with lifestyle modification,     Recommended pharmacological therapy  <40  No  None or moderate intensity statin may be  considered based on risk-benefit profile or  presence of ASCVD risk.  <40  Yes  High intensity statin; if LDL =>70 mg/dl despite  of therapy consider combining with ezetimibe.  240  No  Moderate intensity statin  or high intensity  statin may be considered based on risk-benefit  profile or presence of ASCVD risk.  240  Yes  High intensity statin; if LDL >70 mg/dl despite  of therapy, consider combining with ezetimibe.        NB: Intensity of statin therapy may need to be adjusted according to individual  patient response to medication (eg side effects, tolerability, LDL cholesterol  levels, etc).  Statin therapy is con

In [37]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings (the default type is a spaCy datatype)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/79 [00:00<?, ?it/s]

In [38]:
random.sample(pages_and_texts, k=1)

[{'page_number': 14,
  'page_char_count': 2732,
  'page_word_count': 461,
  'page_setence_count_raw': 25,
  'page_token_count': 683.0,
  'text': "Prediabetes  IGT (impaired glucose tolerance) and IFG (impaired fasting glucose) are referred as  ‘Prediabetes: Persons with prediabetes have high risk of development of diabetes  (25% IFG and 30% IGT cases become diabetic over time) and cardiovascular  diseases. Any type of diabetes can pass through the stages of prediabetes, but it is  most  obvious  in  type  2  diabetes. These  persons  are  treated  by  lifestyle  modifications; drugs may be used where indicated. About 40% of IFG and 30% of  IGT cases may revert to normal if proper intervention can be given.  Clinical presentation  The spectrum of presentation ranges from asymptomatic to typical features.  Asymptomatic cases are diagnosed by biochemical test only. A vast majority of  type 2 diabetes and other types remain asymptomatic over a prolonged period.  Routine check-up usually pi

In [39]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,page_sentence_count_spacy
count,79.0,79.0,79.0,79.0,79.0,79.0
mean,39.0,1508.29,277.24,10.81,377.07,10.65
std,22.95,611.95,107.05,7.65,152.99,7.61
min,0.0,43.0,8.0,1.0,10.75,1.0
25%,19.5,1148.5,224.0,4.0,287.12,4.0
50%,39.0,1494.0,271.0,10.0,373.5,10.0
75%,58.5,1866.5,341.0,15.5,466.62,15.0
max,78.0,2960.0,542.0,30.0,740.0,32.0


## Chunking sentences together

In [40]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# Create a function to split lists of texts recursively into chunk size
# e.g. [20] -> [10, 10] or [25] -> [10, 10, 5]
def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [41]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/79 [00:00<?, ?it/s]

random.sample(pages_and_texts, k=1)

In [43]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,39.0,1508.29,277.24,10.81,377.07,10.65,1.61
std,22.95,611.95,107.05,7.65,152.99,7.61,0.74
min,0.0,43.0,8.0,1.0,10.75,1.0,1.0
25%,19.5,1148.5,224.0,4.0,287.12,4.0,1.0
50%,39.0,1494.0,271.0,10.0,373.5,10.0,1.0
75%,58.5,1866.5,341.0,15.5,466.62,15.0,2.0
max,78.0,2960.0,542.0,30.0,740.0,32.0,4.0


## Splitting each chunk into its own item

In [44]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts): 
    for sentence_chunk in item["sentence_chunks"]: 
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A" (will work for any captial letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict) 

len(pages_and_chunks)

  0%|          | 0/79 [00:00<?, ?it/s]

127

In [45]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 26,
  'sentence_chunk': 'fe PEST icles eer. VPLS]                                                       Class Name Duration of action (hr) | Mode of excretion A. Secretagogues 15* generation (Not in use at present) Tolbutamide 6-12 Mostly urine Chlorpropamide 24-72 Urine Sulphonylureas 2"4 generation Glibenclamide 24 Urine & faeces Glipizide 8-12 Mostly urine Gliclazide 8-12 Urine & faeces Glimepiride 24 Urine & faeces Meglitinide analogue Non-sulphonylureas Repaglinide | 45 | Mostly faeces d-phenylalanine derivative Nateglinide | 45 | Mostly urine B. Insulin sensitizers Biguanides Metformin 8-12 Mostly urine Thiazolidinediones Pioglitazone 24 Urine & faeces Rosiglitazone 24 Urine & faeces C. Alpha-glucosidase inhibitors Acarbose 4 Faeces & urine Miglitol 4 Urine Voglibose NA Mostly faeces D. DPP-4 inhibitors Sitagliptin 24 Mostly urine Vildagliptin 24 Mostly urine Linagliptin 24 Mostly faeces Saxagliptin 24 Urine & faeces Alogliptin 24 Mostly urine E. SGLT-2 inhibitor

In [46]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,127.0,127.0,127.0,127.0
mean,41.41,905.89,140.5,226.47
std,22.8,500.82,79.94,125.21
min,0.0,42.0,7.0,10.5
25%,21.5,474.0,65.5,118.5
50%,41.0,941.0,154.0,235.25
75%,61.0,1304.5,193.5,326.12
max,78.0,2484.0,402.0,621.0


In [47]:
df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,0,DIABETES CARE BADAS Guideline 2019 ...,196,43,49.0
1,1,DIABETES CARE BADAS Guideline 2019 A Joint I...,191,31,47.75
2,2,DIABETES CARE: BADAS GUIDELINE 2019 Convener: ...,1098,179,274.5
3,3,President Diabetic Association of Bangladesh...,1278,205,319.5
4,4,Secretary General Diabetic Association of Ba...,876,140,219.0


## Filter chunks of text for short chunks

In [48]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 23.75 | Text: What will be the management stratigy plan for him?  76 ‘158 Diabetes Care: BADAS Guideline 2019
Chunk token count: 10.5 | Text: Diabetes Care: BADAS Guideline 2019 HE) 57
Chunk token count: 28.0 | Text: **Including nephrotic syndrome (albumin excretion ACR > 2220 mg/g)   Diabetes Care: BADAS Guideline 2019 HEI) 39
Chunk token count: 10.5 | Text: 56 ‘ll Diabetes Care: BADAS Guideline 2019
Chunk token count: 29.25 | Text: Targets of diabetes management in the young (based on ISPAD guideline)   58 (\l§§ Diabetes Care: BADAS Guideline 2019


In [50]:
# Filter our DataFrame for rows with under 30 tokens
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 0,
  'sentence_chunk': 'DIABETES CARE BADAS Guideline 2019               P|) DAS GUELINE ON Man DELIT IGEMEN   A Joint Initiative of Diabetic Association of Bangladesh NCDC Program, Directorate General of Health Services',
  'chunk_char_count': 196,
  'chunk_word_count': 43,
  'chunk_token_count': 49.0},
 {'page_number': 1,
  'sentence_chunk': 'DIABETES CARE BADAS Guideline 2019   A Joint Initiative of Diabetic Association of Bangladesh NCDC Program, Directorate General of Health Services   Diabetes Care: BADAS Guideline 2019 HEI! 1',
  'chunk_char_count': 191,
  'chunk_word_count': 31,
  'chunk_token_count': 47.75}]

In [51]:
random.sample(pages_and_chunks_over_min_token_len, k=1)

[{'page_number': 40,
  'sentence_chunk': 'Treatment * Good glycemic control reduces incidence of diabetic nephropathy and delays its progression. * Control of hypertension is very important because uncontrolled hypertension causes rapid progression of diabetic nephropathy. Nephropathy itself makes hypertension refractory to anti-hypertensive drugs, thus necessitates intensive and combination regimens. Target of BP is <130/80 mm of Hg. * ACE inhibitors and ARBs are drugs of first choice to reduce or revert nephropathy. But these two drugs must not be combined. Check electrolytes and creatinine 2 weeks after starting.  40 l§N Diabetes Care: BADAS Guideline 2019',
  'chunk_char_count': 625,
  'chunk_word_count': 91,
  'chunk_token_count': 156.25}]

## Embedding our text chunks

In [52]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")

# Create a list of sentences
sentences = ["The Sentence Transformer library provides an easy way to create embeddings.",
             "Sentences can be embedded one by one or in a list.",
             "I like horses!"]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print("")

Sentence: The Sentence Transformer library provides an easy way to create embeddings.
Embedding: [-3.44286226e-02  2.95328405e-02 -2.33643223e-02  5.57257831e-02
 -2.19098274e-02 -6.47056662e-03  1.02848867e-02 -6.57804534e-02
  2.29718834e-02 -2.61120312e-02  3.80421393e-02  5.61403185e-02
 -3.68747041e-02  1.52788181e-02  4.37020697e-02 -5.19723557e-02
  4.89479117e-02  3.58107686e-03 -1.29750324e-02  3.54382698e-03
  4.23262566e-02  3.52606736e-02  2.49402504e-02  2.99176928e-02
 -1.99382436e-02 -2.39752401e-02 -3.33369104e-03 -4.30450477e-02
  5.72014526e-02 -1.32517535e-02 -3.54478285e-02 -1.13936355e-02
  5.55562191e-02  3.61095858e-03  8.88526586e-07  1.14027131e-02
 -3.82229574e-02 -2.43545347e-03  1.51313841e-02 -1.32682035e-04
  5.00659831e-02 -5.50876819e-02  1.73445288e-02  5.00959605e-02
 -3.75959128e-02 -1.04463315e-02  5.08321971e-02  1.24861663e-02
  8.67377147e-02  4.64143232e-02 -2.10689828e-02 -3.90251353e-02
  1.99690554e-03 -1.42345624e-02 -1.86794419e-02  2.826694

In [53]:
embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/118 [00:00<?, ?it/s]

In [56]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[40]

'Selection issues of an oral agent Tay: | Sulphonylureas Advantage Potent; reduce pre- & post-prandial BG Non-sulphonylureas Less hypoglycaemia; reduce post-prandial BG Improve insulin sensitivity; weight friendly ; reduce pre- (mostly) & post- prandial BG; favourable effect on lipid & NAFLD Biguanides Thiazolidinediones Improve insulin sensitivity; reduce pre- (mostly) & post- prandial BG; favourable effect on lipid (pioglitazone) & NAFLD Alpha-glucosidase Weight friendly; reduce inhibitors post-prandial BG DPP-4 inhibitors Weight friendly; reduce pre- & post-prandial (mostly) BG SGLT-2 inhibitors Weight-friendly; reduce pre- & post-prandial BG Weight-friendly; reduce pre- & post-prandial BG; reduce LDL Bile acid sequestrants Weight-friendly; reduce pre- & post-prandial BG; reduce TG Dopamine-2 agonists   NB. NAFLD- Non-alcoholic fatty liver disease eGFR ml/min/1.73m2   Hazard Hypoglycemia; weight gain Weight gain Gl upset; lactic acidosis Weight gain; fluid retention; IHD & raise LDL

In [57]:
len(text_chunks)

118

In [58]:
# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can experiment to find which batch size leads to best results
                                               convert_to_tensor=True)
text_chunk_embeddings

tensor([[ 0.0682, -0.0422,  0.0046,  ...,  0.0064, -0.0006, -0.0206],
        [ 0.0457, -0.0438,  0.0023,  ..., -0.0287, -0.0164, -0.0145],
        [ 0.0532, -0.0541,  0.0016,  ..., -0.0058, -0.0372, -0.0234],
        ...,
        [ 0.0116, -0.0334,  0.0245,  ...,  0.0206, -0.0080, -0.0218],
        [ 0.0594,  0.0017,  0.0231,  ...,  0.0559, -0.0775, -0.0265],
        [ 0.0481,  0.0526,  0.0194,  ..., -0.0183, -0.0314, -0.0433]])

## Save Embeddings to File

In [59]:
pages_and_chunks_over_min_token_len[55]

{'page_number': 38,
 'sentence_chunk': 'Diabetic Association of Bangladesh.5th ed 2018. Kitabchi AE, UmpierrezGE, MilesJM, Fisher JN. Hyperglycemic crises in adult patients with diabetes. Diabetes Care 2009:32(7):1335-1343. https://www.aacc.org/community/aacc-academy/publications/scientific-shorts/ 2013/correcting-serum-or-plasma-sodium-for-hyperglycemia-should-labs-repo rt-a-corrected-sodium   38 {IN Diabetes Care: BADAS Guideline 2019',
 'chunk_char_count': 400,
 'chunk_word_count': 34,
 'chunk_token_count': 100.0,
 'embedding': array([ 4.39346023e-02, -5.34026511e-02,  2.24816725e-02, -1.16520431e-02,
         2.54839901e-02, -9.49985255e-03, -2.53603775e-02, -7.18203187e-03,
         4.44827043e-02,  2.65188310e-02,  3.60674560e-02,  1.76567771e-02,
         4.75546392e-03,  6.24508522e-02, -2.83382554e-03, -1.33239469e-02,
        -1.76365711e-02, -3.61277163e-02,  5.98629825e-02,  3.55343148e-02,
        -1.67616773e-02, -5.87151386e-04, -4.58563454e-02,  4.22729701e-02,
        

In [60]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [61]:
# Import saved file and view 
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,DIABETES CARE BADAS Guideline 2019 ...,196,43,49.0,[ 6.81944862e-02 -4.22385372e-02 4.58518323e-...
1,1,DIABETES CARE BADAS Guideline 2019 A Joint I...,191,31,47.75,[ 4.56828177e-02 -4.38070223e-02 2.28480832e-...
2,2,DIABETES CARE: BADAS GUIDELINE 2019 Convener: ...,1098,179,274.5,[ 5.31841069e-02 -5.40936999e-02 1.59960892e-...
3,3,President Diabetic Association of Bangladesh...,1278,205,319.5,[ 5.81505671e-02 -5.39151691e-02 -8.90129991e-...
4,4,Secretary General Diabetic Association of Ba...,876,140,219.0,[ 6.01391383e-02 -3.85712944e-02 -6.06791256e-...
