In [1]:
from pathlib import Path

# Locate the clinical note that serves as the test input for every prompt below.
input_dir = Path("../Data/raw/test_data/")
input_file_name = "0001.txt"
input_file = input_dir / input_file_name

print(input_file)

# Read the note. "utf-8-sig" decodes ordinary UTF-8 and additionally drops a
# leading byte-order mark. With plain "utf-8" the BOM survives .strip()
# (U+FEFF is not whitespace) and would be embedded in every prompt.
clinical_text_content = input_file.read_text(encoding="utf-8-sig").strip()

print(clinical_text_content)

..\Data\raw\test_data\0001.txt
﻿Admission Date :
2012-10-31
Discharge Date :
2012-11-07
Date of Birth :
1941-03-23
Sex : 
M
Service :
MEDICINE
Allergies : 
Nsaids/Anti-Inflammatory Classifier / Vancomycin
Attending : 
Kristie R. Hamby , M.D. 
Chief Complaint :
CC : Antonio M. Z. Eddings , M.D. 
Major Surgical or Invasive Procedure :
Mesenteric angiograpm w/ coil embolization of bleeding vessel .
Sigmoidoscopy .
Colonoscopy .
History of Present Illness :
HPI: Pt is a 71 y/o male with h/o dm2 , cad s/p cabg , DVT/PE on long term anti-coagulation , ulcerative colitis on Asacol presents with brbpr starting at 9am of the morning of admission .
He 'd been having lower abdominal pain for approximately the past week , a symptom for which he 's been admitted in the past .
His PCP had recently started ciprofloxacin for a UTI . 
At 9am the morning of admission he passed a large , bloody bowel movement and came to the Michael . There , his vitals were intially stable with a hct of 36.7 , though he

Entity-Only Zero Shot Prompt

In [2]:
# Zero-shot prompt: every section is built separately, then the sections are
# joined with blank lines into `full_prompt`.

# What the model is asked to do.
instruction = " ".join((
    "Your task is to extract entities from clinical text and categorize them into predefined labels",
    "based on their role in medical documentation.",
))

# Where the text comes from.
source_info = " ".join((
    "The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories,",
    "including diagnoses, medications, procedures, lab results, and physician notes.",
))

# The three entity categories, each with three examples. An empty item in the
# list produces the blank line that separates two categories.
category_definition = "\n".join([
    "Entity Categories",
    "Each extracted entity should be classified into one of the following categories:",
    "Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.",
    "Examples:",
    'entity="colonoscopy" label="test"',
    'entity="MRI scan" label="test"',
    'entity="blood glucose test" label="test"',
    "",
    "Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.",
    "Examples:",
    'entity="ramipril" label="treatment"',
    'entity="chemotherapy" label="treatment"',
    'entity="insulin therapy" label="treatment"',
    "",
    "Problem: A medical condition, symptom, or diagnosis affecting a patient, which may require testing or treatment.",
    "Examples:",
    'entity="diabetes mellitus" label="problem"',
    'entity="chest pain" label="problem"',
    'entity="hypertension" label="problem"',
])

# The line format the answer must follow.
output_format = "\n".join([
    "Expected Output Format",
    "Entities should be extracted and presented in the following format:",
    'entity="<extracted medical entity>" label="<assigned category>"',
])

# Fallback label for entities that fit none of the categories.
uncertain_handling = "\n".join([
    "Handling Uncertain Cases",
    "If an entity cannot be clearly classified as test, treatment, or problem based on the information provided, label it as:",
    'entity="<extracted medical entity>" label="unknown"',
])

# Announces the triple-quoted document that closes the prompt.
triple_quote_note = "\n".join([
    "Clinical Text Input",
    (
        'Below, you will be given a clinical text document enclosed within triple quotes (""" """). '
        "Your task is to extract relevant medical entities and assign them the appropriate labels "
        "based on the definitions provided above."
    ),
])

# The note itself, wrapped in triple quotes.
clinical_text = '""" \n' + clinical_text_content + '\n"""'

# Sections in prompt order, separated by one blank line each.
full_prompt = "\n\n".join((
    instruction,
    source_info,
    category_definition,
    output_format,
    uncertain_handling,
    triple_quote_note,
    "Clinical Text Input\n" + clinical_text,
))


print(full_prompt)

Your task is to extract entities from clinical text and categorize them into predefined labels based on their role in medical documentation.

The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, including diagnoses, medications, procedures, lab results, and physician notes.

Entity Categories
Each extracted entity should be classified into one of the following categories:
Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.
Examples:
entity="colonoscopy" label="test"
entity="MRI scan" label="test"
entity="blood glucose test" label="test"

Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.
Examples:
entity="ramipril" label="treatment"
entity="chemotherapy" label="treatment"
entity="insulin therapy" label="treatment"

Problem: A medical condition, symptom, or diagnosis affecting a patient, whic

Prompt: Sentence Level

In [5]:
from pathlib import Path

# Path to the annotated sample sentences used as in-prompt examples
sample_file_path = Path("../Data/processed/llm/tag_sentences/train/random_100_sentences.txt")

# Read the sample text. It is wrapped in triple quotes further down, when
# `sample_training_sentences` is assembled.
with open(sample_file_path, "r", encoding="utf-8") as f:
    sample_sentences = f.read().strip()

# Define meaningful prompt parts
instruction = (
    "Your task is to extract entities from clinical text and categorize them into predefined labels "
    "based on their role in medical documentation."
)

source_info = (
    "The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, "
    "including diagnoses, medications, procedures, lab results, and physician notes."
)

category_definition = (
    "Entity Categories\n"
    "Each extracted entity should be classified into one of the following categories:\n"
    "Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.\n"
    "Examples:\n"
    "entity=\"colonoscopy\" label=\"test\"\n"
    "entity=\"MRI scan\" label=\"test\"\n"
    "entity=\"blood glucose test\" label=\"test\"\n\n"
    "Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.\n"
    "Examples:\n"
    "entity=\"ramipril\" label=\"treatment\"\n"
    "entity=\"chemotherapy\" label=\"treatment\"\n"
    "entity=\"insulin therapy\" label=\"treatment\"\n\n"
    "Problem: A medical condition, symptom, or diagnosis affecting a patient, which may require testing or treatment.\n"
    "Examples:\n"
    "entity=\"diabetes mellitus\" label=\"problem\"\n"
    "entity=\"chest pain\" label=\"problem\"\n"
    "entity=\"hypertension\" label=\"problem\""
)

output_format = (
    "Expected Output Format\n"
    "Entities should be extracted and presented in the following format:\n"
    "entity=\"<extracted medical entity>\" label=\"<assigned category>\""
)

uncertain_handling = (
    "Handling Uncertain Cases\n"
    "If an entity cannot be clearly classified as test, treatment, or problem based on the information provided, "
    "label it as:\n"
    "entity=\"<extracted medical entity>\" label=\"unknown\""
)

# Category labels. The angle-bracket markup described to the model is built
# from these names so the explanation always matches the label set.
test_label, treatment_label, problem_label = ("test", "treatment", "problem")

# Explanation of the annotated sample, followed by the sample itself.
# Each sentence fragment ends with a space: adjacent literals are concatenated
# verbatim, and without it two sentences run together in the prompt.
sample_training_sentences = (
    "Sample Training Clinical Text\n"
    "You will be given a set of clinical sentences enclosed within triple quotes (\"\"\" \"\"\"). "
    "Each sentence will contain multiple tagged medical entities. Each entity is surrounded by angle brackets indicating its category: "
    f"<{test_label}>, <{treatment_label}>, or <{problem_label}>. "
    f"For example, a blood test would be written as <{test_label}> blood test </{test_label}>, "
    f"and a condition like chest pain would appear as <{problem_label}> chest pain </{problem_label}>. "
    "These examples should help clarify how entities should be identified and classified. The system should learn from these examples and apply similar logic when "
    "extracting entities from new clinical text.\n\n"

    "Below is the annotated clinical sentences for training:\n"
    f"\"\"\"\n{sample_sentences}\n\"\"\""
)

triple_quote_note = (
    "Clinical Text Input\n"
    "Below, you will be given a clinical text document enclosed within triple quotes (\"\"\" \"\"\"). "
    "Your task is to extract relevant medical entities and assign them the appropriate labels based on the definitions provided above."
)

# Format with triple quotes
clinical_text = f'""" \n{clinical_text_content}\n"""'

# Combine all into one prompt.
# The training sample goes BEFORE `triple_quote_note`: that note says the
# document to annotate follows "below", so the next triple-quoted block after
# it must be the real input, not the training sample.
full_prompt = "\n\n".join([
    instruction,
    source_info,
    category_definition,
    output_format,
    uncertain_handling,
    sample_training_sentences,
    triple_quote_note,
    f"Clinical Text Input\n{clinical_text}"  # ← actual test input
])


print(full_prompt)



Your task is to extract entities from clinical text and categorize them into predefined labels based on their role in medical documentation.

The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, including diagnoses, medications, procedures, lab results, and physician notes.

Entity Categories
Each extracted entity should be classified into one of the following categories:
Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.
Examples:
entity="colonoscopy" label="test"
entity="MRI scan" label="test"
entity="blood glucose test" label="test"

Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.
Examples:
entity="ramipril" label="treatment"
entity="chemotherapy" label="treatment"
entity="insulin therapy" label="treatment"

Problem: A medical condition, symptom, or diagnosis affecting a patient, whic

Prompt: Document Level

In [20]:
from pathlib import Path

# Path to the annotated sample document used as an in-prompt example
sample_file_path = Path("../Data/processed/llm/all_sentences/train/record-13-llm.txt")

# Read the sample document. It is wrapped in triple quotes further down, when
# `sample_training_document` is assembled.
with open(sample_file_path, "r", encoding="utf-8") as f:
    sample_document = f.read().strip()

# Define meaningful prompt parts
instruction = (
    "Your task is to extract entities from clinical text and categorize them into predefined labels "
    "based on their role in medical documentation."
)

source_info = (
    "The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, "
    "including diagnoses, medications, procedures, lab results, and physician notes."
)

category_definition = (
    "Entity Categories\n"
    "Each extracted entity should be classified into one of the following categories:\n"
    "Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.\n"
    "Examples:\n"
    "entity=\"colonoscopy\" label=\"test\"\n"
    "entity=\"MRI scan\" label=\"test\"\n"
    "entity=\"blood glucose test\" label=\"test\"\n\n"
    "Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.\n"
    "Examples:\n"
    "entity=\"ramipril\" label=\"treatment\"\n"
    "entity=\"chemotherapy\" label=\"treatment\"\n"
    "entity=\"insulin therapy\" label=\"treatment\"\n\n"
    "Problem: A medical condition, symptom, or diagnosis affecting a patient, which may require testing or treatment.\n"
    "Examples:\n"
    "entity=\"diabetes mellitus\" label=\"problem\"\n"
    "entity=\"chest pain\" label=\"problem\"\n"
    "entity=\"hypertension\" label=\"problem\""
)

output_format = (
    "Expected Output Format\n"
    "Entities should be extracted and presented in the following format:\n"
    "entity=\"<extracted medical entity>\" label=\"<assigned category>\""
)

uncertain_handling = (
    "Handling Uncertain Cases\n"
    "If an entity cannot be clearly classified as test, treatment, or problem based on the information provided, "
    "label it as:\n"
    "entity=\"<extracted medical entity>\" label=\"unknown\""
)

# Category labels. The angle-bracket markup described to the model is built
# from these names so the explanation always matches the label set.
test_label, treatment_label, problem_label = ("test", "treatment", "problem")

# Explanation of the annotated sample document, followed by the document.
sample_training_document = (
    "Sample Training Clinical Document\n"
    "You will be given a clinical document that contains a set of clinical sentences enclosed within triple quotes (\"\"\" \"\"\"). "
    "Each sentence will contain medical entities, if present, tagged using angle brackets indicating their category: "
    f"<{test_label}>, <{treatment_label}>, or <{problem_label}>. "
    f"For example, a blood test would be written as <{test_label}> blood test </{test_label}>, and a condition like chest pain would appear as "
    f"<{problem_label}> chest pain </{problem_label}>.\n\n"
    "Please note that:\n"
    "- Some sentences may not contain any identifiable medical entities.\n"
    "- Some sentences may contain multiple entities from different categories.\n\n"
    "These examples are provided to help the system understand how to identify and categorize entities. "
    "You should learn from these patterns and apply similar logic when extracting entities from new clinical text.\n\n"
    "Below is the content of clinical training document:\n"
    f"\"\"\"\n{sample_document}\n\"\"\""
)

triple_quote_note = (
    "Clinical Text Input\n"
    "Below, you will be given a clinical text document enclosed within triple quotes (\"\"\" \"\"\"). "
    "Your task is to extract relevant medical entities and assign them the appropriate labels based on the definitions provided above."
)

# Format with triple quotes
clinical_text = f'""" \n{clinical_text_content}\n"""'

# Combine all into one prompt.
# The training document goes BEFORE `triple_quote_note`: that note says the
# document to annotate follows "below", so the next triple-quoted block after
# it must be the real input, not the training document.
full_prompt = "\n\n".join([
    instruction,
    source_info,
    category_definition,
    output_format,
    uncertain_handling,
    sample_training_document,
    triple_quote_note,
    f"Clinical Text Input\n{clinical_text}"  # ← actual test input
])



print(full_prompt)

Your task is to extract entities from clinical text and categorize them into predefined labels based on their role in medical documentation.

The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, including diagnoses, medications, procedures, lab results, and physician notes.

Entity Categories
Each extracted entity should be classified into one of the following categories:
Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.
Examples:
entity="colonoscopy" label="test"
entity="MRI scan" label="test"
entity="blood glucose test" label="test"

Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.
Examples:
entity="ramipril" label="treatment"
entity="chemotherapy" label="treatment"
entity="insulin therapy" label="treatment"

Problem: A medical condition, symptom, or diagnosis affecting a patient, whic

Prompt: Entity-Seen

In [None]:
from pathlib import Path

# Path to the list of training entities used as in-prompt examples
sample_file_path = Path("../Data/processed/llm/entity_only/train/all_entities_small.txt")

# Read the entity list. It is wrapped in triple quotes further down, when
# `sample_training_entity_all` is assembled.
with open(sample_file_path, "r", encoding="utf-8") as f:
    all_entity_list = f.read().strip()

# Define meaningful prompt parts
instruction = (
    "Your task is to extract entities from clinical text and categorize them into predefined labels "
    "based on their role in medical documentation."
)

source_info = (
    "The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, "
    "including diagnoses, medications, procedures, lab results, and physician notes."
)

category_definition = (
    "Entity Categories\n"
    "Each extracted entity should be classified into one of the following categories:\n"
    "Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.\n"
    "Examples:\n"
    "entity=\"colonoscopy\" label=\"test\"\n"
    "entity=\"MRI scan\" label=\"test\"\n"
    "entity=\"blood glucose test\" label=\"test\"\n\n"
    "Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.\n"
    "Examples:\n"
    "entity=\"ramipril\" label=\"treatment\"\n"
    "entity=\"chemotherapy\" label=\"treatment\"\n"
    "entity=\"insulin therapy\" label=\"treatment\"\n\n"
    "Problem: A medical condition, symptom, or diagnosis affecting a patient, which may require testing or treatment.\n"
    "Examples:\n"
    "entity=\"diabetes mellitus\" label=\"problem\"\n"
    "entity=\"chest pain\" label=\"problem\"\n"
    "entity=\"hypertension\" label=\"problem\""
)

output_format = (
    "Expected Output Format\n"
    "Entities should be extracted and presented in the following format:\n"
    "entity=\"<extracted medical entity>\" label=\"<assigned category>\""
)

uncertain_handling = (
    "Handling Uncertain Cases\n"
    "If an entity cannot be clearly classified as test, treatment, or problem based on the information provided, "
    "label it as:\n"
    "entity=\"<extracted medical entity>\" label=\"unknown\""
)

# Explanation of the training entity list, followed by the list itself.
# This and `triple_quote_note` below are two separate assignments; nesting one
# inside the other's parentheses is a SyntaxError.
sample_training_entity_all = (
    "Sample Training Entities\n"
    "You will be given a list of sample entities enclosed within triple quotes (\"\"\" \"\"\"). "
    "You will find the list of test, treatment, and problem entities.\n\n"
    "These examples serve as a training guide and should help the system understand how medical terms are categorized based on their role in clinical documentation.\n"
    "You should use these examples to infer how to extract and label entities from the testing clinical text.\n\n"
    "Below is the list of training entities enclosed into square bracket. Each entity is comma separated.\n"
    f"\"\"\"\n{all_entity_list}\n\"\"\""
)

triple_quote_note = (
    "Clinical Text Input\n"
    "Below, you will be given a clinical text document enclosed within triple quotes (\"\"\" \"\"\"). "
    "Your task is to extract relevant medical entities and assign them the appropriate labels based on the definitions provided above."
)

# Format with triple quotes
clinical_text = f'""" \n{clinical_text_content}\n"""'

# Combine all into one prompt.
# The entity list goes BEFORE `triple_quote_note`: that note says the document
# to annotate follows "below", so the next triple-quoted block after it must
# be the real input, not the training entities.
full_prompt = "\n\n".join([
    instruction,
    source_info,
    category_definition,
    output_format,
    uncertain_handling,
    sample_training_entity_all,
    triple_quote_note,
    f"Clinical Text Input\n{clinical_text}"  # ← actual test input
])



print(full_prompt)

Your task is to extract entities from clinical text and categorize them into predefined labels based on their role in medical documentation.

The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, including diagnoses, medications, procedures, lab results, and physician notes.

Entity Categories
Each extracted entity should be classified into one of the following categories:
Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.
Examples:
entity="colonoscopy" label="test"
entity="MRI scan" label="test"
entity="blood glucose test" label="test"

Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.
Examples:
entity="ramipril" label="treatment"
entity="chemotherapy" label="treatment"
entity="insulin therapy" label="treatment"

Problem: A medical condition, symptom, or diagnosis affecting a patient, whic

Prompt: Entity Unseen

In [63]:
from pathlib import Path

# Path to the training entity list that leaves out the entities of test note 0001
sample_file_path = Path("../Data/processed/llm/entity_only/train/all_entities_but_test_0001_small.txt")

# Read the entity list. It is wrapped in triple quotes further down, when
# `sample_training_entity_all_but_test` is assembled.
with open(sample_file_path, "r", encoding="utf-8") as f:
    all_entity_list_but_test = f.read().strip()

# Define meaningful prompt parts
instruction = (
    "Your task is to extract entities from clinical text and categorize them into predefined labels "
    "based on their role in medical documentation."
)

source_info = (
    "The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, "
    "including diagnoses, medications, procedures, lab results, and physician notes."
)

category_definition = (
    "Entity Categories\n"
    "Each extracted entity should be classified into one of the following categories:\n"
    "Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.\n"
    "Examples:\n"
    "entity=\"colonoscopy\" label=\"test\"\n"
    "entity=\"MRI scan\" label=\"test\"\n"
    "entity=\"blood glucose test\" label=\"test\"\n\n"
    "Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.\n"
    "Examples:\n"
    "entity=\"ramipril\" label=\"treatment\"\n"
    "entity=\"chemotherapy\" label=\"treatment\"\n"
    "entity=\"insulin therapy\" label=\"treatment\"\n\n"
    "Problem: A medical condition, symptom, or diagnosis affecting a patient, which may require testing or treatment.\n"
    "Examples:\n"
    "entity=\"diabetes mellitus\" label=\"problem\"\n"
    "entity=\"chest pain\" label=\"problem\"\n"
    "entity=\"hypertension\" label=\"problem\""
)

output_format = (
    "Expected Output Format\n"
    "Entities should be extracted and presented in the following format:\n"
    "entity=\"<extracted medical entity>\" label=\"<assigned category>\""
)

uncertain_handling = (
    "Handling Uncertain Cases\n"
    "If an entity cannot be clearly classified as test, treatment, or problem based on the information provided, "
    "label it as:\n"
    "entity=\"<extracted medical entity>\" label=\"unknown\""
)

# Explanation of the training entity list, followed by the list itself.
sample_training_entity_all_but_test = (
    "Sample Training Entities\n"
    "You will be given a list of sample entities enclosed within triple quotes (\"\"\" \"\"\"). "
    "You will find the list of test, treatment, and problem entities.\n\n"
    "These examples serve as a training guide and should help the system understand how medical terms are categorized based on their role in clinical documentation.\n"
    "You should use these examples to infer how to extract and label entities from the testing clinical text.\n\n"
    "Below is the list of training entities enclosed into square bracket. Each entity is comma separated.\n"
    f"\"\"\"\n{all_entity_list_but_test}\n\"\"\""
)

triple_quote_note = (
    "Clinical Text Input\n"
    "Below, you will be given a clinical text document enclosed within triple quotes (\"\"\" \"\"\"). "
    "Your task is to extract relevant medical entities and assign them the appropriate labels based on the definitions provided above."
)

# Format with triple quotes
clinical_text = f'""" \n{clinical_text_content}\n"""'

# Combine all into one prompt.
# The entity list goes BEFORE `triple_quote_note`: that note says the document
# to annotate follows "below", so the next triple-quoted block after it must
# be the real input, not the training entities.
full_prompt = "\n\n".join([
    instruction,
    source_info,
    category_definition,
    output_format,
    uncertain_handling,
    sample_training_entity_all_but_test,
    triple_quote_note,
    f"Clinical Text Input\n{clinical_text}"  # ← actual test input
])



print(full_prompt)

Your task is to extract entities from clinical text and categorize them into predefined labels based on their role in medical documentation.

The source text comes from Electronic Health Records (EHRs), which are digital versions of patients' medical histories, including diagnoses, medications, procedures, lab results, and physician notes.

Entity Categories
Each extracted entity should be classified into one of the following categories:
Test: A diagnostic procedure, lab test, or medical examination performed to investigate or reveal a medical condition.
Examples:
entity="colonoscopy" label="test"
entity="MRI scan" label="test"
entity="blood glucose test" label="test"

Treatment: A medication, procedure, or intervention administered to manage, improve, or prevent a medical problem.
Examples:
entity="ramipril" label="treatment"
entity="chemotherapy" label="treatment"
entity="insulin therapy" label="treatment"

Problem: A medical condition, symptom, or diagnosis affecting a patient, whic

Generic request code for all prompts. Run exactly one of the prompt cells above first so that `full_prompt` is set, then change only `output_file_name` below.

In [3]:
# File that receives the model's raw answer for the prompt currently in use.
output_file_name = "0001_zeroshot_2.txt"  # edit this between prompt variants
output_dir = Path("../Results/test/GPT/")
predicted_file = output_dir.joinpath(output_file_name)

In [None]:
import os
import requests
import time

# Read the OpenAI API key from the environment so the secret is never stored
# in the notebook, its saved outputs, or version control.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment that starts "
        "the Jupyter kernel before running this cell."
    )

# OpenAI Chat API URL and the headers sent with the request
api_url = "https://api.openai.com/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# Request payload: a fixed system role plus whichever `full_prompt` was built
# by the most recently executed prompt cell.
data = {
    "model": "gpt-4o",  
    "messages": [
        {"role": "system", "content": "You are a clinical information extractor."},
        {"role": "user", "content": full_prompt}
    ],
    "temperature": 0.3,
    "top_p": 1.0
}


In [5]:
# Create the results folder up front so a missing directory cannot make the
# write fail after the API call has already been made.
predicted_file.parent.mkdir(parents=True, exist_ok=True)

# Start timer (perf_counter is monotonic, so the measurement is unaffected by
# system clock adjustments)
start_time = time.perf_counter()

# Send request. Without a timeout a stalled connection would block the kernel
# indefinitely; 300 s leaves room for long completions.
response = requests.post(api_url, headers=headers, json=data, timeout=300)

# End timer
end_time = time.perf_counter()
response_time = end_time - start_time
print(f"Response time: {response_time:.2f} seconds")

# An error reply (for example a rate-limit rejection) carries an "error"
# object instead of "choices". Report it with status and body rather than
# failing later with a KeyError that hides the cause.
if not response.ok:
    raise RuntimeError(
        f"OpenAI API request failed with HTTP {response.status_code}: {response.text}"
    )

# Parse response: text of the first returned choice
output_text = response.json()['choices'][0]['message']['content']

# Save to a text file
with open(predicted_file, "w", encoding="utf-8") as f:
    f.write(output_text)

print("Output saved to:", predicted_file)


Response time: 5.72 seconds
Output saved to: ..\Results\test\GPT\0001_zeroshot_2.txt


In [62]:
# Reference only: a recorded API error payload. The request asked for 31582
# tokens against a tokens-per-minute limit of 30000 and was rejected with code
# "rate_limit_exceeded". It is a bare string literal, so running this cell
# only echoes the text.
# NOTE(review): presumably produced by one of the larger few-shot prompts
# above; the notebook does not record which one.
'''
{
    "error": {
        "message": "Request too large for gpt-4o in organization org-eHMc6WjJvjEtKB4cX7h6RyoU on tokens per min (TPM): Limit 30000, Requested 31582. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.",
        "type": "tokens",
        "param": null,
        "code": "rate_limit_exceeded"
    }
}
'''

'\n{\n    "error": {\n        "message": "Request too large for gpt-4o in organization org-eHMc6WjJvjEtKB4cX7h6RyoU on tokens per min (TPM): Limit 30000, Requested 31582. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.",\n        "type": "tokens",\n        "param": null,\n        "code": "rate_limit_exceeded"\n    }\n}\n'

In [2]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.8 nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


Code to sample 100 random sentences from the first five documents

In [3]:


import os
import random
import nltk
from pathlib import Path

# sent_tokenize needs the Punkt sentence models. Recent NLTK releases
# (including the 3.9.1 installed above) load them from the "punkt_tab"
# resource, while older releases use "punkt"; fetch both so the cell works
# with either. A resource that is already present is not downloaded again.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)
from nltk.tokenize import sent_tokenize

def sample_sentences_from_files(input_dir, output_file, sample_size=100,
                                max_files=5, seed=None, tokenizer=None):
    """Write a random sample of sentences drawn from the first files of a folder.

    The regular files in ``input_dir`` are taken in sorted name order, the
    first ``max_files`` of them are split into sentences, and up to
    ``sample_size`` sentences are written to ``output_file``, one per line.

    Args:
        input_dir: Folder holding the source text files (UTF-8).
        output_file: Destination file; overwritten if it exists. It may live
            inside ``input_dir`` - it is never used as a source.
        sample_size: Maximum number of sentences to write. If fewer sentences
            are available, all of them are written (in random order).
        max_files: How many source files to read.
        seed: Optional seed for a private random generator, making the sample
            reproducible. With ``None`` the module-level ``random`` state is
            used, as before.
        tokenizer: Optional callable mapping a text to a list of sentences.
            Defaults to ``sent_tokenize``, which the surrounding cell imports
            from NLTK.
    """
    if tokenizer is None:
        tokenizer = sent_tokenize

    # Select the source files BEFORE slicing: sub-directories must not use up
    # one of the `max_files` slots, and the output of an earlier run must not
    # be fed back in when it is stored in the same folder.
    output_path = os.path.abspath(output_file)
    source_paths = []
    for fname in sorted(os.listdir(input_dir)):
        fpath = os.path.join(input_dir, fname)
        if os.path.isfile(fpath) and os.path.abspath(fpath) != output_path:
            source_paths.append(fpath)

    all_sentences = []
    for fpath in source_paths[:max_files]:
        with open(fpath, 'r', encoding='utf-8') as f:
            all_sentences.extend(tokenizer(f.read()))

    # A dedicated generator keeps a seeded run independent of other users of
    # the global random state.
    rng = random if seed is None else random.Random(seed)
    sampled_sentences = rng.sample(all_sentences, min(sample_size, len(all_sentences)))

    # Save to output file, one sentence per line
    with open(output_file, 'w', encoding='utf-8') as out:
        for sentence in sampled_sentences:
            out.write(sentence.strip() + "\n")

# The sample is drawn from the tagged training sentences and stored in the
# same folder.
input_dir = Path("../Data/processed/llm/tag_sentences/train")
output_file_name = "random_100_sentences.txt"
output_file = input_dir.joinpath(output_file_name)

# Run with the function's default sample size.
sample_sentences_from_files(input_dir, output_file)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nipua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
