<a href="https://colab.research.google.com/github/sonia73b/tech400w2asst/blob/main/TECH400W2ASST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Fetch the NLTK data packages used further down. Keep this cell first so a
# fresh kernel (Restart -> Run All) has them before the corpus imports run.
nltk.download ("stopwords")  # stop-word lists, read through nltk.corpus.stopwords
nltk.download ("punkt")  # tokenizer data (unpacked under tokenizers/)
nltk.download ("wordnet")  # WordNet corpus; presumably what WordNetLemmatizer reads -- confirm
nltk.download ("punkt_tab")  # additional tokenizer data (unpacked under tokenizers/)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import os, string, logging, re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# English stop-word list from the NLTK stopwords corpus, held as a set.
# NOTE(review): nothing in the visible cells reads STOPWORDS -- clean_text()
# does not filter stop words. Confirm whether that is intentional.
STOPWORDS = set(stopwords.words('english'))
# Single shared lemmatizer instance, used by clean_text().
LEMMATIZER = WordNetLemmatizer()

In [None]:
def load_text_files(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path``.

    Files are visited in sorted filename order so that the doc_id given to
    each file is the same on every run (``os.listdir`` alone returns entries
    in arbitrary order). Sub-folders are not searched.

    Args:
        folder_path: Directory to scan.

    Returns:
        A ``(data, doc_id_to_filename)`` tuple where ``data`` maps
        doc_id (int, starting at 0) to the file's text and
        ``doc_id_to_filename`` maps the same doc_id to the file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = {}
    doc_id_to_filename = {}

    print(f"Scanning folder: {folder_path}")
    for filename in sorted(os.listdir(folder_path)):
        print(f"Found file: {filename}")
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        # A directory can also be named "something.txt"; open() would fail on it.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Ids are dense and 0-based: the next id is the number loaded so far.
        doc_id = len(data)
        data[doc_id] = content
        doc_id_to_filename[doc_id] = filename
        print(f"Loaded doc_id {doc_id} -> {filename}")

    print(f"Total files loaded: {len(data)}")
    return data, doc_id_to_filename


In [None]:
def clean_text(text):
    """Turn raw text into a list of lemmatized, lower-case tokens.

    Steps: lower-case the text, delete every character that is not a letter,
    digit or whitespace (so "ICD-10" becomes "icd10"), tokenize with
    ``word_tokenize``, drop tokens of length 1, and lemmatize the rest.

    Args:
        text: The raw document (or query) string.

    Returns:
        The surviving tokens, lemmatized, in their original order.
    """
    stripped = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())

    lemmas = []
    for token in word_tokenize(stripped):
        # Single-character tokens are discarded.
        if len(token) <= 1:
            continue
        lemmas.append(LEMMATIZER.lemmatize(token))
    return lemmas

In [None]:
def build_inverted_index(data):
    """Build the term -> documents index and corpus-wide term counts.

    Args:
        data: Mapping of doc_id to raw document text.

    Returns:
        A ``(inverted_index, term_frequencies)`` tuple:
        ``inverted_index`` is a ``defaultdict(set)`` mapping each token
        produced by ``clean_text`` to the set of doc_ids containing it;
        ``term_frequencies`` is a ``Counter`` of how many times each token
        occurs across all documents.
    """
    postings = defaultdict(set)
    counts = Counter()

    for doc_id, content in data.items():
        doc_tokens = clean_text(content)
        # Count every occurrence, in token order.
        counts.update(doc_tokens)
        # Record the document once per distinct term (the set absorbs repeats).
        for term in doc_tokens:
            postings[term].add(doc_id)

    return postings, counts


In [None]:
def boolean_query(query, inverted_index, doc_id_to_filename, normalize=None):
    """Evaluate a simple boolean query against the inverted index.

    Supported forms (operators are case-insensitive):

    * ``term``                 -- documents containing the term
    * ``a AND b [AND c ...]``  -- documents containing every operand
    * ``a OR b [OR c ...]``    -- documents containing at least one operand
    * ``NOT term``             -- documents that do not contain the term
    * ``NOT`` may also prefix an operand of AND / OR, e.g. ``a AND NOT b``

    There is no operator precedence and no parentheses: a query containing
    ``and`` is evaluated as one conjunction of all its operands; otherwise a
    query containing ``or`` is one disjunction. Several terms with no binary
    operator between them are ANDed together.

    Each operand is passed through ``normalize`` before lookup so that query
    terms are compared in the same form as the index keys (for example
    "ICD-10" is looked up as "icd10" and plurals as their lemma). Without
    this step such terms can never match.

    Args:
        query: The query string.
        inverted_index: Mapping of token -> set of doc_ids.
        doc_id_to_filename: Mapping of doc_id -> file name; its keys define
            the universe of documents used for ``NOT``.
        normalize: Optional callable ``str -> list[str]`` applied to each
            operand. Defaults to ``clean_text``, the function used to build
            the index. If it returns several tokens for one operand, the
            operand matches documents containing all of them; if it returns
            none, the operand matches no document.

    Returns:
        File names of the matching documents, ordered by doc_id. An empty
        list when nothing matches or the query has no operands.
    """
    if normalize is None:
        # Resolved at call time so the index and the query share one pipeline.
        normalize = clean_text

    query = query.lower()
    tokens = query.split()
    all_docs = set(doc_id_to_filename)

    def postings_for(term):
        """Return the set of doc_ids matching one raw query operand."""
        keys = normalize(term)
        if not keys:
            return set()
        docs = None
        for key in keys:
            # .get() never inserts, so a defaultdict index is not polluted
            # with empty entries for unknown terms.
            found = inverted_index.get(key, set())
            docs = set(found) if docs is None else docs.intersection(found)
        return docs

    # Turn the token stream into one document set per operand, applying a
    # preceding NOT as a complement against the full document universe.
    operands = []
    negate_next = False
    for token in tokens:
        if token == "not":
            negate_next = True
        elif token in ("and", "or"):
            continue
        else:
            docs = postings_for(token)
            operands.append(all_docs - docs if negate_next else docs)
            negate_next = False

    if not operands:
        # Covers "", "and", "not", ... which previously raised IndexError.
        result_set = set()
    elif "or" in tokens and "and" not in tokens:
        result_set = set().union(*operands)
    else:
        result_set = set.intersection(*operands)

    # Sorting by doc_id makes the output order deterministic.
    result_filenames = [
        doc_id_to_filename[doc_id]
        for doc_id in sorted(result_set)
        if doc_id in doc_id_to_filename
    ]
    logging.info("Query '%s' resulted in: %s", query, result_filenames)
    return result_filenames



In [None]:
def generate_queries_file(term_frequencies, output_file="queries.txt", top_n=5):
    """Write the notebook's sample boolean queries to ``output_file``.

    The queries are a fixed list, one per line, written as UTF-8.

    Args:
        term_frequencies: Accepted for interface compatibility; not used.
        output_file: Path of the file to create or overwrite. Previously
            this argument was ignored and "queries.txt" was always written.
        top_n: Accepted for interface compatibility; not used.
    """
    queries = ["EHR AND patient", "ICD-10 OR SNOMED", "NOT telehealth"]
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(q + "\n" for q in queries)
    print(f"Generated queries file: {output_file}")

In [None]:
def main(folder_path=r"/content/drive/MyDrive/TECH400W2ASST",
         results_file="query_results.txt"):
    """Run the whole pipeline: load, index, write queries, evaluate them.

    Args:
        folder_path: Directory holding the ``.txt`` documents. The default is
            the original Colab Google Drive location.
            NOTE(review): no Drive-mount cell is visible in this notebook, so
            the default presumably relies on Drive being mounted beforehand.
        results_file: Path of the file the query results are written to.

    Side effects:
        Prints progress to stdout, writes the queries file through
        ``generate_queries_file`` and writes one result line per query to
        ``results_file``.
    """
    data, doc_id_to_filename = load_text_files(folder_path)

    # Preview the first 20 cleaned tokens of each document as a sanity check.
    for doc_id, content in data.items():
        tokens = clean_text(content)
        print(f"Doc {doc_id} cleaned tokens:", tokens[:20])

    inverted_index, term_frequencies = build_inverted_index(data)
    print("Sample inverted index keys:", list(inverted_index.keys())[:20])

    generate_queries_file(term_frequencies)

    queries = ["EHR AND patient", "ICD-10 OR SNOMED", "NOT telehealth"]

    with open(results_file, 'w', encoding='utf-8') as result_file:
        for query in queries:
            result = boolean_query(query, inverted_index, doc_id_to_filename)
            result_str = f"Results for '{query}': {result}\n"
            print(result_str)
            result_file.write(result_str)

# Entry point: run the pipeline when this code executes as the main module.
# Executing this notebook cell triggers it (see the output below); importing
# the code as a module does not.
if __name__ == "__main__":
    main()

Scanning folder: /content/drive/MyDrive/TECH400W2ASST
Found file: Electronic Health Records (EHR).txt
Loaded doc_id 0 -> Electronic Health Records (EHR).txt
Found file: Interoperability.txt
Loaded doc_id 1 -> Interoperability.txt
Found file: Virtualcare.txt
Loaded doc_id 2 -> Virtualcare.txt
Found file: Clinical Decision Support Systems.txt
Loaded doc_id 3 -> Clinical Decision Support Systems.txt
Found file: HIPAA.txt
Loaded doc_id 4 -> HIPAA.txt
Found file: Health data analytics.txt
Loaded doc_id 5 -> Health data analytics.txt
Found file: E-prescribing.txt
Loaded doc_id 6 -> E-prescribing.txt
Found file: The Internet of Medical Things.txt
Loaded doc_id 7 -> The Internet of Medical Things.txt
Found file: Clinical coding standards.txt
Loaded doc_id 8 -> Clinical coding standards.txt
Found file: Change management.txt
Loaded doc_id 9 -> Change management.txt
Total files loaded: 10
Doc 0 cleaned tokens: ['electronic', 'health', 'record', 'ehr', 'are', 'digital', 'version', 'of', 'patient',