In [3]:
pip install whoosh

Collecting whoosh
  Obtaining dependency information for whoosh from https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl.metadata
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
   ---------------------------------------- 0.0/468.8 kB ? eta -:--:--
   - ------------------------------------- 20.5/468.8 kB 640.0 kB/s eta 0:00:01
   ------- -------------------------------- 92.2/468.8 kB 1.3 MB/s eta 0:00:01
   ---------------------------- ----------- 337.9/468.8 kB 3.0 MB/s eta 0:00:01
   ---------------------------------------- 468.8/468.8 kB 3.3 MB/s eta 0:00:00
Installing collected packages: whoosh
Successfully installed whoosh-2.7.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import shutil
from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.writing import AsyncWriter

schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True), path=ID(stored=True))

def create_or_open_index(index_dir):
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    if exists_in(index_dir):
        return open_dir(index_dir)
    return create_in(index_dir, schema)

def index_documents(processed_dir, index_dir):
    index = create_or_open_index(index_dir)
    with AsyncWriter(index) as writer:
        for filename in os.listdir(processed_dir):
            if filename.endswith('.txt'):
                file_path = os.path.join(processed_dir, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                title = filename[:-4]  # Removes the .txt extension
                writer.add_document(title=title, content=content, path=file_path)
                print(f"Indexed {filename}")

def main(root_dir):
    for dirpath, dirnames, filenames in os.walk(root_dir, topdown=True):
        dirnames[:] = [d for d in dirnames if d != '.git' and 'Index' not in d]  # Exclude .git and Index directories
        if 'Processed' in dirnames:
            processed_dir = os.path.join(dirpath, 'Processed')
            index_dir = os.path.join(dirpath, 'Index')
            if not os.path.exists(index_dir):
                os.makedirs(index_dir)
            index_documents(processed_dir, index_dir)
            # Copying files from 'Processed' to 'Index'
            for filename in os.listdir(processed_dir):
                src_file = os.path.join(processed_dir, filename)
                dst_file = os.path.join(index_dir, filename)
                if not os.path.exists(dst_file):
                    shutil.copy2(src_file, dst_file)
                    print(f"Copied {filename} to {index_dir}")

if __name__ == "__main__":
    root_directory = r'C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources'
    main(root_directory)


Indexed Alzubaidi_2021_Revew DL concepts_processed.txt
Indexed Angermueller-2016-DL_processed.txt
Indexed Cao-2018-DL_processed.txt
Indexed Chahal_2019_Machine Learning and Deep Learning_processed.txt
Indexed Chen-2014-DL_processed.txt
Indexed Ching-2018-DL_processed.txt
Indexed Christin-2019-DL_processed.txt
Indexed Deng-2014-DL_processed.txt
Indexed Dong-2021-DL_processed.txt
Indexed Ghosh-2019-DL_processed.txt
Indexed Guo-2016-DL_processed.txt
Indexed Janiesch,Janiesch,Heinrich3_2021_Machine learning and deep learning_processed.txt
Indexed Kamilaris-2018-DL_processed.txt
Indexed LeCun-2015-DL_processed.txt
Indexed Lu,Ehwerhemuepha,Rakovski_2022_DLM for text classifcation_processed.txt
Indexed Mamoshina-2016-DL_processed.txt
Indexed Min-2017-DL_processed.txt
Indexed Minar-2018-DL_processed.txt
Indexed Mu-2019-DL_processed.txt
Indexed Niu-2021-DL_processed.txt
Indexed Paramasivam_KNN-Based Machine Learning Classifier Used on Deep Learned Spatial Motion Features for Human Action Recogn

In [5]:
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
import os

INDEX_ROOT = r'C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources'

def get_index_directories(root_dir):
    for dirpath, dirnames, _ in os.walk(root_dir):
        if 'Index' in dirnames:
            yield os.path.join(dirpath, 'Index')

def perform_search(search_query):
    results = []
    for index_dir in get_index_directories(INDEX_ROOT):
        try:
            ix = open_dir(index_dir)
            with ix.searcher() as searcher:
                query = QueryParser("content", ix.schema).parse(search_query)
                whoosh_results = searcher.search(query, limit=10)  
                for hit in whoosh_results:
                    results.append({
                        'title': hit['title'],
                        'score': hit.score,
                        'snippet': hit.highlights("content"),
                        'path': hit['path']
                    })
        except Exception as e:
            print(f"Error accessing index at {index_dir}: {e}")
    results.sort(key=lambda x: x['score'], reverse=True)
    return results

search_query = "health" 
results = perform_search(search_query)
results  


Error accessing index at C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\.git\hooks\Index: Index 'MAIN' does not exist in FileStorage('C:\\Users\\steph\\OneDrive\\Documents\\GitHub\\ML-Literature-Search-Engine-Resources\\.git\\hooks\\Index')
Error accessing index at C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\.git\hooks\Index\Index: Index 'MAIN' does not exist in FileStorage('C:\\Users\\steph\\OneDrive\\Documents\\GitHub\\ML-Literature-Search-Engine-Resources\\.git\\hooks\\Index\\Index')
Error accessing index at C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\.git\hooks\Index\Index\Index: Index 'MAIN' does not exist in FileStorage('C:\\Users\\steph\\OneDrive\\Documents\\GitHub\\ML-Literature-Search-Engine-Resources\\.git\\hooks\\Index\\Index\\Index')
Error accessing index at C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\.git\hooks\Index\Index\Index\Index: In

[{'title': 'Stevens_2023_ASDR_processed',
  'score': 4.091159333656623,
  'snippet': 'patient electronic <b class="match term0">health</b> record wa conducted',
  'path': 'C:\\Users\\steph\\OneDrive\\Documents\\GitHub\\ML-Literature-Search-Engine-Resources\\Unsupervised Learning\\Processed\\Stevens_2023_ASDR_processed.txt'},
 {'title': 'Park_2023_VisionTransformer_processed',
  'score': 3.954250463871494,
  'snippet': 'program korea <b class="match term0">health</b> industry development...funded ministry <b class="match term0">health</b> welfare republic...annamaraju haydel <b class="match term0">health</b> insurance portability',
  'path': 'C:\\Users\\steph\\OneDrive\\Documents\\GitHub\\ML-Literature-Search-Engine-Resources\\Supervised Learning\\Processed\\Park_2023_VisionTransformer_processed.txt'},
 {'title': 'Truong_2017_EpilepticSeizureDetection_processed',
  'score': 3.5362392327556846,
  'snippet': 'journal biomedical <b class="match term0">health</b> informatics 19 ensemble...j