In [2]:
!pip3 install whoosh

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


In [75]:
import os
import os.path

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import LockError

In [113]:
# Crawler to fetch data from the Coventry University Research Centre for Health and Life Sciences (RCHL) portal
def crawl_and_index(base_url, index_path, timeout=30):
    """Crawl a portal listing page and (re)build a Whoosh index from it.

    Every ``div.result-container`` element on the page at ``base_url`` becomes
    one indexed document with the stored fields ``title``, ``authors``,
    ``year``, ``publication_url`` and ``author_profile_url``. Values that
    cannot be found in the markup are stored as the string ``"N/A"``.

    The page is downloaded and parsed completely *before* the index is
    touched, so a network/HTTP failure leaves an existing index intact.
    On success the index in ``index_path`` is replaced (``create_in``).

    Args:
        base_url: URL of the listing page; also the base for relative links.
        index_path: Directory holding the Whoosh index; created if missing.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: the page could not be fetched, or the
            server answered with an HTTP error status.
        whoosh.index.LockError: another writer holds the index write lock.
    """
    # Fetch the page containing the list of publications. Fail loudly on HTTP
    # errors instead of silently indexing an error page.
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract everything up front so that a parsing problem cannot leave a
    # freshly wiped, half-written index behind.
    records = [
        _extract_publication(publication_div, base_url)
        for publication_div in soup.find_all('div', class_='result-container')
    ]

    # Initialize Whoosh index
    schema = Schema(title=TEXT(stored=True), authors=TEXT(stored=True), year=ID(stored=True),
                    publication_url=ID(stored=True, unique=True), author_profile_url=ID(stored=True))

    os.makedirs(index_path, exist_ok=True)
    ix = create_in(index_path, schema)

    # The write lock is taken when the writer is created, so this is the call
    # that can raise LockError. Lock files are deliberately NOT deleted here:
    # doing so while another writer is active can corrupt the index.
    try:
        writer = ix.writer()
    except LockError as e:
        print(f"LockError: {e}")
        print("Another writer holds the index lock; retry once it has finished.")
        raise

    # Add data to the Whoosh index; cancel on any failure so the write lock is
    # released instead of being leaked for the lifetime of the kernel.
    try:
        for record in records:
            writer.add_document(**record)
    except Exception:
        writer.cancel()
        raise

    # Commit changes to the index
    print("Committing please wait...")
    writer.commit()
    print("Finished")


def _absolute_href(tag, base_url):
    """Return the tag's ``href`` resolved against ``base_url``, or ``"N/A"``."""
    href = tag.get('href') if tag is not None else None
    return urljoin(base_url, href) if href else "N/A"


def _extract_publication(publication_div, base_url):
    """Turn one ``result-container`` element into the index field dict."""
    title_tag = publication_div.find('h3', class_="title")
    title = title_tag.get_text(strip=True) if title_tag is not None else "N/A"

    author_tags = publication_div.find_all('a', class_='link person')
    authors = [author.text.strip() for author in author_tags] or ["N/A"]

    year_tag = publication_div.find('span', class_='date')
    year = year_tag.text.strip() if year_tag is not None else "N/A"

    # The original selector (an <a> carrying class "title") yielded "N/A" for
    # every result in the recorded run, so fall back to the first link inside
    # the title heading.
    # NOTE(review): assumes the heading wraps the result's own link - confirm
    # against the live markup.
    publication_link = publication_div.find('a', class_='title')
    if publication_link is None and title_tag is not None:
        publication_link = title_tag.find('a', href=True)

    # The profile URL is that of the first listed author, if any.
    first_author_tag = author_tags[0] if author_tags else None

    return {
        'title': title,
        'authors': ', '.join(authors),
        'year': year,
        'publication_url': _absolute_href(publication_link, base_url),
        'author_profile_url': _absolute_href(first_author_tag, base_url),
    }

In [116]:
# Function to search the index
def search(query, index_path):
    """Look up ``query`` in the Whoosh index at ``index_path`` and print the hits.

    The query string is parsed with ``title`` as the default field and scored
    with TF-IDF weighting. No ``limit`` is passed to the searcher, so Whoosh's
    default result cap applies. For each hit the stored fields are printed one
    per line, followed by a blank separator. Nothing is returned.

    Args:
        query: Query string in Whoosh query-parser syntax.
        index_path: Directory containing an existing Whoosh index.
    """
    index = open_dir(index_path)

    # (label, stored field) pairs in print order; the labels carry their
    # trailing whitespace exactly as it is emitted.
    display_fields = (
        ("Title: ", "title"),
        ("Authors:  ", "authors"),
        ("Year: ", "year"),
        ("Publication URL: ", "publication_url"),
        ("Author Profile URL: ", "author_profile_url"),
    )

    with index.searcher(weighting=scoring.TF_IDF()) as searcher:
        parsed_query = QueryParser("title", index.schema).parse(query)
        hits = searcher.search(parsed_query, terms=True)

        # Display search results
        for hit in hits:
            for label, field in display_fields:
                print(f"{label}{hit[field]}")
            print("\n")

In [117]:
# Example usage: build the index from the portal listing page, then run one
# interactive query against it.
base_url = "https://pureportal.coventry.ac.uk/en/organisations/centre-for-health-and-life-sciences"
index_path = "storage"

# Crawl and index data (this should be scheduled to run once per week).
# Each call fetches base_url over the network and creates a new index in
# index_path via create_in.
crawl_and_index(base_url, index_path)

# Search for publications; the query is parsed against the "title" field.
# Sample query: Barriers and Facilitators
user_query = input("Enter your query: ")
search(user_query, index_path)

Barriers and Facilitators to Participation and Key Components of Sleep Health Programs: perspectives for the corporate work environment ['Roden, L.'] 1 Jan 2024 N/A https://pureportal.coventry.ac.uk/en/persons/laura-roden
Human gut microbiota and endocrinology: paradigm shift from genome to its regulation ['Turner, M. C.', 'Morozov, I.'] 22 Jan 2024 N/A https://pureportal.coventry.ac.uk/en/persons/mark-turner
SGABU computational platform for multiscale modeling: Bridging the gap between education and research ['Jakovljević, D.'] 1 Jan 2024 N/A https://pureportal.coventry.ac.uk/en/persons/djordje-jakovljevic
Hatem Ali ['Hatem Ali'] N/A N/A https://pureportal.coventry.ac.uk/en/persons/hatem-ali
Sara Anisi ['Sara Anisi'] N/A N/A https://pureportal.coventry.ac.uk/en/persons/sara-anisi
Soraya Anisi ['Soraya Anisi'] N/A N/A https://pureportal.coventry.ac.uk/en/persons/soraya-anisi
STRATIFYHF:Artificial intelligence-based decision support system for risk stratification and early detection of 

Enter your query:  Awards


Title: Innovate UK KTP Awards
Authors:  Farnaud, Sebastien
Year: N/A
Publication URL: N/A
Author Profile URL: https://pureportal.coventry.ac.uk/en/persons/sebastien-farnaud


