### Crawler and search system

In [4]:
import os
import requests
from bs4 import BeautifulSoup 
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.index import create_in, open_dir
from urllib.parse import urljoin
from whoosh.qparser import QueryParser
from whoosh import scoring

In [5]:
BASE_URL = "https://pureportal.coventry.ac.uk/en/organisations/ihw-centre-for-health-and-life-sciences-chls/publications/"
INDEX_PATH = "storage"

In [6]:

def crawl():
    response = requests.get(BASE_URL)
    soup = BeautifulSoup(response.text, "html.parser")
                         
    schema = Schema(title=TEXT(stored=True), authors=TEXT(stored=True), year=ID(stored=True),publication_url=ID(stored=True, unique=True))
                         
    if not os.path.exists(INDEX_PATH):
        os.mkdir(INDEX_PATH)
                         
    ix = create_in(INDEX_PATH, schema)
    writer = ix.writer()
                         
    # Extract publication information
    for publication_div in soup.find_all('div', class_='result-container'):
        title = publication_div.find('h3', class_='title')
        title = title.text.strip()
                         
                         
        authors_tags = publication_div.find_all('span')
        authors = [author.text.strip() for author in authors_tags]
                         
        year = publication_div.find('span', class_='date')
        year = year.text.strip() if year else ["N/A"]
                         
        url = publication_div.find('a', class_='link person')
        url = urljoin(BASE_URL, url['href'])
                         
        writer.add_document(title=title, authors=' ,'.join(authors), year=year, publication_url=url)
                         
    print("commiting please wait")
    writer.commit()
    print("data download completed")

                         

In [7]:
crawl()

TypeError: 'NoneType' object is not subscriptable

In [None]:
def search(query):
    ix = open_dir(INDEX_PATH)
    
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        query_parser = QueryParser("title", ix.schema)
        query = query_parser.parse(query)
        results = searcher.search(query, terms=True)
        
        for result in results:
            # print(result)
            print(f"Title: {result['title']}")
            print(f"Authors: {result['authors']}")
            print(f"Publication Date: {result['year']}")
            print(f"Publication URL: {result['publication_url']}")
            print("\n")

In [55]:
user_query = input("Enter your query:")
search(user_query)

Enter your query: Barriers


Title: Barriers and Facilitators to Participation and Key Components of Sleep Health Programs: perspectives for the corporate work environment
Authors: Barriers and Facilitators to Participation and Key Components of Sleep Health Programs: perspectives for the corporate work environment ,Pienaar, P. R. ,Bosma, A. ,Rae, D. E. ,Roden, L. ,van Mechelen, W. ,Lambert, E. V. ,Boot, C. R. L. ,1 Jan 2024 ,In: Journal of Occupational and Environmental Medicine. ,Journal of Occupational and Environmental Medicine. ,66 ,1 ,p. 35-42 ,8 p. ,Research output: ,: ,Contribution to journal › ,› ,Article ,› peer-review ,› ,Health
100% , ,Health ,100% ,Sleep
100% , ,Sleep ,100% ,Health Program
100% , ,Health Program ,100% ,Work Environment
100% , ,Work Environment ,100% ,Workplace
75% , ,Workplace ,75%
Publication Date: 1 Jan 2024
Publication URL: https://pureportal.coventry.ac.uk/en/persons/laura-roden


