In [5]:
!pip install whoosh
import time
import os
import json
import schedule
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser

# SETTINGS
PUB_LIST_URL = "https://pureportal.coventry.ac.uk/en/organisations/fbl-school-of-economics-finance-and-accounting/publications/"
DATA_FILE = "coventry_publications.json"
INDEX_DIR = "whoosh_index"
EDGE_DRIVER_PATH = r"c:\Edge driver\msedgedriver.exe"  
CRAWL_DELAY = (3, 6)  # seconds delay to be polite

def polite_sleep():
    time.sleep(randint(*CRAWL_DELAY))

def get_edge_driver(headless=False):
    options = webdriver.EdgeOptions()
    if headless:
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service(EDGE_DRIVER_PATH)
    driver = webdriver.Edge(service=service, options=options)
    return driver

def scroll_page(driver):
    # Scroll slowly to bottom to trigger lazy loading JS if any
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_pause = 1
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

def crawl_dept_publications():
    print("[*] Starting publication crawl...")
    driver = get_edge_driver(headless=False)  # Set False to debug loading visually. Set True after testing.
    driver.get(PUB_LIST_URL)
    wait = WebDriverWait(driver, 30)

    polite_sleep()
    scroll_page(driver)
    polite_sleep()
    publications = []
    page = 1

    while True:
        try:
            # Wait for any publication container
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.result-container, ul.list-results"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Try to find publication entries under div.result-container or ul.list-results
            pub_boxes = soup.select("div.result-container") or soup.select("ul.list-results > div.result-container")

            print(f"[Page {page}] Found {len(pub_boxes)} publications.")

            # If the first method returns 0, try an alternative selector on ul.list-results li.result-container
            if len(pub_boxes) == 0:
                pub_boxes = soup.select("ul.list-results > li.result-container")
                print(f"[Page {page}] Alternative selector found {len(pub_boxes)} publications.")

            for box in pub_boxes:
                try:
                    title_tag = box.find("h3").find("a")
                    if not title_tag:
                        continue
                    title = title_tag.text.strip()
                    pub_url = title_tag.get("href")
                    if not pub_url.startswith("http"):
                        pub_url = "https://pureportal.coventry.ac.uk" + pub_url

                    year_tag = box.find("span", class_="date")
                    year = year_tag.text.strip() if year_tag else ""

                    authors = [a.text.strip() for a in box.select("span[itemprop='name']")]
                    author_profiles = []
                    for a in box.select("span.person a[href]"):
                        href = a.get("href")
                        if href and not href.startswith("http"):
                            href = "https://pureportal.coventry.ac.uk" + href
                        if href:
                            author_profiles.append(href)

                    publications.append({
                        "title": title,
                        "pub_link": pub_url,
                        "year": year,
                        "authors": authors,
                        "author_profiles": author_profiles
                    })
                except Exception as e:
                    print(f"Error parsing publication: {e}")

            # Try to paginate: click next link if exists and not disabled
            try:
                next_btn = driver.find_element(By.CSS_SELECTOR, ".nextLink")
                is_disabled = next_btn.get_attribute("class")
                if not next_btn.is_enabled() or "disabled" in is_disabled:
                    print("[*] Last page reached.")
                    break
                # Scroll slightly before clicking next - to ensure button visible
                driver.execute_script("arguments[0].scrollIntoView({behavior:'smooth',block:'center'});", next_btn)
                polite_sleep()
                next_btn.click()
                page += 1
                polite_sleep()
                scroll_page(driver)
                polite_sleep()
            except NoSuchElementException:
                print("[*] No next button, finishing crawl.")
                break

        except TimeoutException:
            print("[ERROR] Timeout waiting for publications to load, stopping crawl.")
            break
        except Exception as e:
            print(f"[ERROR] Unexpected error: {e}")
            break

    driver.quit()
    print(f"[*] Crawl complete: Collected {len(publications)} publications.")
    with open(DATA_FILE, "w", encoding="utf-8") as f:
        json.dump(publications, f)

    return publications

def build_index():
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)

    schema = Schema(
        title=TEXT(stored=True),
        pub_link=ID(stored=True, unique=True),
        year=TEXT(stored=True),
        authors=TEXT(stored=True),
        author_profiles=STORED
    )
    ix = create_in(INDEX_DIR, schema)
    writer = ix.writer()
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        publications = json.load(f)

    for pub in publications:
        writer.add_document(
            title=pub["title"],
            pub_link=pub["pub_link"],
            year=pub["year"],
            authors=", ".join(pub["authors"]),
            author_profiles=pub["author_profiles"]
        )
    writer.commit()
    print("[*] Search index built.")

def search_ui():
    ix = open_dir(INDEX_DIR)
    parser = QueryParser("title", ix.schema)
    print("Coventry Scholar Search - Department Publications")
    while True:
        query = input("Enter keywords (or 'q' to quit): ").strip()
        if query.lower() == "q":
            break
        qp = parser.parse(query)
        with ix.searcher() as searcher:
            results = searcher.search(qp, limit=10)
            print(f"\nTop {len(results)} results:")
            if len(results) == 0:
                print(" No results found.")
            for i, hit in enumerate(results):
                print(f"[{i+1}] {hit['title']} ({hit['year']})")
                print(f"Authors: {hit['authors']}")
                print(f"Publication URL: {hit['pub_link']}")
                for ap in hit['author_profiles']:
                    print(f"Author Profile: {ap}")

def scheduled_crawl():
    print("\n[Scheduled] Starting scheduled crawl @", time.asctime())
    crawl_dept_publications()
    build_index()
    print("[Scheduled] Crawl and indexing complete.\n")

def run_scheduler():
    scheduled_crawl()
    schedule.every().week.do(scheduled_crawl)
    print("[*] Scheduler started - weekly crawling activated.")
    while True:
        schedule.run_pending()
        time.sleep(60*5)




In [6]:
pubs = crawl_dept_publications()
build_index()


[*] Starting publication crawl...
[Page 1] Found 50 publications.
[ERROR] Unexpected error: Message: element click intercepted: Element <a href="/en/organisations/fbl-school-of-economics-finance-and-accounting/publications/?page=1" class="nextLink" aria-label="Next page, page2">...</a> is not clickable at point (959, 298). Other element would receive the click: <div class="onetrust-pc-dark-filter ot-fade-in" style="z-index:2147483645;"></div>
  (Session info: MicrosoftEdge=139.0.3405.111); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#elementclickinterceptedexception
Stacktrace:
	GetHandleVerifier [0x0x7ff61f34e755+22117]
	(No symbol) [0x0x7ff61f29edd0]
	GetHandleVerifier [0x0x7ff61f5d4bfc+2669324]
	(No symbol) [0x0x7ff61f0bfe45]
	(No symbol) [0x0x7ff61f0bddf3]
	(No symbol) [0x0x7ff61f0bb86d]
	(No symbol) [0x0x7ff61f0baa38]
	(No symbol) [0x0x7ff61f0afcfe]
	(No symbol) [0x0x7ff61f0da7ea]
	(No symbol) [0x0x7ff61f0af

In [7]:
search_ui()


Coventry Scholar Search - Department Publications


Enter keywords (or 'q' to quit):  q


In [27]:
import json

# filename(JSON file)
filename = 'coventry_publications.json'

# Load the JSON data into a Python list
with open(filename, 'r') as file:
   documents = json.load(file)
    


In [28]:
print(type(documents))

<class 'list'>


In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from collections import defaultdict
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\navod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
stemmer= PorterStemmer()
stop_words= set(stopwords.words('english'))
                               

In [31]:
Positional_index= defaultdict(lambda: defaultdict(list))
print(Positional_index)

defaultdict(<function <lambda> at 0x000001DF793218A0>, {})


In [None]:
for doc in documents:
    if 'title' in doc:
        text = doc['title']
        tokens = word_tokenize(text.lower())
        print(tokens)
    else:
        print("No 'title' key in document:", doc)

In [39]:
Positional_index = {}

# Suppose 'documents' is a list of your documents
for doc_id, document in enumerate(documents):
    # Example tokenization, replace with your actual tokenization method
    tokens = document['title'].split()  # or your own tokenization
    position = 0
    for token in tokens:
        # Check if token is a number or stop word
        if not token.isdigit() and token.lower() not in stop_words:
            stemmed = stemmer.stem(token.lower())
            if stemmed not in Positional_index:
                Positional_index[stemmed] = {}
            if doc_id not in Positional_index[stemmed]:
                Positional_index[stemmed][doc_id] = []
            Positional_index[stemmed][doc_id].append(position)
            position += 1

# Print the index
for term, doc_positions in Positional_index.items():
    print(f"{term}: {doc_positions}")



qard: {0: [0]}
hassan: {0: [1]}
(benevol: {0: [2]}
loan): {0: [3]}
crowdfund: {0: [4]}
model: {0: [5], 34: [3]}
refuge: {0: [6]}
financ: {0: [7], 2: [6], 11: [5], 13: [4], 41: [2]}
assess: {1: [0], 26: [0]}
determin: {1: [1]}
particip: {1: [2], 28: [7]}
circular: {1: [3], 12: [4], 28: [11], 34: [4]}
plastic: {1: [4], 28: [10], 34: [5]}
economi: {1: [5], 12: [5], 23: [11], 28: [12], 34: [6]}
nigerian: {1: [6], 9: [3], 45: [2]}
student: {1: [7]}
conceptualis: {2: [0]}
account: {2: [1], 17: [1], 23: [0], 30: [1], 43: [5], 46: [2]}
value-bas: {2: [2]}
concept: {2: [3]}
context: {2: [4], 35: [9]}
altern: {2: [5]}
connected: {3: [0], 18: [5]}
invest: {3: [1], 40: [1]}
strategi: {3: [2]}
volatil: {3: [3]}
assets:: {3: [4]}
dcc-garch: {3: [5]}
r2: {3: [6]}
analysi: {3: [7], 17: [6], 20: [4], 35: [7]}
ofcryptocurr: {3: [8]}
emerg: {3: [9], 23: [10], 46: [3]}
market: {3: [10], 8: [2], 37: [7], 39: [1]}
sector: {3: [11]}
corpor: {4: [0], 6: [0], 19: [5], 31: [0], 32: [0], 39: [4], 43: [2]}
social

In [10]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load your data
with open('coventry_publications.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract texts
documents = [item['title'] for item in data]

# Proceed with the TF-IDF vectorization and similarity calculation as above
query = "performance and financial distress during COVID-19"

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
query_vec = vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Get and display top results
top_indices = cosine_similarities.argsort()[::-1]
for idx in top_indices[:5]:  # top 5 results
    print(f"Score: {cosine_similarities[idx]:.4f} - {documents[idx]}")

Score: 0.7164 - ESG performance and financial distress during COVID-19:  the moderating effects of innovation and capital intensity
Score: 0.1811 - Does the CEO effect differ in times of crisis? Evidence from US and China during the global financial crisis
Score: 0.1685 - ESG disclosure and financial performance of multinational enterprises: The moderating effect of board standing committees
Score: 0.1386 - Corporate Financial Hedging and the Cost of Equity Capital
Score: 0.1050 - Does Digital Orientation Enhance Firm Performance?


In [5]:
with open('coventry_publications.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(data[0])

{'title': 'A Qard Hassan (Benevolent Loan) Crowdfunding Model for Refugee Finance', 'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/a-qard-hassan-benevolent-loan-crowdfunding-model-for-refugee-fina', 'year': '11 Feb 2025', 'authors': [], 'author_profiles': []}
