# Information Retrieval Coursework (7071CEM)

Task:
Develop a vertical search engine similar to Google Scholar that only retrieves papers/books published by a member of Coventry University. As such, at least one of the co-authors must be a Coventry University Staff member. 

To this end, the profiles of academic staff at CU available on the CU web site are crawled, and their papers within their profiles are indexed.

## Package Installs

Some packages required for this task are by default not included in the jupyter notebooks package list. A pip-install of these packages is required. 

In [38]:
# Delete the "#" before each subsequent line to install the package

# %pip install scrapy
# %pip install requests
# %pip install BeautifulSoup4
# %pip install nltk
# %pip install gensim
# %pip install xgboost
# %pip install pandastable

## Imports

In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import string
import json
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Crawler Component

In [40]:
# Define Seed Page
URL = "https://pureportal.coventry.ac.uk/en/organisations/centre-for-intelligent-healthcare/publications/"

# Define profile URL format. This was obtained by manually examining the profile pages
profile_url = "https://pureportal.coventry.ac.uk/en/persons/"

The seed page, i.e coventry researchers profiles landing page, contains profiles of all academic researchers of the University. As there are up to 2206 results, a limited number are displayed at once (50).

Even if new profiles are added and a new page is required, the crawler will need to crawl through all pages to access every profile. To do this, a function is defined to retrieve the current total number of result pages. 

In [41]:
def get_maximum_page():
    
    first = requests.get(URL)
    soup = BeautifulSoup(first.text, 'html.parser')
    final_page = soup.select('#main-content > div > section > nav > ul > li:nth-child(12) > a')[0]['href']
    fp = final_page.split('=')[-1]
    return int(fp)
    
mx = get_maximum_page()    

In [42]:
print(mx)

13


Instead of crawling all researchers, this web crawler is designed to specifically find researchers who:
    
1. Have research publications
2. Are part of the "**School of Computing, Electronics and Maths**" 

In [43]:
def check_department(researcher):
    
    l1 = researcher.find('div', class_='rendering_person_short')
      
    for span in l1.find_all('span'):
        # Check department
        if span.text == str('School of Computing, Electronics and Maths'):
            name = researcher.find('h3', class_='title').find('span').text
            return name
        else:
            pass

# def create_csv():
#     database = pd.DataFrame(columns=['Title', 'Author', 'Published', 'Link'])
#     database.to_csv('database.csv')
    
def update_csv(database):
    current_data = pd.read_csv(database, index_col="Unnamed: 0")
    return current_data        

def enter_each_researchers_publication(researcher, url, df):
    
    new_url = url + str(researcher).replace(' ','-').lower() + '/publications/'
    page = requests.get(new_url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="main-content")
    papers = results.find_all("li", class_="list-result-item")
    
    
    for paper in papers:
        title = paper.find('h3', class_='title').find('span')
        author = paper.find('a', class_='link person').find('span')
        date = paper.find('span', class_="date")
        link = paper.find('h3', class_='title').find('a', href=True)['href']
        
        opening = pd.read_csv('database.csv', index_col="Unnamed: 0")
        opening = opening.append({'Title': title.text, 
                                  'Author': author.text, 
                                  'Published': date.text,
                                  'Link': link}, ignore_index=True)
   
        opening.to_csv('database.csv')
                

In [44]:
## Scrape function
def scrape(mx):
    df = update_csv('database.csv')
    i=0
    while True:
    
        if i > 17:
            break
            
        if i>0:
            url = URL + '?page=' + str(i)
        else:
            url = URL
    
        i = i+1
        # scraping starts here
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.find(id="main-content")
        researchers = results.find_all("li", class_="grid-result-item")

        for researcher in researchers:
            # Check if researcher has any papers
            check = researcher.find('div', class_='stacked-trend-widget')
            if check:
                name = check_department(researcher)
                print(name)
                if name is None:
                    pass
                else:
                    enter_each_researchers_publication(name, profile_url, df)
                    print(name)

In [45]:
scrape(mx)

In [46]:
#create_csv()
update_csv(database='database.csv') 
%time scrape(mx)

CPU times: total: 9.28 s
Wall time: 8min 29s


In [47]:
sample_db = pd.read_csv('database.csv').rename(columns={'Unnamed: 0':'SN'})
sample_db
print(f'{sample_db.shape[0]} records were scraped')

695 records were scraped


# 2. Indexing Component

In [48]:
scraped_db = pd.read_csv('database.csv').rename(columns={'Unnamed: 0':'SN'}).reset_index(drop=True)
scraped_db.head()
# scraped_db = pd.read_csv('database.csv', index_col=0)

Unnamed: 0,SN,Title,Author,Published,Link
0,0,A phantom study of a protective trolley for ne...,"Liu, H.",Feb-23,https://pureportal.coventry.ac.uk/en/publicati...
1,1,A Serious Game for Patients With Eating Disord...,"Bul, K.",27-Jan-23,https://pureportal.coventry.ac.uk/en/publicati...
2,2,Association of the COVID-19 pandemic on stroke...,"Kurmi, O.",17-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...
3,3,"Capability, Opportunity, and Motivation—Identi...","McGregor, G.",28-Jan-23,https://pureportal.coventry.ac.uk/en/publicati...
4,4,Cerebral Hemodynamics Underlying Artery-to-Art...,"Liu, H.",10-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...


In [49]:
sample_db.head(7)
#ids = scraped_db["Title"]
#scraped_db[ids.isin(ids[ids.duplicated()])]

Unnamed: 0,SN,Title,Author,Published,Link
0,0,A phantom study of a protective trolley for ne...,"Liu, H.",Feb-23,https://pureportal.coventry.ac.uk/en/publicati...
1,1,A Serious Game for Patients With Eating Disord...,"Bul, K.",27-Jan-23,https://pureportal.coventry.ac.uk/en/publicati...
2,2,Association of the COVID-19 pandemic on stroke...,"Kurmi, O.",17-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...
3,3,"Capability, Opportunity, and Motivation—Identi...","McGregor, G.",28-Jan-23,https://pureportal.coventry.ac.uk/en/publicati...
4,4,Cerebral Hemodynamics Underlying Artery-to-Art...,"Liu, H.",10-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...
5,5,Congenic hematopoietic stem cell transplantati...,"Sadozai, H.",2-Apr-23,https://pureportal.coventry.ac.uk/en/publicati...
6,6,Contactless WiFi Sensing and Monitoring for Fu...,"Shah, S. A.",2023,https://pureportal.coventry.ac.uk/en/publicati...


In [50]:
single_row = scraped_db.loc[1,:].copy()
single_row

SN                                                           1
Title        A Serious Game for Patients With Eating Disord...
Author                                                 Bul, K.
Published                                            27-Jan-23
Link         https://pureportal.coventry.ac.uk/en/publicati...
Name: 1, dtype: object

## 2.1 Preprocess Text

In [51]:
# Remove stop words
sw = stopwords.words("english")
lemmatizer = WordNetLemmatizer()




def tp1(txt):
    txt = txt.lower()   # Make lowercase
    txt = txt.translate(str.maketrans('',
                                      '',
                                      string.punctuation))   # Remove punctuation marks
    txt = lematize(txt)
    return txt


def fwpt(word):
    tag = pos_tag([word])[0][1][0].upper()
    hash_tag = {"V": wordnet.VERB, "R": wordnet.ADV,"N": wordnet.NOUN,"J": wordnet.ADJ}         
    return hash_tag.get(tag, wordnet.NOUN)

def lematize(text):
        tkns = nltk.word_tokenize(text)
        ax = ""
        for each in tkns:
            if each not in sw:
                ax += lemmatizer.lemmatize(each, fwpt(each)) + " "
        return ax


In [52]:
# Sample title
single_row['Title']

'A Serious Game for Patients With Eating Disorders (Maze Out): Pilot User Experience and Acceptance Study'

In [53]:
# Demonstration of lowercase and punctuation removal
tp1(single_row['Title'])

'serious game patient eat disorder maze pilot user experience acceptance study '

In [54]:
# Demonstration of lematization

lematize(tp1(single_row['Title']))
#lematize(single_row['Title'])

'serious game patient eat disorder maze pilot user experience acceptance study '

#### Unprocessed

In [55]:
sample_db['Title'].iloc[5]

'Congenic hematopoietic stem cell transplantation promotes survival of heart allografts in murine models of acute and chronic rejection'

#### Processed

In [56]:
scraped_db['Title'].iloc[5]

'Congenic hematopoietic stem cell transplantation promotes survival of heart allografts in murine models of acute and chronic rejection'

### 2.1.1 Preprocess entire dataframe

In [57]:
processed_db = scraped_db.copy()

def preprocess_df(df):
    df.Title = df.Title.apply(tp1)
    df.Author = df.Author.str.lower()
    df = df.drop(columns=['Author','Published'], axis=1)
    return df
    
preprocess_df(processed_db)
processed_db.head()

Unnamed: 0,SN,Title,Author,Published,Link
0,0,phantom study protective trolley neonatal radi...,"liu, h.",Feb-23,https://pureportal.coventry.ac.uk/en/publicati...
1,1,serious game patient eat disorder maze pilot u...,"bul, k.",27-Jan-23,https://pureportal.coventry.ac.uk/en/publicati...
2,2,association covid19 pandemic stroke admission ...,"kurmi, o.",17-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...
3,3,capability opportunity motivation—identifying ...,"mcgregor, g.",28-Jan-23,https://pureportal.coventry.ac.uk/en/publicati...
4,4,cerebral hemodynamics underlie arterytoartery ...,"liu, h.",10-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...


## 2.2 Index Construction

In [58]:
single = processed_db.loc[0,:].copy()
print(single)
indexing_trial = {}

words = single.Title.split()
SN = single.SN
word = words[0]
example = {word: [SN]}

print('=====================================================================')
print('Sample index')
print(example)

SN                                                           0
Title        phantom study protective trolley neonatal radi...
Author                                                 liu, h.
Published                                               Feb-23
Link         https://pureportal.coventry.ac.uk/en/publicati...
Name: 0, dtype: object
Sample index
{'phantom': [0]}


In [59]:
## Indexer Function
def apply_index(inputs, index):
    words = inputs.Title.split()
    SN = int(inputs.SN)
    for word in words:
        if word in index.keys():
            if SN not in index[word]:
                index[word].append(SN)
        else:
            index[word] = [SN]
    return index

indx = apply_index(inputs=single, index= {})
#print(indx)

In [60]:
def full_index(df, index):
    for x in range(len(df)):
        inpt = df.loc[x,:]
        ind = apply_index(inputs=inpt, index=index)
    return ind

def construct_index(df, index):
    queue = preprocess_df(df)
    ind = full_index(df=queue, index=index)
    return ind

indexed = full_index(processed_db, 
                     index = {})


indexes = construct_index(df=scraped_db, 
                          index = {})

In [61]:
with open('indexes.json', 'w') as new_f:
    json.dump(indexes, new_f, sort_keys=True, indent=4)
    
with open('indexes.json', 'r') as file:
    data = json.load(file)

def index_2(df, x_path):
    if len(df) > 0:
        with open(x_path, 'r') as file:
            prior_index = json.load(file)
        new_index = construct_index(df = df, index = prior_index)
        with open(x_path, 'w') as new_f:
            json.dump(new_index, new_f, sort_keys=True, indent=4)

In [62]:
len(data)

2383

In [63]:
data

{'05': [535, 565, 624, 626],
 '0·5': [637, 638],
 '1': [510, 584],
 '10': [584],
 '101007s11695021055129': [335],
 '101038s41598020623913': [584],
 '103389fphys2021718540': [336],
 '106911': [636],
 '10year': [565, 624],
 '10–24': [91, 380],
 '12': [336],
 '125oh2': [499],
 '127': [379],
 '1317': [308],
 '1424': [138],
 '156': [396],
 '15w': [343],
 '170618': [375],
 '187386': [308],
 '1950–2019': [380],
 '19902019': [465, 571],
 '1990–2019': [74, 91, 562, 563],
 '1year': [178],
 '2': [50,
  51,
  100,
  162,
  242,
  249,
  252,
  294,
  357,
  363,
  364,
  486,
  496,
  565,
  575,
  607,
  608,
  612,
  676],
 '2000': [301, 570],
 '2000–2018': [415],
 '2003': [424],
 '20052019': [313],
 '2009': [424],
 '2010': [94],
 '20102020': [412],
 '2010–19': [259],
 '2017': [570],
 '2018': [301],
 '2019': [73, 74, 91, 94, 234, 259, 380, 381, 465, 558, 562, 563, 571],
 '2020': [266, 337, 543, 584],
 '2021': [335, 336, 337, 514, 543],
 '204': [465, 562, 563, 571],
 '23': [248],
 '24': [688],
 '

## 3.  Query Processor

In [64]:
def demonstrate_query_processing():
    sample = input('Enter Search Terms: ')
    processed_query = tp1(sample)
    #print(f'User Search Query: {sample}')
    print(f'Processed Search Query: {processed_query}')
    return processed_query
    
#demonstrate_query_processing()

### 3.1.  Split Query into individual terms

In [65]:
def split_query(terms):
    each = tp1(terms)
    return each.split()

dqp = demonstrate_query_processing()
dqp
print(f'Split Query: {split_query(dqp)}')

Processed Search Query: diabetes 
Split Query: ['diabetes']


### 3.2.  Boolean Functionalities

In [66]:
def union(lists):
    union = list(set.union(*map(set, lists)))
    union.sort()
    return union

def intersection(lists):
    intersect = list(set.intersection(*map(set, lists)))
    intersect.sort()
    return intersect

### 3.3. Search Engine Function

In [67]:
def vertical_search_engine(df, query, index=indexes):
    query_split = split_query(query)
    retrieved = []
    for word in query_split:
        if word in index.keys():
            retrieved.append(index[word])
            
            
    # Ranked Retrieval
    if len(retrieved)>0:
        high_rank_result = intersection(retrieved)
        low_rank_result = union(retrieved) 
        c = [x for x in low_rank_result if x not in high_rank_result]      
        high_rank_result.extend(c)
        result = high_rank_result
        
        final_output = df[df.SN.isin(result)].reset_index(drop=True)
    
        # Return result in order of Intersection ----> Union
        dummy = pd.Series(result, name = 'SN').to_frame()
        result = pd.merge(dummy, final_output, on='SN', how = 'left')
        
    else:
        result = 'No result found'
    
    return result

In [68]:
def test_search_engine():
    xtest = scraped_db.copy()
    query = input("Enter your search query: ")
    return vertical_search_engine(xtest, query, indexed)
    
test_search_engine()

Unnamed: 0,SN,Title,Author,Published,Link
0,51,effect low renal acid load diet blood pressure...,"clark, c. c. t.",Dec-23,https://pureportal.coventry.ac.uk/en/publicati...
1,62,longterm effect kerala diabetes prevention pro...,"tapp, r.",22-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...
2,79,randomize tripleblind control clinical trial e...,"clark, c. c. t.",28-Jul-22,https://pureportal.coventry.ac.uk/en/publicati...
3,100,comparison nutritional supplement glycemic con...,"clark, c. c. t.",Sep-22,https://pureportal.coventry.ac.uk/en/publicati...
4,162,lifestyle factor fetal childhood origin type 2...,,Mar-22,https://pureportal.coventry.ac.uk/en/publicati...
5,209,retinal microvascular association cardiometabo...,"tapp, r. j.",Oct-22,https://pureportal.coventry.ac.uk/en/publicati...
6,242,effect probioticssynbiotics supplementation re...,"clark, c. c. t.",28-Aug-22,https://pureportal.coventry.ac.uk/en/publicati...
7,246,effect synbiotic supplementation atherogenic i...,"clark, c. c. t.",21-Jun-22,https://pureportal.coventry.ac.uk/en/publicati...
8,249,effect walnut juglans regia leaf extract glyce...,"clark, c. c. t.",25-Apr-22,https://pureportal.coventry.ac.uk/en/publicati...
9,252,effect covid19 lockdown glycaemic control lipi...,"adegboye, a. r. a.",19-Jan-22,https://pureportal.coventry.ac.uk/en/publicati...


In [69]:
def final_engine(results):
    if type(results) != 'list':
        return results
        #print(results)
    else:
        for i in range(len(results)):
            printout = results.loc[i, :]
            #print(printout['Title'])
            #print(printout['Author'])
            #print(printout['Published'])
            #print(printout['Link'])
            #print('')

In [70]:
scraped_db['Author'].iloc[24]

'abbott, s.'

In [71]:
final_engine(test_search_engine())

Unnamed: 0,SN,Title,Author,Published,Link
0,51,effect low renal acid load diet blood pressure...,"clark, c. c. t.",Dec-23,https://pureportal.coventry.ac.uk/en/publicati...
1,62,longterm effect kerala diabetes prevention pro...,"tapp, r.",22-Mar-23,https://pureportal.coventry.ac.uk/en/publicati...
2,79,randomize tripleblind control clinical trial e...,"clark, c. c. t.",28-Jul-22,https://pureportal.coventry.ac.uk/en/publicati...
3,100,comparison nutritional supplement glycemic con...,"clark, c. c. t.",Sep-22,https://pureportal.coventry.ac.uk/en/publicati...
4,162,lifestyle factor fetal childhood origin type 2...,,Mar-22,https://pureportal.coventry.ac.uk/en/publicati...
5,209,retinal microvascular association cardiometabo...,"tapp, r. j.",Oct-22,https://pureportal.coventry.ac.uk/en/publicati...
6,242,effect probioticssynbiotics supplementation re...,"clark, c. c. t.",28-Aug-22,https://pureportal.coventry.ac.uk/en/publicati...
7,246,effect synbiotic supplementation atherogenic i...,"clark, c. c. t.",21-Jun-22,https://pureportal.coventry.ac.uk/en/publicati...
8,249,effect walnut juglans regia leaf extract glyce...,"clark, c. c. t.",25-Apr-22,https://pureportal.coventry.ac.uk/en/publicati...
9,252,effect covid19 lockdown glycaemic control lipi...,"adegboye, a. r. a.",19-Jan-22,https://pureportal.coventry.ac.uk/en/publicati...


## 4. Schedule Crawler for every week

To demonstrate a weekly scheduled crawling, the following parameters are defined:

* `interval` : Represents number of days in reality. In this code, it represents only seconds for demonstration


In [72]:
# days = 0
# interval = 7
# while days <= 1:
#     scrape(mx)
#     print(f"Crawled at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
#     print(f'Next crawl scheduled after {interval} days')
#     time.sleep(interval)
#     days = days + 1

# 5. GUI

In [73]:
from tkinter import *
from tkinter import messagebox
from PIL import Image, ImageTk 
from tkinter import scrolledtext
from pandastable import Table, TableModel
from contextlib import suppress
import warnings
import tkinter as tk
from ctypes import windll
warnings.filterwarnings('ignore')

def get_screen_resolution():
    user32 = windll.user32
    width = user32.GetSystemMetrics(0)
    height = user32.GetSystemMetrics(1)
    return width, height


image1 = Image.open('coventry-university-logo.png')
resized_image1 = image1.resize((500,300))

def new_gui(image1):
    window = Tk()
    window.configure(bg='#0F1E9D')
    window.title("Coventry University")
    
    screen_width, screen_height = get_screen_resolution()

    # Set the desired width and height as a percentage of the screen resolution
    width_percent = 0.8  # 80% of the screen width
    height_percent = 0.6  # 60% of the screen height
    
    # Calculate the actual width and height
    width = int(screen_width * width_percent)
    height = int(screen_height * height_percent)
    
    # Set the window geometry
    window.geometry(f"{width}x{height}")
    
    lbl = Label(window, text="Search Engine",bg="#0F1E9D", font=("Arial Bold", 30), padx=5, pady=5)
    lbl.grid(column=1, row=0)
    
    lbl2 = Label(window, text="Enter your search query here ===>", bg="#0F1E9D",font=("Arial", 15), padx=5, pady=5)
    lbl2.grid(column=0, row=1)
    
    
    img = ImageTk.PhotoImage(image1)
    
    lbl3 = Label(image=img)
    lbl3.image = img
    lbl3.grid(column=1, row=3, padx=5, pady=5)
    
    
    
    query = Entry(window,width=40)
    query.grid(column=1, row=1,  padx=5, pady=5)
    
    results = Canvas(window, height=30, width=250)
    results.grid(column=1, row=2, padx=5, pady=5)
    
    # Entry
    def getInputBoxValue():
        userInput = query.get()
        return userInput

    
    # Button
    def clicked():
        search()
        #pass
        
    def no_result():
        messagebox.showwarning("Warning", "No results found. Please try different search terms")
        
    
    def search():
        xtest = scraped_db.copy()
        q = query.get()
        f = Frame(window)
        df = vertical_search_engine(xtest, q, indexed)
        if type(df) == str:
            no_result()
        else:
            pt = Table(results)
            try:
                table = pt = Table(results, dataframe=df)
                pt.show()
            except AttributeError:
                pass
            
    def close_window():
        if messagebox.askokcancel("Quit", "Quit Programme?"):
            window.destroy()
        
    
    btn = Button(window, text="Search",bg="#0F1E9D", command=clicked)
    btn.grid(column=2, row=1)
    
    

    window.protocol("WM_DELETE_WINDOW", close_window)       
    window.mainloop()

In [74]:
new_gui(resized_image1)