<a href="https://colab.research.google.com/github/sharan6422/information-retrieval/blob/main/website_crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#imports

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import string
import json
import nltk


In [None]:
#all the nltk libraries
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:

#other libraries
import csv
from urllib.parse import urljoin

#Web Scrapping and Crawling


In [None]:
# Robot.txt and crawl delay
def fetch_crawl_delay(url):
    robots_url = urljoin(url, "/robots.txt")
    response = requests.get(robots_url)
    if response.status_code == 200:
        robots_content = response.text
        for line in robots_content.split('\n'):
            if line.startswith("Crawl-delay:"):
                delay = float(line.split(":")[1].strip())
                return delay
    return None

In [None]:
# fetching publication details
def fetch_publication_details(url, crawl_delay):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    publications = []

    for publication in soup.select('li.list-result-item'):
        title_element = publication.select_one('h3.title > a')
        title = title_element.get_text(strip=True) if title_element else "Title Not Found"
        publication_url = title_element['href'] if title_element else "Publication URL Not Found"

        author_elements = publication.select('a.link.person')
        authors = [author.get_text(strip=True) for author in author_elements]
        author_links = [author['href'] for author in author_elements]

        publication_year_element = publication.select_one('span.date')
        publication_year = publication_year_element.get_text(strip=True) if publication_year_element else "Publication Year Not Found"

        publications.append((title, authors, publication_year, publication_url, author_links))

    # being polite
    if crawl_delay:
        time.sleep(crawl_delay)

    return publications

In [None]:
# saving to csv
def save_to_csv(publications, csv_file):
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Title', 'Authors', 'Publication_Year', 'Publication_URL', 'Author_URLs'])
        for title, authors, publication_year, publication_url, author_links in publications:
            writer.writerow([title, ", ".join(authors), publication_year, publication_url, ", ".join(author_links)])

In [None]:
# crawling and saving to csv
def main_crawl():
    url = "https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/"
    csv_file = 'publications_and_hyperlinks.csv'

    # Check robot.txt
    crawl_delay = fetch_crawl_delay(url)
    if crawl_delay:
        print("delay=", crawl_delay, "sec")

    # empty list for publication
    all_publications = []

    # variables for pagination
    current_page = 1
    count = 0

    # dictionary for staff and publications
    staff_publications = {}

    # Crawl and extract with delay
    while True:
        result_temp = fetch_publication_details(url + f"?page={current_page-1}", crawl_delay)

        if not result_temp:
            print('Pages end here!!')
            break

        if result_temp[0][0] == "Title is not available" and not result_temp[0][1]:
            break
        else:
            all_publications.extend(result_temp)
            print(f"Scraped {len(result_temp)} publications from {url}?page={current_page}")
            count += len(result_temp)
            current_page += 1

            # Update dictionary
            for _, authors, _, _, _ in result_temp:
                for author in authors:
                    if author not in staff_publications:
                        staff_publications[author] = []
                    staff_publications[author].append(_)

    print("Total publications in numbers:", count)

    # Calculate the number of staff
    num_staff = len(staff_publications)

    # Calculate the max No:of distinct publications per staff
    max_publications = max(len(publications) for publications in staff_publications.values())

    print("Number of staff whose publications are crawled (approximately):", num_staff)
    print("Num of distinct publications per staff:", max_publications)
    # Save  to a CSV
    save_to_csv(all_publications, csv_file)
    print("CSV files are saved as", csv_file)

    return len(all_publications)


if __name__ == "__main__":
    num_records_crawl = main_crawl()
    print(f"{num_records_crawl} total scrapped")


initial_db = pd.read_csv('publications_and_hyperlinks.csv').rename(columns={'Unnamed: 0':'SN'})
initial_db
print(f'{initial_db.shape[0]} records scraped')



Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=1
Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=2
Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=3
Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=4
Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=5
Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=6
Scraped 50 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=7
Scraped 2 publications from https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/?page=8
Pages end

In [None]:
import pandas as pd

# Read CSV into initial_db and rename the column to 'SN'
initial_db = pd.read_csv('publications_and_hyperlinks.csv').rename(columns={'Unnamed: 0':'SN'})

# Add 'SN' column
initial_db['SN'] = range(len(initial_db))

# Save the updated DataFrame
initial_db.to_csv('publications_and_hyperlinks.csv', index=False)

# Display
initial_db


Unnamed: 0,Title,Authors,Publication_Year,Publication_URL,Author_URLs,SN
0,A bibliometric analysis of shadow education in...,"Karakus, M.",Jul 2024,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/m...,0
1,Able or disabled: why should neurodiverse stud...,"Ayoubi, R.",25 Sept 2024,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/r...,1
2,Analysing the impact of post-pandemic factors ...,"Ayoubi, R.",13 Mar 2024,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/r...,2
3,A Review of the Work of the Positive Youth Fou...,"Morini, L., Price, C.",7 Nov 2024,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/l...,3
4,BAAL–Cambridge University Press Seminar 2024 –...,"Orsini-Jones, M.",22 Nov 2024,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/m...,4
...,...,...,...,...,...,...
347,Eǧitim örgütlerinde entelektüel sermayenin yön...,"Karakus, M.",2008,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/m...,347
348,Practising cultural sensitivity in virtual spaces,"Wimpenny, K.",30 Sept 2006,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/k...,348
349,Gender mainstreaming or just more male-streaming?,,2005,https://pureportal.coventry.ac.uk/en/publicati...,,349
350,The charismatic school leader: Potent myth or...,"Crawford, M.",1 Dec 2002,https://pureportal.coventry.ac.uk/en/publicati...,https://pureportal.coventry.ac.uk/en/persons/m...,350


In [None]:
import nltk
nltk.download('all')

#indexing
# Reset the index
scraped_initial_db = initial_db.reset_index(drop=True)

# Display few rows of the DataFrame
scraped_initial_db.head()

# Extract Title column initial_db
ids = initial_db["Title"]

# Find duplicate rows
initial_db[ids.isin(ids[ids.duplicated()])]

# Extract the row at index 1
one_row = initial_db.loc[1, :].copy()

# Display the DataFrame
one_row

def transform_text(text):
    #  lowercase
    text = text.lower()

    #  punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Lemmatize and stop words
    text = lemmatize(text)

    return text

def map_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    hash_tag = {"V": wordnet.VERB, "R": wordnet.ADV, "N": wordnet.NOUN, "J": wordnet.ADJ}
    return hash_tag.get(tag, wordnet.NOUN)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    tokens = nltk.word_tokenize(text)

    # Lemmatize and remove stop words
    lemmatized_text = ""
    for token in tokens:
        if token not in stop_words:
            lemmatized_text += lemmatizer.lemmatize(token, map_pos(token)) + " "

    return lemmatized_text

#  copy of craped DataFrame
processed_initial_db = scraped_initial_db.copy()

def transform_df(dataframe):

    # Preprocess the Title
    dataframe['Title'] = dataframe['Title'].apply(transform_text)

    # Convert Authors to lowercase
    dataframe['Authors'] = dataframe['Authors'].str.lower()

    # Drop unnecessary columns
    dataframe = dataframe.drop(columns=['Authors', 'Publication_Year'], axis=1)

    return dataframe

# Preprocess
transform_df(processed_initial_db)

# Display
processed_initial_db.head()

# Create a copy of DataFrame
single = processed_initial_db.loc[0, :].copy()

# Print the single row to inspect its content
print(single)

# Initialize dictionary to store index
indexing_trial = {}

# Split 'Title' into words
words = single.Title.split()

# Get 'SN' of single row
SN = single.SN

# Extract the first word Title
word = words[0]

# Create a dictionary word key and SN value
example = {word: [SN]}


def index_documents_for_row(row_data, word_index):
    # Split the Title
    words = row_data['Title'].split()

    # Get the SN of row
    index_sn = int(row_data['SN'])

    # Iterate through Title
    for word in words:
        # If the word is already in the word index, update the index entry
        if word in word_index.keys():
            # if the 'SN' is not there, then add
            if index_sn not in word_index[word]:
                word_index[word].append(index_sn)
        # If word not in word index, create new entry with  'SN'
        else:
            word_index[word] = [index_sn]

    return word_index

# Initialize empty dictionary to store  index
word_index = {}

# Apply indexing 'single' row using the 'word_index' dictionary
word_index = index_documents_for_row(row_data=single, word_index={})

# Display
print(word_index)


def full_index_data_frame(df, index):
    # Iterate through  DataFrame
    for x in range(len(df)):
        # current row data
        row_data = df.loc[x, :]

        # Update the word index using the current row data
        index = index_documents_for_row(row_data=row_data, word_index=index)

    return index

def construct_index(df, index):
    # Preprocess
    processed_df = transform_df(df)

    # full indexing to preprocessed DataFrame
    index = full_index_data_frame(df=processed_df, index=index)

    return index

# Construct word index for processed_initial_db DataFrame
indexed = full_index_data_frame(df=processed_initial_db, index={})

# Construct the word index for the 'scraped_db' DataFrame
indexes = construct_index(df=scraped_initial_db, index={})



[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root

Title               bibliometric analysis shadow education asia pr...
Authors                                                   karakus, m.
Publication_Year                                             Jul 2024
Publication_URL     https://pureportal.coventry.ac.uk/en/publicati...
Author_URLs         https://pureportal.coventry.ac.uk/en/persons/m...
SN                                                                  0
Name: 0, dtype: object
{'bibliometric': [0], 'analysis': [0], 'shadow': [0], 'education': [0], 'asia': [0], 'private': [0], 'supplementary': [0], 'tutor': [0], 'implication': [0]}


In [None]:
import json

def save_index_to_json(index, file_path):
    with open(file_path, 'w') as new_file:
        json.dump(index, new_file, sort_keys=True, indent=4)

def load_index_from_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def update_index(df, file_path):
    if len(df) > 0:
        prior_index = load_index_from_json(file_path)
        new_index = full_index_data_frame(df=df, index=prior_index)
        save_index_to_json(new_index, file_path)


# Save the word index to 'indexes.json'
save_index_to_json(indexes, 'indexes.json')

# Load the word index from 'indexes.json'
loaded_indexes = load_index_from_json('indexes.json')

# Update the word index with new data from 'processed_db' and save it to 'indexes.json'
update_index(df=processed_initial_db, file_path='indexes.json')


print(len(loaded_indexes))

loaded_indexes

def preprocess_query(query):
    #  lowercase
    query = query.lower()

    # punctuation marks
    query = query.translate(str.maketrans('', '', string.punctuation))

    # Lemmatize
    query = lemmatize(query)

    return query

def demonstrate_query_processing():
    user_input = input('Enter query: ')
    processed_query = preprocess_query(user_input)
    print(f'Processed  Query: {processed_query}')
    return processed_query

demonstrate_query_processing()


def preprocess_and_split_query(terms):
    preprocessed_query = preprocess_query(terms)
    individual_words = preprocessed_query.split()
    return individual_words

def demonstrate_query_processing():
    user_input = input('Enter Query: ')
    processed_query = preprocess_query(user_input)
    print(f'Processed Query: {processed_query}')
    return processed_query

# Get the preprocessed and split query
dqp = demonstrate_query_processing()
split_query_result = preprocess_and_split_query(dqp)

print(f'Split Query: {split_query_result}')

def get_union(lists):
    union = list(set.union(*map(set, lists)))
    union.sort()
    return union

def get_intersection(lists):
    intersect = list(set.intersection(*map(set, lists)))
    intersect.sort()
    return intersect

def vertical_search_engine(data_frame, query, word_index):
    query_terms = preprocess_and_split_query(query)  # Split
    retrieved_sns = []

    # Retrieve SNs from the word index
    for term in query_terms:
        if term in word_index:
            retrieved_sns.append(word_index[term])

    # Perform Ranked Retrieval if matched
    if len(retrieved_sns) > 0:
        high_rank_result = get_intersection(retrieved_sns)  # High-rank result is intersection SNs
        low_rank_result = get_union(retrieved_sns)  # Low-rank result is union of retrieved SNs
        uncommon_sns = [x for x in low_rank_result if x not in high_rank_result]
        high_rank_result.extend(uncommon_sns)
        result_sns = high_rank_result

        # Extract final output containing the search result
        final_output = data_frame[data_frame.SN.isin(result_sns)].reset_index(drop=True)

        # Merge result DataFrame with the SNs to maintain the order of Intersection ----> Union
        dummy = pd.Series(result_sns, name='SN').to_frame()
        result_df = pd.merge(dummy, final_output, on='SN', how='left')
    else:
        result_df = 'No results'

    return result_df

def test_search_engine(data_frame, word_index):
    query = input("Enter query: ")
    result = vertical_search_engine(data_frame, query, word_index)
    return result
def final_engine(results):
    if isinstance(results, pd.DataFrame):
        for i in range(len(results)):
            printout = results.loc[i, :]
            print(f"Title: {printout['Title']}")
            print(f"Author: {printout['Authors']}")
            print(f"Published: {printout['Publication_Year']}")
            print(f"Link: {printout['Publication_URL']}")
            print(f"Author Link: {printout['Author_URLs']}")
            print('')
    else:
        print(results)
def test_search_engine(df, index):
    query = input("Enter your query: ")
    return vertical_search_engine(df, query, index)

# Test
results = test_search_engine(scraped_initial_db, indexed)
final_engine(results)
# initial
days = 0
interval = 7

while True:
    fetch_publication_details("https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/", interval)
    print(f"Crawled at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f'Next crawl scheduled after {interval} days')
    time.sleep(interval * 24 * 60 * 60)


1272
Enter query: cancer
Processed  Query: cancer 
Enter Query: cancer
Processed Query: cancer 
Split Query: ['cancer']
Enter your query: cancer
No results
Crawled at 2025-01-15 13:33:02
Next crawl scheduled after 7 days


KeyboardInterrupt: 