In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from time import sleep

urls = []

def gather_links(url, crawl_delay=1):
    '''
    Function to gather all the links from a given base URL (https://iiitp.ac.in/) in a recursive manner.
    The function will keep on crawling the links until all the links are exhausted.
    All urls will be appended to a list and written to a file named urls.txt
    
    Args:
    url: str: base URL to crawl

    Returns:
    urls: list: list of all the URLs crawled
    '''
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to get URL. Status code: {response.status_code}")
        return None
    
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all('a')

    for link in links:
        href = link.get('href')
        if href and not href.startswith('http'):
            full_url = urljoin(url, href)
        else:
            full_url = href

        if (not full_url) or (not full_url.startswith(url)) or \
            full_url == url or full_url.endswith(('pdf', 'jpg', 'png')) or \
                full_url in urls:
            continue
        urls.append(full_url)
        print(f"Adding {full_url}")
        gather_links(full_url)

    sleep(crawl_delay)
    return urls

In [2]:
# To gather all the links from the base URL

base_url = "https://iiitp.ac.in/"
crawl_delay = 0.1
gather_links(base_url, crawl_delay)

with open('urls.txt', 'w') as f:
    for url in urls:
        f.write(url + '\n')

print(f"Total number of URLs: {len(urls)}")

Adding https://iiitp.ac.in/#main-content
Adding https://iiitp.ac.in/screen-reader-access
Adding https://iiitp.ac.in/screen-reader-access#main-content
Adding https://iiitp.ac.in/sitemap
Adding https://iiitp.ac.in/sitemap#main-content
Adding https://iiitp.ac.in/hi
Adding https://iiitp.ac.in/hi#main-content
Adding https://iiitp.ac.in/hi/mentor-institute-coep
Adding https://iiitp.ac.in/hi/mentor-institute-coep#main-content
Adding https://iiitp.ac.in/hi/page/darsatai-aura-lakasaya
Adding https://iiitp.ac.in/hi/page/darsatai-aura-lakasaya#main-content
Adding https://iiitp.ac.in/hi/page/directordesk
Adding https://iiitp.ac.in/hi/page/directordesk#main-content
Adding https://iiitp.ac.in/hi/page/sansathaana-kae-baarae-maen
Adding https://iiitp.ac.in/hi/page/sansathaana-kae-baarae-maen#main-content
Adding https://iiitp.ac.in/hi/page/agaamai-paraisara
Adding https://iiitp.ac.in/hi/page/agaamai-paraisara#main-content
Adding https://iiitp.ac.in/hi/node/1140
Adding https://iiitp.ac.in/hi/node/1140#m

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Adding https://iiitp.ac.in/node/255
Adding https://iiitp.ac.in/node/255#main-content
Adding https://iiitp.ac.in/testimonials
Adding https://iiitp.ac.in/testimonials#main-content
Adding https://iiitp.ac.in/page/iiit-pune-sgrc
Adding https://iiitp.ac.in/page/iiit-pune-sgrc#main-content
Adding https://iiitp.ac.in/page/iiit-pune-icc
Adding https://iiitp.ac.in/page/iiit-pune-icc#main-content
Adding https://iiitp.ac.in/page/iiit-pune-eoc
Adding https://iiitp.ac.in/page/iiit-pune-eoc#main-content
Adding https://iiitp.ac.in/page/right-information
Adding https://iiitp.ac.in/page/right-information#main-content
Adding https://iiitp.ac.in/page/national-institutional-ranking-framework-nirf
Adding https://iiitp.ac.in/page/national-institutional-ranking-framework-nirf#main-content
Total number of URLs: 327


In [13]:
import requests
from bs4 import BeautifulSoup

def scrape_text(url):
    '''
    Function to scrape text from a given URL.
    Attempts to retrieve main content by checking several common containers.
    Removes unwanted text like footers, headers, sidebars, links, images, etc.

    Args:
    url (str): URL to scrape.

    Returns:
    text (str): Scraped text from the URL.
    '''
    url = url.strip()
    print(f"Scraping {url}")
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to get URL. Status code: {response.status_code}")
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Try to find main content by several common containers
    main_content = soup.find('div', class_='innerpage-container')
    if not main_content:
        main_content = soup.find('main') or soup.find('div', id='content') or soup.find('body')

    if not main_content:
        print(f"No main content found for {url}")
        return None

    # Remove unnecessary elements
    for tag in ["script", "style", "footer", "header", "aside", "a", "img"]:
        for element in main_content.find_all(tag):
            element.decompose()

    # Extract text and format it
    text = main_content.get_text(separator='\n')
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return f"These are the details related to {url.split('/')[-1]}:\n```{text}```\n\n"


In [14]:
# Scrape text from the URLs

from time import sleep

with open('urls.txt', 'r') as f:
    urls = f.readlines()

with open('raw.txt', 'w', encoding='utf-8') as f:
    for url in urls:
        text = scrape_text(url)
        print(text)
        if text:
            f.write(text)
        sleep(0.1)

print("Scraping completed!")

Scraping https://iiitp.ac.in/#main-content
These are the details related to #main-content:
```0
1
2
3
4
5
6
7
8
9
10
Latest Updates
Latest News
To address the challenges faced by the Indian IT industry and growth of the domestic IT market, the
Ministry of Education (MoE),
Government of India intends to establish twenty
Indian Institutes of Information Technology(IIIT)
, on a Not-for-profit Public Private Partnership (N-PPP) basis. The partners in setting up the IllTs would be the Ministry of Education (MoE), Governments of the respective States where each lllT will be established, and the industry.
Admission 2024-25
Achievements
Information Desk
IIIT Pune At a Glance
Events
Testimonials
Indian Institute Of Information Technology, Pune is the hub of skills, knowledge,success and best placements.It is one of India’s prestigious and fastest growing technical Institutes. I joined IIIT PU ...
Nikita Bhatia
M.Tech ,
Electronics and Communication Engineering ,
Indian Institute of Information 

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No main content found for https://iiitp.ac.in/sites/default/files/2024-01/ACADEMIC%20PROGRESS%20PHD%20Format.doc
None
Scraping https://iiitp.ac.in/node/255
These are the details related to 255:
```Quick Links
Locate us
Indian Institute of Information Technology (IIIT) Pune
Unique Visitors:
452010
NUMBER OF HITS:
3576527
Copyright © 2021 - All Rights Reserved - IIIT Pune```


Scraping https://iiitp.ac.in/node/255#main-content
These are the details related to 255#main-content:
```Quick Links
Locate us
Indian Institute of Information Technology (IIIT) Pune
Unique Visitors:
452010
NUMBER OF HITS:
3576527
Copyright © 2021 - All Rights Reserved - IIIT Pune```


Scraping https://iiitp.ac.in/testimonials
These are the details related to testimonials:
```Nikita Bhatia
M.Tech,
Electronics and Communication Engineering,
Indian Institute of Information Technology, Pune
Indian Institute Of Information Technology, Pune is the hub of skills, knowledge,success and best placements.It is one of India’s 

* Once the raw text is extracted from the website, it is saved as 'gat_raw.txt'.
* Later, the text is manually cleaned and formatted as per the requirements for 'Retrieval Augmented Generation' (RAG) task.
* The cleaned text is saved as 'gat_refined.txt'.