In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Path of the output CSV file. retrieveInfoFromCollege opens this file in
# append mode and writes one row per college without a header row, so rows
# accumulate across runs unless the file is removed first.
csv_file = "colleges.csv"

def retrieveInfoFromCollege(url):
    """Scrape one college detail page and append its contact row to ``csv_file``.

    The page is expected to contain a ``<table class="table-bordered">`` whose
    rows pair ``<th>`` labels with ``<td>`` values. Rows whose ``th``/``td``
    counts differ are ignored. The fields "College Name", "College Address",
    "Principal Phone No" and "College STD & Land Phone" are written as one CSV
    row; any label missing from the page is written as ``'Not found'``.

    Args:
        url: Absolute URL of the college detail page.

    Returns:
        None. On a network error, a non-200 response, or a missing table a
        message is printed and nothing is written.
    """
    # NOTE(security): verify=False disables TLS certificate validation. It is
    # kept because the original code relies on it; remove it if the server
    # presents a valid certificate chain.
    try:
        # A timeout prevents one stalled connection from hanging the crawl.
        response = requests.get(url, verify=False, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from the webpage: {exc}")
        return

    if response.status_code != 200:
        print("Failed to retrieve data from the webpage.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table-bordered')
    if table is None:
        print("Table not found on the webpage.")
        return

    # Collect label -> value pairs from every row with matching th/td counts.
    college_info = {}
    for row in table.find_all('tr'):
        ths = row.find_all('th')
        tds = row.find_all('td')
        if len(ths) == len(tds):
            for th, td in zip(ths, tds):
                college_info[th.text.strip()] = td.text.strip()

    fieldnames = [
        "College Name",
        "College Address",
        "Principal Phone No",
        "College STD & Land Phone",
    ]
    record = {name: college_info.get(name, 'Not found') for name in fieldnames}

    # Flatten a multi-line address onto one line. Collapsing every whitespace
    # run (newlines, carriage returns, tabs) to a single space keeps adjacent
    # address lines from being glued together.
    record["College Address"] = ' '.join(record["College Address"].split())

    # An explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose
    # locale default cannot represent non-ASCII college names or addresses.
    with open(csv_file, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(record)


In [None]:
def retrieveCollegeFromMandal(url):
    """Scrape a mandal page and process every college linked from it.

    The page is expected to contain a ``<table class="table-bordered">`` with
    a ``<tbody>``. For each body row, the cell at index 5 (the sixth ``<td>``)
    is searched for an ``<a>`` whose ``onclick`` attribute contains a
    single-quoted relative URL. Each such URL is resolved against
    ``https://bieap.apcfss.in/`` and passed to ``retrieveInfoFromCollege``.

    Rows with fewer than six cells, without a link, or without an ``onclick``
    attribute are skipped instead of raising.

    Args:
        url: Absolute URL of the mandal-level college listing.

    Returns:
        None. On a network error, a non-200 response, or a missing table a
        message is printed.
    """
    # NOTE(security): verify=False disables TLS certificate validation. It is
    # kept because the original code relies on it.
    try:
        # A timeout prevents one stalled connection from hanging the crawl.
        response = requests.get(url, verify=False, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from the webpage: {exc}")
        return

    if response.status_code != 200:
        print("Failed to retrieve data from the webpage.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table-bordered')
    if table is None:
        print("Table not found on the webpage.")
        return

    tbody = table.find('tbody')
    if tbody is None:
        # A table without <tbody> yields no links, so there is nothing to do.
        return

    college_links = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 5:
            # Header, summary or malformed row: no college-name cell.
            continue

        link = cells[5].find('a')  # index 5 holds the college name link
        if link is None:
            continue

        onclick_attr = link.get('onclick')
        if not onclick_attr:
            continue

        # html.parser decodes "&para" inside "&param" into the pilcrow sign;
        # undo that so query strings survive, as the sibling scrapers do.
        onclick_attr = onclick_attr.replace('¶m', '&param')

        # The target URL is the first single-quoted string in the handler.
        url_match = re.search(r"'(.*?)'", onclick_attr)
        if url_match:
            college_url = url_match.group(1)
            college_links.append(f"https://bieap.apcfss.in/{college_url}")

    # Visit every college page collected from this mandal.
    for link in college_links:
        retrieveInfoFromCollege(link)


In [None]:
def retrieveMandalFromDistrict(url):
    """Scrape a district page and process every mandal linked from it.

    The page is expected to contain a ``<table class="table-bordered">`` with
    a ``<tbody>``. For each body row, the cell at index 2 (the third ``<td>``,
    "Total Colleges") is searched for an ``<a>`` whose ``onclick`` attribute
    contains a single-quoted relative URL. Each such URL is resolved against
    ``https://bieap.apcfss.in/`` and passed to ``retrieveCollegeFromMandal``.

    Rows with fewer than three cells, without a link, or without an
    ``onclick`` attribute are skipped instead of raising.

    Args:
        url: Absolute URL of the district-level mandal listing.

    Returns:
        None. On a network error, a non-200 response, or a missing table a
        message is printed.
    """
    # NOTE(security): verify=False disables TLS certificate validation. It is
    # kept because the original code relies on it.
    try:
        # A timeout prevents one stalled connection from hanging the crawl.
        response = requests.get(url, verify=False, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from the webpage: {exc}")
        return

    if response.status_code != 200:
        print("Failed to retrieve data from the webpage.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table-bordered')
    if table is None:
        print("Table not found on the webpage.")
        return

    tbody = table.find('tbody')
    if tbody is None:
        # A table without <tbody> yields no links, so there is nothing to do.
        return

    mandal_links = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 2:
            # Header, summary or malformed row: no "Total Colleges" cell.
            continue

        link = cells[2].find('a')  # third cell holds the total-colleges link
        if link is None:
            continue

        onclick_attr = link.get('onclick')
        if not onclick_attr:
            continue

        # html.parser decodes "&para" inside "&param" into the pilcrow sign;
        # restore the original query-string separator.
        onclick_attr = onclick_attr.replace('¶m', '&param')

        # The target URL is the first single-quoted string in the handler.
        url_match = re.search(r"'(.*?)'", onclick_attr)
        if url_match:
            onclick_link = url_match.group(1)
            mandal_links.append(f"https://bieap.apcfss.in/{onclick_link}")

    # Visit every mandal page collected from this district.
    for link in mandal_links:
        retrieveCollegeFromMandal(link)

In [None]:
# Entry point of the crawl: fetch the state-level report, collect the link of
# every district from the second column of the table, then descend into each
# district via retrieveMandalFromDistrict.
url = "https://bieap.apcfss.in/CollegesReport.do"

# NOTE(security): verify=False disables TLS certificate validation. It is kept
# because the original code relies on it.
try:
    # A timeout prevents a stalled connection from hanging the notebook cell.
    response = requests.get(url, verify=False, timeout=30)
except requests.RequestException as exc:
    response = None
    print(f"Failed to retrieve data from the webpage: {exc}")

if response is not None and response.status_code == 200:
    # Parsing the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the table containing the required information
    table = soup.find('table', class_='table-bordered')

    if table is not None:
        tbody = table.find('tbody')
        if tbody is not None:
            district_links = []
            for row in tbody.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) <= 1:
                    # Header, summary or malformed row: no district-name cell.
                    continue

                link = cells[1].find('a')  # second cell holds the district name
                if link is None:
                    continue

                onclick_attr = link.get('onclick')
                if not onclick_attr:
                    continue

                # html.parser decodes "&para" inside "&param" into the pilcrow
                # sign; restore the original query-string separator.
                onclick_attr = onclick_attr.replace('¶m', '&param')

                # The target URL is the first single-quoted string in the handler.
                url_match = re.search(r"'(.*?)'", onclick_attr)
                if url_match:
                    onclick_link = url_match.group(1)
                    full_link = f"https://bieap.apcfss.in/{onclick_link}"
                    district_links.append(full_link)

            # Calling retrieveMandalFromDistrict on the list of formatted URLs
            for link in district_links:
                retrieveMandalFromDistrict(link)
    else:
        print("Table not found on the webpage.")
elif response is not None:
    print("Failed to retrieve data from the webpage.")