In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import csv
import os
import re

# Set up ChromeDriver
# The value returned by ChromeDriverManager().install() is handed to Service
# (presumably the path of a downloaded chromedriver binary — confirm against
# the webdriver_manager documentation).
service = Service(ChromeDriverManager().install())
# Module-level browser session used by the scrape_* functions defined below;
# it is closed by the driver.quit() call later in this cell.
driver = webdriver.Chrome(service=service)

# Function to clean text by removing extra spaces, newlines, and redundant phrases
def clean_text(text):
    """Normalise scraped text and strip boilerplate phrases.

    Whitespace runs (spaces, tabs, newlines) are collapsed to single spaces,
    every occurrence of the phrases in ``redundant_phrases`` is removed, and
    whitespace is collapsed again so that a phrase removed from the middle of
    a sentence does not leave a double space behind.

    Args:
        text: Raw text extracted from the page; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``"N/A"`` when the input is missing or nothing
        is left once whitespace and redundant phrases have been removed.
    """
    if not text:
        return "N/A"
    # Remove extra spaces and newlines
    text = " ".join(text.split())
    # Remove common redundant phrases
    redundant_phrases = [
        "Below mentioned we have listed some top scholarships for B.Tech Students 2025.",
        "The key objective of this scheme is to provide financial assistance to",
        "This scholarship is for",
        "Eligibility:",
        "The scholarship aims to",
        "This scheme is",
        "The candidate must",
        "Students must",
    ]
    for phrase in redundant_phrases:
        text = text.replace(phrase, "")
    # Deleting a phrase can leave two adjacent spaces (or only whitespace),
    # so normalise once more before deciding whether anything remains.
    text = " ".join(text.split())
    # Keep the "N/A" sentinel consistent: an empty result means "no data".
    return text or "N/A"

# Function to scrape scholarships from collegedunia.com
def scrape_collegedunia(url):
    """Collect scholarship rows from a collegedunia.com page.

    Loads ``url`` in the module-level ``driver``, waits up to 10 seconds for
    an element with class ``table-striped`` to be present, then parses the
    rendered page source with BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        A dict keyed by scholarship name. Each value is a dict with the keys
        'Scholarship Name', 'Provider', 'Eligibility' and 'Amount'. A name
        that appears more than once keeps the last row seen.
    """
    driver.get(url)
    table_present = EC.presence_of_element_located((By.CLASS_NAME, "table-striped"))
    WebDriverWait(driver, 10).until(table_present)
    page = BeautifulSoup(driver.page_source, 'html.parser')

    found = {}
    for tbl in page.find_all('table', class_='table table-striped style_table'):
        # The first <tr> is treated as the header row and dropped.
        body_rows = tbl.find_all('tr')[1:]
        for tr in body_rows:
            cells = tr.find_all('td')
            # Rows with fewer than five cells cannot supply cells[4]; skip them.
            if len(cells) < 5:
                continue

            name = cells[0].text.strip()
            # Keyed by name so a repeated scholarship is stored only once.
            found[name] = {
                'Scholarship Name': name,
                'Provider': cells[1].text.strip(),
                'Eligibility': clean_text(cells[2].text.strip()),
                'Amount': clean_text(cells[4].text.strip()),
            }

    return found

# Function to scrape scholarships from vidhyaa.in
def scrape_vidhyaa():
    """Collect scholarship entries from the vidhyaa.in B.Tech blog page.

    Loads the page in the module-level ``driver``, pauses five seconds, then
    walks every ``<h2>``. A heading contributes an entry only when it contains
    an ``<a>`` whose stripped text is non-empty and not "N/A". The text of the
    next ``<p>`` after the heading is used as the eligibility description.

    Returns:
        A dict keyed by scholarship name. Each value is a dict with the keys
        'Scholarship Name', 'Provider', 'Eligibility' and 'Amount'; 'Provider'
        and 'Amount' are always 'N/A' for this source.
    """
    url = "https://www.vidhyaa.in/blog/scholarship-for-btech-students"
    driver.get(url)
    time.sleep(5)  # Fixed pause so JavaScript-rendered content can load
    page = BeautifulSoup(driver.page_source, 'html.parser')

    results = {}
    for heading in page.find_all('h2'):
        link = heading.find('a')
        if not link:
            continue

        title = link.text.strip()
        if title in ("", "N/A"):
            continue

        # The first <p> following the heading is taken as the description.
        paragraph = heading.find_next('p')
        if paragraph:
            raw_eligibility = paragraph.text.strip()
        else:
            raw_eligibility = "N/A"

        # Keyed by name so a repeated scholarship is stored only once.
        results[title] = {
            'Scholarship Name': title,
            'Provider': 'N/A',  # vidhyaa.in does not provide this info
            'Eligibility': clean_text(raw_eligibility),
            'Amount': 'N/A'  # vidhyaa.in does not provide this info
        }

    return results

# Scrape data from collegedunia.com first
collegedunia_urls = [
    "https://collegedunia.com/courses/bachelor-of-technology-btech/scholarship-for-btech-students-in-india",
    "https://collegedunia.com/courses/bachelor-of-technology-btech/scholarships-for-btech-students-in-kerala"
]

combined_data = {}
# All browser-dependent work lives inside try/finally so the Chrome session is
# closed even when a page load or wait raises; otherwise a failed scrape would
# leave the browser process running.
try:
    print("Scraping collegedunia.com...")
    for url in collegedunia_urls:
        print(f"Scraping {url}...")
        scholarships = scrape_collegedunia(url)
        # A later page overwrites an earlier entry with the same name.
        combined_data.update(scholarships)  # Add scholarships to combined data

    # Scrape data from vidhyaa.in
    print("Scraping vidhyaa.in...")
    vidhyaa_scholarships = scrape_vidhyaa()
finally:
    # Close the browser
    driver.quit()

# Add vidhyaa.in scholarships to combined data, avoiding duplicates:
# setdefault keeps the collegedunia entry when the name is already present.
for name, data in vidhyaa_scholarships.items():
    combined_data.setdefault(name, data)

# Save combined data to CSV
csv_filename = "combined_scholarships_no_duplicates.csv"
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=['Scholarship Name', 'Provider', 'Eligibility', 'Amount'])
    writer.writeheader()
    for scholarship in combined_data.values():
        writer.writerow(scholarship)

# Print CSV file path
csv_path = os.path.abspath(csv_filename)
print(f"✅ Combined data (no duplicates) successfully saved to: {csv_path}")

Scraping collegedunia.com...
Scraping https://collegedunia.com/courses/bachelor-of-technology-btech/scholarship-for-btech-students-in-india...
Scraping https://collegedunia.com/courses/bachelor-of-technology-btech/scholarships-for-btech-students-in-kerala...
Scraping vidhyaa.in...
✅ Combined data (no duplicates) successfully saved to: C:\Users\sreen\Desktop\scholaship\combined_scholarships_no_duplicates.csv


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import csv
import os
import re

# Set up ChromeDriver
# NOTE(review): this cell repeats the driver setup of the earlier cell, so
# running it starts another Chrome session.
# The value returned by ChromeDriverManager().install() is handed to Service
# (presumably the path of a downloaded chromedriver binary — confirm against
# the webdriver_manager documentation).
service = Service(ChromeDriverManager().install())
# Module-level browser session used by the scrape_* functions defined below;
# it is closed by the driver.quit() call later in this cell.
driver = webdriver.Chrome(service=service)

# Function to clean text by removing extra spaces, newlines, and redundant phrases
def clean_text(text):
    """Normalise scraped text and strip boilerplate phrases.

    Whitespace runs (spaces, tabs, newlines) are collapsed to single spaces,
    every occurrence of the phrases in ``redundant_phrases`` is removed, and
    whitespace is collapsed again so that a phrase removed from the middle of
    a sentence does not leave a double space behind.

    Args:
        text: Raw text extracted from the page; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``"N/A"`` when the input is missing or nothing
        is left once whitespace and redundant phrases have been removed.
    """
    if not text:
        return "N/A"
    # Remove extra spaces and newlines
    text = " ".join(text.split())
    # Remove common redundant phrases
    redundant_phrases = [
        "Below mentioned we have listed some top scholarships for B.Tech Students 2025.",
        "The key objective of this scheme is to provide financial assistance to",
        "This scholarship is for",
        "Eligibility:",
        "The scholarship aims to",
        "This scheme is",
        "The candidate must",
        "Students must",
    ]
    for phrase in redundant_phrases:
        text = text.replace(phrase, "")
    # Deleting a phrase can leave two adjacent spaces (or only whitespace),
    # so normalise once more before deciding whether anything remains.
    text = " ".join(text.split())
    # Keep the "N/A" sentinel consistent: an empty result means "no data".
    return text or "N/A"

# Function to scrape scholarships from collegedunia.com
def scrape_collegedunia(url):
    """Collect scholarship rows from a collegedunia.com page.

    Loads ``url`` in the module-level ``driver``, waits up to 10 seconds for
    an element with class ``table-striped`` to be present, then parses the
    rendered page source with BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        A dict keyed by scholarship name. Each value is a dict with the keys
        'Scholarship Name', 'Provider', 'Eligibility' and 'Amount'. A name
        that appears more than once keeps the last row seen.
    """
    driver.get(url)
    table_present = EC.presence_of_element_located((By.CLASS_NAME, "table-striped"))
    WebDriverWait(driver, 10).until(table_present)
    page = BeautifulSoup(driver.page_source, 'html.parser')

    found = {}
    for tbl in page.find_all('table', class_='table table-striped style_table'):
        # The first <tr> is treated as the header row and dropped.
        body_rows = tbl.find_all('tr')[1:]
        for tr in body_rows:
            cells = tr.find_all('td')
            # Rows with fewer than five cells cannot supply cells[4]; skip them.
            if len(cells) < 5:
                continue

            name = cells[0].text.strip()
            # Keyed by name so a repeated scholarship is stored only once.
            found[name] = {
                'Scholarship Name': name,
                'Provider': cells[1].text.strip(),
                'Eligibility': clean_text(cells[2].text.strip()),
                'Amount': clean_text(cells[4].text.strip()),
            }

    return found

# Function to scrape scholarships from vidhyaa.in
def scrape_vidhyaa():
    """Collect scholarship entries from the vidhyaa.in B.Tech blog page.

    Loads the page in the module-level ``driver``, pauses five seconds, then
    walks every ``<h2>``. A heading contributes an entry only when it contains
    an ``<a>`` whose stripped text is non-empty and not "N/A". The text of the
    next ``<p>`` after the heading is used as the eligibility description.

    Returns:
        A dict keyed by scholarship name. Each value is a dict with the keys
        'Scholarship Name', 'Provider', 'Eligibility' and 'Amount'; 'Provider'
        and 'Amount' are always 'N/A' for this source.
    """
    url = "https://www.vidhyaa.in/blog/scholarship-for-btech-students"
    driver.get(url)
    time.sleep(5)  # Fixed pause so JavaScript-rendered content can load
    page = BeautifulSoup(driver.page_source, 'html.parser')

    results = {}
    for heading in page.find_all('h2'):
        link = heading.find('a')
        if not link:
            continue

        title = link.text.strip()
        if title in ("", "N/A"):
            continue

        # The first <p> following the heading is taken as the description.
        paragraph = heading.find_next('p')
        if paragraph:
            raw_eligibility = paragraph.text.strip()
        else:
            raw_eligibility = "N/A"

        # Keyed by name so a repeated scholarship is stored only once.
        results[title] = {
            'Scholarship Name': title,
            'Provider': 'N/A',  # vidhyaa.in does not provide this info
            'Eligibility': clean_text(raw_eligibility),
            'Amount': 'N/A'  # vidhyaa.in does not provide this info
        }

    return results

# Scrape data from collegedunia.com first
collegedunia_urls = [
    "https://collegedunia.com/courses/bachelor-of-technology-btech/scholarship-for-btech-students-in-india",
    "https://collegedunia.com/courses/bachelor-of-technology-btech/scholarships-for-btech-students-in-kerala"
]

combined_data = {}
# All browser-dependent work lives inside try/finally so the Chrome session is
# closed even when a page load or wait raises; otherwise a failed scrape would
# leave the browser process running.
try:
    print("Scraping collegedunia.com...")
    for url in collegedunia_urls:
        print(f"Scraping {url}...")
        scholarships = scrape_collegedunia(url)
        # A later page overwrites an earlier entry with the same name.
        combined_data.update(scholarships)  # Add scholarships to combined data

    # Scrape data from vidhyaa.in
    print("Scraping vidhyaa.in...")
    vidhyaa_scholarships = scrape_vidhyaa()
finally:
    # Close the browser
    driver.quit()

# Add vidhyaa.in scholarships to combined data, avoiding duplicates:
# setdefault keeps the collegedunia entry when the name is already present.
for name, data in vidhyaa_scholarships.items():
    combined_data.setdefault(name, data)

# Save combined data to CSV
csv_filename = "combined60.csv"
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=['Scholarship Name', 'Provider', 'Eligibility', 'Amount'])
    writer.writeheader()
    for scholarship in combined_data.values():
        writer.writerow(scholarship)

# Print CSV file path
csv_path = os.path.abspath(csv_filename)
print(f"✅ Combined data (no duplicates) successfully saved to: {csv_path}")

Scraping collegedunia.com...
Scraping https://collegedunia.com/courses/bachelor-of-technology-btech/scholarship-for-btech-students-in-india...
Scraping https://collegedunia.com/courses/bachelor-of-technology-btech/scholarships-for-btech-students-in-kerala...
Scraping vidhyaa.in...
✅ Combined data (no duplicates) successfully saved to: C:\Users\sreen\Desktop\scholaship\combined60.csv
