In [None]:
!pip install requests beautifulsoup4 selenium pandas lxml

In [1]:
#!/usr/bin/env python3
"""
University Website Scraper
A comprehensive web scraping solution for extracting data from university websites.
Includes multiple scraping techniques, error handling, and data storage.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import re
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import logging
from dataclasses import dataclass
from typing import List, Dict, Optional
import os
from datetime import datetime

# Logging setup: every record is written to scraper.log and echoed to the console.
logging.basicConfig(
    handlers=[logging.FileHandler('scraper.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

@dataclass
class UniversityData:
    """Container for everything scraped from a single university website.

    Instances are built by ``UniversityScraper.scrape_university`` and written
    to disk by ``UniversityScraper.save_data``.
    """
    name: str                # University name taken from the page, or a fallback label
    url: str                 # Base URL the scrape started from
    description: str         # Description/mission text; empty string when none was found
    departments: List[str]   # Department names found on the main page
    courses: List[Dict]      # Dicts with 'name', 'url' and 'description' keys
    faculty: List[Dict]      # Dicts with 'name', 'profile_url', 'department' and 'title' keys
    news: List[Dict]         # Dicts with 'title', 'url', 'date' and 'summary' keys
    contact_info: Dict       # Optional 'phones', 'emails' and 'address' entries
    scraped_at: str          # ISO-8601 timestamp of when the scrape finished

class UniversityScraper:
    """
    A comprehensive university website scraper that handles:
    - Static content with requests + BeautifulSoup
    - Dynamic content with Selenium
    - Rate limiting and respectful scraping
    - Error handling and retry logic
    - Data storage in multiple formats
    """
    
    def __init__(self, base_url: str, delay: float = 1.0):
        """Prepare an HTTP session and, when possible, a headless Chrome driver.

        Args:
            base_url: Root URL of the university site to scrape.
            delay: Seconds to pause between requests for additional pages.
        """
        self.base_url = base_url
        self.delay = delay

        # Plain HTTP session with a browser-like User-Agent
        self.session = requests.Session()
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session.headers.update(browser_headers)

        # Headless Chrome configuration for pages that need JavaScript
        chrome_options = Options()
        for flag in (
            '--headless',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
        ):
            chrome_options.add_argument(flag)

        # Selenium is optional: if the driver cannot start, fall back to requests only
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception as e:
            logger.warning(f"Selenium setup failed: {e}. Using requests only.")
            self.use_selenium = False
        else:
            self.use_selenium = True
    
    def get_page_content(self, url: str, use_selenium: bool = False) -> Optional[BeautifulSoup]:
        """
        Download ``url`` and return it parsed with BeautifulSoup.

        Selenium is used only when the caller asks for it *and* the driver was
        started successfully; otherwise the requests session is used.
        Any failure is logged and reported as ``None`` instead of raising.
        """
        try:
            if not (use_selenium and self.use_selenium):
                # Static path: plain HTTP GET, 10 second timeout
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return BeautifulSoup(response.content, 'html.parser')

            # Dynamic path: let the browser render, wait up to 10s for <body>
            self.driver.get(url)
            body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
            WebDriverWait(self.driver, 10).until(body_present)
            return BeautifulSoup(self.driver.page_source, 'html.parser')

        except Exception as e:
            logger.error(f"Error fetching {url}: {e}")
            return None
    
    def extract_university_info(self, soup: BeautifulSoup) -> Dict:
        """Extract basic university information"""
        info = {}
        
        # Extract university name
        name_selectors = ['h1', '.university-name', '#university-name', 'title']
        for selector in name_selectors:
            element = soup.select_one(selector)
            if element:
                info['name'] = element.get_text(strip=True)
                break
        
        # Extract description/mission
        desc_selectors = ['.description', '.mission', '.about', '#about']
        for selector in desc_selectors:
            element = soup.select_one(selector)
            if element:
                info['description'] = element.get_text(strip=True)[:1000]  # Limit length
                break
        
        # Extract contact information
        contact_info = {}
        
        # Phone numbers
        phone_pattern = r'[\+]?[1-9]?[0-9]{3}[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}'
        phones = re.findall(phone_pattern, str(soup))
        if phones:
            contact_info['phones'] = list(set(phones))
        
        # Email addresses
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        emails = re.findall(email_pattern, str(soup))
        if emails:
            contact_info['emails'] = list(set(emails))
        
        # Address
        address_keywords = ['address', 'location', 'campus']
        for keyword in address_keywords:
            addr_elem = soup.find(text=re.compile(keyword, re.I))
            if addr_elem:
                parent = addr_elem.parent
                if parent:
                    contact_info['address'] = parent.get_text(strip=True)
                    break
        
        info['contact_info'] = contact_info
        return info
    
    def scrape_departments(self, soup: BeautifulSoup) -> List[str]:
        """Extract department/faculty information"""
        departments = []
        
        # Common selectors for departments
        dept_selectors = [
            '.department', '.faculty', '.school',
            'a[href*="department"]', 'a[href*="faculty"]',
            'a[href*="school"]', '.nav-item a'
        ]
        
        for selector in dept_selectors:
            elements = soup.select(selector)
            for elem in elements:
                dept_name = elem.get_text(strip=True)
                if dept_name and len(dept_name) > 3:  # Filter out very short names
                    departments.append(dept_name)
        
        return list(set(departments))  # Remove duplicates
    
    def scrape_courses(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract course information"""
        courses = []
        
        # Look for course listings
        course_selectors = [
            '.course', '.program', '.degree',
            'a[href*="course"]', 'a[href*="program"]'
        ]
        
        for selector in course_selectors:
            elements = soup.select(selector)
            for elem in elements:
                course_name = elem.get_text(strip=True)
                course_link = elem.get('href') if elem.name == 'a' else None
                
                if course_link:
                    course_link = urljoin(self.base_url, course_link)
                
                if course_name and len(course_name) > 5:
                    courses.append({
                        'name': course_name,
                        'url': course_link,
                        'description': ''  # Could be extracted from course pages
                    })
        
        return courses[:50]  # Limit to first 50 courses
    
    def scrape_faculty(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract faculty information"""
        faculty = []
        
        # Look for faculty/staff listings
        faculty_selectors = [
            '.faculty-member', '.staff-member', '.professor',
            'a[href*="faculty"]', 'a[href*="staff"]'
        ]
        
        for selector in faculty_selectors:
            elements = soup.select(selector)
            for elem in elements:
                name = elem.get_text(strip=True)
                profile_link = elem.get('href') if elem.name == 'a' else None
                
                if profile_link:
                    profile_link = urljoin(self.base_url, profile_link)
                
                if name and len(name) > 3:
                    faculty.append({
                        'name': name,
                        'profile_url': profile_link,
                        'department': '',  # Could be extracted from profile
                        'title': ''        # Could be extracted from profile
                    })
        
        return faculty[:30]  # Limit to first 30 faculty
    
    def scrape_news(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract news/announcements"""
        news = []
        
        # Look for news items
        news_selectors = [
            '.news-item', '.announcement', '.article',
            'a[href*="news"]', 'a[href*="announcement"]'
        ]
        
        for selector in news_selectors:
            elements = soup.select(selector)
            for elem in elements:
                title = elem.get_text(strip=True)
                news_link = elem.get('href') if elem.name == 'a' else None
                
                if news_link:
                    news_link = urljoin(self.base_url, news_link)
                
                # Try to extract date
                date_elem = elem.find(class_=re.compile(r'date|time'))
                date = date_elem.get_text(strip=True) if date_elem else ''
                
                if title and len(title) > 10:
                    news.append({
                        'title': title,
                        'url': news_link,
                        'date': date,
                        'summary': ''  # Could be extracted from news pages
                    })
        
        return news[:20]  # Limit to first 20 news items
    
    def scrape_additional_pages(self, urls: List[str]) -> Dict:
        """Scrape additional pages like About, Academics, etc."""
        additional_data = {}
        
        for url in urls:
            try:
                soup = self.get_page_content(url)
                if soup:
                    # Extract text content
                    text_content = soup.get_text()
                    # Clean up text
                    text_content = re.sub(r'\s+', ' ', text_content).strip()
                    
                    # Store first 2000 characters
                    page_name = urlparse(url).path.split('/')[-1] or 'main'
                    additional_data[page_name] = text_content[:2000]
                
                time.sleep(self.delay)  # Rate limiting
            except Exception as e:
                logger.error(f"Error scraping {url}: {e}")
        
        return additional_data
    
    def discover_urls(self, soup: BeautifulSoup, max_urls: int = 10) -> List[str]:
        """Discover important URLs to scrape"""
        urls = []
        
        # Keywords for important pages
        important_keywords = [
            'about', 'academics', 'admissions', 'faculty',
            'courses', 'programs', 'departments', 'news'
        ]
        
        # Find all links
        links = soup.find_all('a', href=True)
        
        for link in links:
            href = link.get('href')
            text = link.get_text(strip=True).lower()
            
            # Check if URL contains important keywords
            for keyword in important_keywords:
                if keyword in href.lower() or keyword in text:
                    full_url = urljoin(self.base_url, href)
                    if full_url not in urls and len(urls) < max_urls:
                        urls.append(full_url)
        
        return urls
    
    def scrape_university(self) -> UniversityData:
        """Run the full scrape of ``self.base_url`` and return the collected data.

        Raises:
            Exception: if the main page cannot be fetched.
        """
        logger.info(f"Starting to scrape {self.base_url}")

        # Everything below is derived from the landing page
        home_page = self.get_page_content(self.base_url)
        if not home_page:
            raise Exception(f"Could not fetch main page: {self.base_url}")

        overview = self.extract_university_info(home_page)

        departments = self.scrape_departments(home_page)
        courses = self.scrape_courses(home_page)
        faculty = self.scrape_faculty(home_page)
        news = self.scrape_news(home_page)

        # Follow-up pages are fetched here.
        # NOTE(review): the returned text is not stored on UniversityData.
        follow_up_urls = self.discover_urls(home_page)
        self.scrape_additional_pages(follow_up_urls)

        result = UniversityData(
            name=overview.get('name', 'Unknown University'),
            url=self.base_url,
            description=overview.get('description', ''),
            departments=departments,
            courses=courses,
            faculty=faculty,
            news=news,
            contact_info=overview.get('contact_info', {}),
            scraped_at=datetime.now().isoformat(),
        )

        logger.info(f"Scraping completed. Found {len(departments)} departments, {len(courses)} courses, {len(faculty)} faculty, {len(news)} news items")

        return result
    
    def save_data(self, data: UniversityData, output_dir: str = "scraped_data"):
        """Save scraped data in multiple formats.

        Writes ``<name>.json`` with the full record and, for each non-empty
        list among courses, faculty and news, ``<name>_<kind>.csv``.

        Args:
            data: The scraped record to persist.
            output_dir: Target directory; created if it does not exist.
        """
        os.makedirs(output_dir, exist_ok=True)

        # The name comes from scraped page text, so it may contain characters
        # that are invalid in filenames or that act as path separators
        # ("/", ":", "|", ...). Keep word characters, dots and dashes only.
        file_stem = re.sub(r'[^\w.-]+', '_', data.name).strip('._') or 'university'

        # Save as JSON
        json_path = os.path.join(output_dir, f"{file_stem}.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data.__dict__, f, indent=2, ensure_ascii=False)

        # Save each non-empty record list as its own CSV
        tables = (
            ('courses', data.courses),
            ('faculty', data.faculty),
            ('news', data.news),
        )
        for kind, records in tables:
            if records:
                csv_path = os.path.join(output_dir, f"{file_stem}_{kind}.csv")
                pd.DataFrame(records).to_csv(csv_path, index=False)

        logger.info(f"Data saved to {output_dir}")
    
    def __del__(self):
        """Cleanup Selenium driver"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass

# Demo entry point
def main():
    """
    Scrape the demo university, save the results and print a short summary.
    """
    # Candidate sites; swap in e.g. "https://www.your-university.edu" to try another
    universities = [
        "https://www.kluniversity.in"
    ]

    demo_targets = universities[:1]  # Scrape only first one for demo
    for university_url in demo_targets:
        try:
            scraper = UniversityScraper(university_url, delay=2.0)
            result = scraper.scrape_university()
            scraper.save_data(result)

            # Human-readable summary of what was collected
            summary_lines = (
                f"\n🏫 University: {result.name}",
                f"📝 Description: {result.description[:200]}...",
                f"🏢 Departments: {len(result.departments)}",
                f"📚 Courses: {len(result.courses)}",
                f"👨‍🏫 Faculty: {len(result.faculty)}",
                f"📰 News: {len(result.news)}",
                f"📞 Contact: {result.contact_info}",
            )
            for line in summary_lines:
                print(line)

        except Exception as e:
            logger.error(f"Error scraping {university_url}: {e}")

# Advanced scraping utilities
class AdvancedUniversityScraper(UniversityScraper):
    """
    Extended scraper with additional features:
    - Per-course detail extraction (code, credits, prerequisites)
    - Form-based login through the Selenium driver
    """

    def scrape_course_details(self, course_url: str) -> Dict:
        """Scrape detailed course information.

        Args:
            course_url: URL of a single course page.

        Returns:
            A dict with any of ``code`` (str), ``credits`` (int) and
            ``prerequisites`` (str) that could be found; empty when the page
            could not be fetched.
        """
        soup = self.get_page_content(course_url, use_selenium=True)
        if not soup:
            return {}

        course_details = {}
        page_text = soup.get_text()

        # Extract course code, e.g. "CS101" or "MATH-2040"
        code_match = re.search(r'[A-Z]{2,4}[\s-]?\d{3,4}', page_text)
        if code_match:
            course_details['code'] = code_match.group()

        # Extract credits, e.g. "3 credits" or "4-credit"
        credit_match = re.search(r'(\d+)[\s-]?credit', page_text, re.I)
        if credit_match:
            course_details['credits'] = int(credit_match.group(1))

        # Extract prerequisites: text of the element containing the keyword.
        # `string=` is the current name of the deprecated `text=` argument.
        prereq_section = soup.find(string=re.compile('prerequisite', re.I))
        if prereq_section:
            course_details['prerequisites'] = prereq_section.parent.get_text(strip=True)

        return course_details

    def scrape_with_authentication(self, login_url: str, username: str, password: str):
        """Handle sites that require authentication.

        Fills the fields named ``username`` and ``password`` on ``login_url``
        and clicks the first ``type="submit"`` control. Failures are logged,
        not raised.

        Args:
            login_url: URL of the login form.
            username: Value typed into the ``username`` field.
            password: Value typed into the ``password`` field.
        """
        if not self.use_selenium:
            logger.error("Authentication scraping requires Selenium")
            return

        try:
            self.driver.get(login_url)

            # Find and fill login form
            username_field = self.driver.find_element(By.NAME, "username")
            password_field = self.driver.find_element(By.NAME, "password")

            username_field.send_keys(username)
            password_field.send_keys(password)

            # Submit form. Selenium has no By.TYPE locator; an attribute
            # selector is the supported way to match on type="submit".
            login_button = self.driver.find_element(By.CSS_SELECTOR, "[type='submit']")
            login_button.click()

            # Wait for the next document to have a <body>.
            # NOTE(review): this does not verify the credentials were
            # accepted; the success message only means no step raised.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            logger.info("Authentication successful")

        except Exception as e:
            logger.error(f"Authentication failed: {e}")

# Run the demo scrape only when this file is executed directly, not when imported
if __name__ == "__main__":
    main()

2025-07-18 16:17:00,717 - INFO - Starting to scrape https://www.kluniversity.in
  addr_elem = soup.find(text=re.compile(keyword, re.I))
2025-07-18 16:17:23,396 - INFO - Scraping completed. Found 1 departments, 2 courses, 2 faculty, 8 news items
2025-07-18 16:17:23,402 - INFO - Data saved to scraped_data



🏫 University: About Us
📝 Description: ...
🏢 Departments: 1
📚 Courses: 2
👨‍🏫 Faculty: 2
📰 News: 8
📞 Contact: {'phones': ['91781592683', '+91799799838', '9849519527', '18462450823', '+91799799572', '15461422121', '7815901716'], 'address': '.trigger_popup {\r\n                    transform: rotate(90deg) !important;\r\n                    position: fixed; \r\n                    top: 39%; \r\n                   right:-46px;\r\n                    z-index: 999;\r\n                    cursor: pointer;\r\n                    background-color: #b8292f;\r\n                    border-color: #b8292f;\r\n                    border-radius: 5px;\r\n                    border-bottom-right-radius: 0;\r\n                    border-bottom-left-radius: 0; \r\n                    padding: 10px 12px;\r\n                    font-size: 18px;\r\n                    color: #fff;\r\n                    line-height: 1.33;         \r\n                    /* visibility: hidden;          */\r\n                }\r