In [1]:
#!/usr/bin/env python3
"""
KL University Optimized Scraper
Specifically designed for https://www.kluniversity.in/
Handles ASP.NET postbacks, JavaScript content, and dynamic loading
"""

import time
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin, urlparse
import logging
from dataclasses import dataclass
from typing import List, Dict, Optional
import os
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class KLUData:
    """Data structure for KL University information"""
    notifications: List[Dict]
    departments: List[Dict]
    courses: List[Dict]
    faculty: List[Dict]
    news: List[Dict]
    achievements: List[Dict]
    contact_info: Dict
    scraped_at: str

class KLUniversityScraper:
    """
    Specialized scraper for KL University website
    Handles ASP.NET specific challenges:
    - ViewState management
    - PostBack events
    - Dynamic content loading
    - JavaScript-dependent sections
    """
    
    def __init__(self, delay: float = 3.0):
        self.base_url = "https://www.kluniversity.in"
        self.delay = delay
        
        # Setup session for requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        
        # Setup Selenium
        self.setup_selenium()
    
    def setup_selenium(self):
        """Configure Selenium WebDriver for KLU website"""
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            logger.info("Selenium WebDriver initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Selenium: {e}")
            raise
    
    def get_page_with_selenium(self, url: str, wait_for_element: str = "body") -> BeautifulSoup:
        """Get page content using Selenium with proper wait"""
        try:
            self.driver.get(url)
            
            # Wait for specific element to load
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, wait_for_element))
            )
            
            # Additional wait for dynamic content
            time.sleep(self.delay)
            
            html = self.driver.page_source
            return BeautifulSoup(html, 'html.parser')
            
        except TimeoutException:
            logger.warning(f"Timeout waiting for {url} to load")
            return None
        except Exception as e:
            logger.error(f"Error loading {url}: {e}")
            return None
    
    def get_page_with_requests(self, url: str) -> BeautifulSoup:
        """Get page content using requests (for static content)"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            logger.error(f"Error fetching {url} with requests: {e}")
            return None
    
    def scrape_main_page_notifications(self) -> List[Dict]:
        """Scrape notifications from main page"""
        logger.info("Scraping main page notifications...")
        
        soup = self.get_page_with_selenium(self.base_url)
        if not soup:
            return []
        
        notifications = []
        
        # Look for notification links (based on the content structure observed)
        notification_patterns = [
            'a[href*="pdf"]',
            'a[href*="notification"]',
            'a[href*="admit"]',
            'a[href*="admission"]'
        ]
        
        for pattern in notification_patterns:
            elements = soup.select(pattern)
            for elem in elements:
                text = elem.get_text(strip=True)
                href = elem.get('href')
                
                if href and text and len(text) > 10:
                    notifications.append({
                        'title': text,
                        'url': urljoin(self.base_url, href),
                        'type': 'notification',
                        'date': self.extract_date_from_text(text)
                    })
        
        # Remove duplicates
        seen = set()
        unique_notifications = []
        for notif in notifications:
            if notif['title'] not in seen:
                seen.add(notif['title'])
                unique_notifications.append(notif)
        
        logger.info(f"Found {len(unique_notifications)} notifications")
        return unique_notifications[:20]  # Limit to recent 20
    
    def scrape_departments(self) -> List[Dict]:
        """Scrape department information"""
        logger.info("Scraping departments...")
        
        departments = []
        
        # Common department pages to check
        dept_urls = [
            f"{self.base_url}/cse/",
            f"{self.base_url}/ece/",
            f"{self.base_url}/mech/",
            f"{self.base_url}/civil/",
            f"{self.base_url}/eee/"
        ]
        
        # Try to find department links from main page
        main_soup = self.get_page_with_selenium(self.base_url)
        if main_soup:
            dept_links = main_soup.find_all('a', href=re.compile(r'/(cse|ece|mech|civil|eee|mba|csa)/'))
            for link in dept_links:
                dept_url = urljoin(self.base_url, link.get('href'))
                if dept_url not in dept_urls:
                    dept_urls.append(dept_url)
        
        # Scrape each department
        for dept_url in dept_urls[:10]:  # Limit to avoid too many requests
            try:
                soup = self.get_page_with_requests(dept_url)
                if soup:
                    dept_name = self.extract_department_name(soup, dept_url)
                    if dept_name:
                        departments.append({
                            'name': dept_name,
                            'url': dept_url,
                            'description': self.extract_department_description(soup)
                        })
                
                time.sleep(self.delay)
                
            except Exception as e:
                logger.warning(f"Error scraping department {dept_url}: {e}")
                continue
        
        logger.info(f"Found {len(departments)} departments")
        return departments
    
    def scrape_courses_programs(self) -> List[Dict]:
        """Scrape courses and programs"""
        logger.info("Scraping courses and programs...")
        
        courses = []
        
        # Look for academic/courses pages
        course_urls = [
            f"{self.base_url}/academics.aspx",
            f"{self.base_url}/programmes.aspx",
            f"{self.base_url}/courses.aspx"
        ]
        
        for course_url in course_urls:
            try:
                soup = self.get_page_with_selenium(course_url)
                if soup:
                    # Extract course information
                    course_links = soup.find_all('a', href=re.compile(r'(course|program|degree)'))
                    for link in course_links:
                        course_name = link.get_text(strip=True)
                        course_href = link.get('href')
                        
                        if course_name and len(course_name) > 5:
                            courses.append({
                                'name': course_name,
                                'url': urljoin(self.base_url, course_href) if course_href else course_url,
                                'level': self.extract_course_level(course_name),
                                'department': self.extract_department_from_course(course_name)
                            })
                
                time.sleep(self.delay)
                
            except Exception as e:
                logger.warning(f"Error scraping courses from {course_url}: {e}")
                continue
        
        # Remove duplicates
        seen = set()
        unique_courses = []
        for course in courses:
            if course['name'] not in seen:
                seen.add(course['name'])
                unique_courses.append(course)
        
        logger.info(f"Found {len(unique_courses)} courses")
        return unique_courses[:50]  # Limit to 50 courses
    
    def scrape_faculty(self) -> List[Dict]:
        """Scrape faculty information"""
        logger.info("Scraping faculty information...")
        
        faculty = []
        
        # Look for faculty pages
        faculty_urls = [
            f"{self.base_url}/faculty.aspx",
            f"{self.base_url}/staff.aspx"
        ]
        
        for faculty_url in faculty_urls:
            try:
                soup = self.get_page_with_selenium(faculty_url)
                if soup:
                    # Extract faculty information
                    faculty_elements = soup.find_all(['div', 'td', 'span'], text=re.compile(r'Dr\.|Prof\.|Mr\.|Ms\.'))
                    
                    for elem in faculty_elements:
                        faculty_name = elem.get_text(strip=True)
                        if len(faculty_name) > 3 and len(faculty_name) < 100:
                            faculty.append({
                                'name': faculty_name,
                                'url': faculty_url,
                                'department': self.extract_department_from_context(elem),
                                'designation': self.extract_designation(faculty_name)
                            })
                
                time.sleep(self.delay)
                
            except Exception as e:
                logger.warning(f"Error scraping faculty from {faculty_url}: {e}")
                continue
        
        logger.info(f"Found {len(faculty)} faculty members")
        return faculty[:100]  # Limit to 100 faculty
    
    def scrape_achievements(self) -> List[Dict]:
        """Scrape university achievements and awards"""
        logger.info("Scraping achievements...")
        
        achievements = []
        
        # From the main page content, extract achievements
        soup = self.get_page_with_selenium(self.base_url)
        if soup:
            # Look for achievement-related text
            achievement_keywords = ['award', 'ranking', 'recognition', 'prize', 'achievement']
            
            for keyword in achievement_keywords:
                elements = soup.find_all(text=re.compile(keyword, re.I))
                for elem in elements:
                    if elem.parent:
                        achievement_text = elem.parent.get_text(strip=True)
                        if len(achievement_text) > 50:
                            achievements.append({
                                'title': achievement_text[:200],
                                'type': keyword,
                                'year': self.extract_year_from_text(achievement_text),
                                'description': achievement_text[:500]
                            })
        
        logger.info(f"Found {len(achievements)} achievements")
        return achievements[:15]  # Limit to 15 achievements
    
    def extract_contact_info(self, soup: BeautifulSoup) -> Dict:
        """Extract contact information"""
        contact_info = {}
        
        # Extract phone numbers
        phone_pattern = r'[\+]?[0-9]{10,15}'
        phones = re.findall(phone_pattern, str(soup))
        if phones:
            contact_info['phones'] = list(set(phones))
        
        # Extract email addresses
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        emails = re.findall(email_pattern, str(soup))
        if emails:
            contact_info['emails'] = list(set(emails))
        
        # Extract address
        address_elem = soup.find(text=re.compile('address|location|campus', re.I))
        if address_elem and address_elem.parent:
            contact_info['address'] = address_elem.parent.get_text(strip=True)
        
        return contact_info
    
    # Helper methods
    def extract_date_from_text(self, text: str) -> str:
        """Extract date from text"""
        date_pattern = r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(\d{4}[-/]\d{1,2}[-/]\d{1,2})'
        match = re.search(date_pattern, text)
        return match.group() if match else ''
    
    def extract_year_from_text(self, text: str) -> str:
        """Extract year from text"""
        year_pattern = r'20\d{2}'
        match = re.search(year_pattern, text)
        return match.group() if match else ''
    
    def extract_department_name(self, soup: BeautifulSoup, url: str) -> str:
        """Extract department name from page"""
        # Try to get from title or h1
        title_elem = soup.find('title')
        if title_elem:
            return title_elem.get_text(strip=True)
        
        h1_elem = soup.find('h1')
        if h1_elem:
            return h1_elem.get_text(strip=True)
        
        # Extract from URL
        path = urlparse(url).path
        return path.split('/')[-2] if len(path.split('/')) > 1 else 'Unknown'
    
    def extract_department_description(self, soup: BeautifulSoup) -> str:
        """Extract department description"""
        # Look for description in common places
        desc_selectors = ['.description', '#description', '.about', '#about']
        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                return elem.get_text(strip=True)[:500]
        
        # Get first paragraph
        p_elem = soup.find('p')
        if p_elem:
            return p_elem.get_text(strip=True)[:500]
        
        return ''
    
    def extract_course_level(self, course_name: str) -> str:
        """Extract course level (UG/PG/PhD)"""
        course_name_lower = course_name.lower()
        if any(term in course_name_lower for term in ['b.tech', 'b.e', 'bachelor', 'btech']):
            return 'Undergraduate'
        elif any(term in course_name_lower for term in ['m.tech', 'm.e', 'master', 'mtech', 'mba']):
            return 'Postgraduate'
        elif any(term in course_name_lower for term in ['ph.d', 'phd', 'doctorate']):
            return 'Doctoral'
        return 'Unknown'
    
    def extract_department_from_course(self, course_name: str) -> str:
        """Extract department from course name"""
        course_name_lower = course_name.lower()
        if 'computer' in course_name_lower or 'cse' in course_name_lower:
            return 'Computer Science'
        elif 'mechanical' in course_name_lower or 'mech' in course_name_lower:
            return 'Mechanical'
        elif 'electrical' in course_name_lower or 'eee' in course_name_lower:
            return 'Electrical'
        elif 'civil' in course_name_lower:
            return 'Civil'
        elif 'electronics' in course_name_lower or 'ece' in course_name_lower:
            return 'Electronics'
        return 'General'
    
    def extract_department_from_context(self, elem) -> str:
        """Extract department from element context"""
        # Look for department indicators in parent elements
        parent = elem.parent
        if parent:
            parent_text = parent.get_text().lower()
            if 'cse' in parent_text or 'computer' in parent_text:
                return 'Computer Science'
            elif 'ece' in parent_text or 'electronics' in parent_text:
                return 'Electronics'
            elif 'mech' in parent_text or 'mechanical' in parent_text:
                return 'Mechanical'
        return 'Unknown'
    
    def extract_designation(self, name: str) -> str:
        """Extract designation from name"""
        if name.startswith('Dr.'):
            return 'Doctor'
        elif name.startswith('Prof.'):
            return 'Professor'
        elif name.startswith('Mr.'):
            return 'Mr.'
        elif name.startswith('Ms.'):
            return 'Ms.'
        return 'Unknown'
    
    def scrape_all_data(self) -> KLUData:
        """Main scraping method"""
        logger.info("Starting comprehensive KL University scraping...")
        
        # Get main page for contact info
        main_soup = self.get_page_with_selenium(self.base_url)
        contact_info = self.extract_contact_info(main_soup) if main_soup else {}
        
        # Scrape all sections
        notifications = self.scrape_main_page_notifications()
        departments = self.scrape_departments()
        courses = self.scrape_courses_programs()
        faculty = self.scrape_faculty()
        achievements = self.scrape_achievements()
        
        # Create data object
        klu_data = KLUData(
            notifications=notifications,
            departments=departments,
            courses=courses,
            faculty=faculty,
            news=notifications,  # Notifications can serve as news
            achievements=achievements,
            contact_info=contact_info,
            scraped_at=datetime.now().isoformat()
        )
        
        logger.info("Scraping completed successfully!")
        return klu_data
    
    def save_data(self, data: KLUData, output_dir: str = "klu_scraped_data"):
        """Save scraped data"""
        os.makedirs(output_dir, exist_ok=True)
        
        # Save as JSON
        json_path = os.path.join(output_dir, "klu_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data.__dict__, f, indent=2, ensure_ascii=False)
        
        # Save individual components as CSV
        if data.notifications:
            pd.DataFrame(data.notifications).to_csv(
                os.path.join(output_dir, "notifications.csv"), index=False
            )
        
        if data.departments:
            pd.DataFrame(data.departments).to_csv(
                os.path.join(output_dir, "departments.csv"), index=False
            )
        
        if data.courses:
            pd.DataFrame(data.courses).to_csv(
                os.path.join(output_dir, "courses.csv"), index=False
            )
        
        if data.faculty:
            pd.DataFrame(data.faculty).to_csv(
                os.path.join(output_dir, "faculty.csv"), index=False
            )
        
        if data.achievements:
            pd.DataFrame(data.achievements).to_csv(
                os.path.join(output_dir, "achievements.csv"), index=False
            )
        
        logger.info(f"Data saved to {output_dir}")
    
    def __del__(self):
        """Cleanup"""
        if hasattr(self, 'driver'):
            try:
                self.driver.quit()
            except:
                pass

# Usage example
def main():
    """Example usage"""
    try:
        # Initialize scraper
        scraper = KLUniversityScraper(delay=2.0)
        
        # Scrape all data
        klu_data = scraper.scrape_all_data()
        
        # Save data
        scraper.save_data(klu_data)
        
        # Print summary
        print(f"\n🏫 KL University Scraping Summary:")
        print(f"📢 Notifications: {len(klu_data.notifications)}")
        print(f"🏢 Departments: {len(klu_data.departments)}")
        print(f"📚 Courses: {len(klu_data.courses)}")
        print(f"👨‍🏫 Faculty: {len(klu_data.faculty)}")
        print(f"🏆 Achievements: {len(klu_data.achievements)}")
        print(f"📞 Contact Info: {klu_data.contact_info}")
        
    except Exception as e:
        logger.error(f"Scraping failed: {e}")

if __name__ == "__main__":
    main()

INFO:__main__:Selenium WebDriver initialized successfully
INFO:__main__:Starting comprehensive KL University scraping...
  address_elem = soup.find(text=re.compile('address|location|campus', re.I))
INFO:__main__:Scraping main page notifications...
INFO:__main__:Found 29 notifications
INFO:__main__:Scraping departments...
ERROR:__main__:Error fetching https://www.kluniversity.in/cse/ with requests: 404 Client Error: Not Found for url: https://www.kluniversity.in/cse/
ERROR:__main__:Error fetching https://www.kluniversity.in/mech/ with requests: 404 Client Error: Not Found for url: https://www.kluniversity.in/mech/
ERROR:__main__:Error fetching https://www.kluniversity.in/civil/ with requests: 404 Client Error: Not Found for url: https://www.kluniversity.in/civil/
INFO:__main__:Found 3 departments
INFO:__main__:Scraping courses and programs...
INFO:__main__:Found 2 courses
INFO:__main__:Scraping faculty information...
  faculty_elements = soup.find_all(['div', 'td', 'span'], text=re.comp


🏫 KL University Scraping Summary:
📢 Notifications: 20
🏢 Departments: 3
📚 Courses: 2
👨‍🏫 Faculty: 0
🏆 Achievements: 15
📞 Contact Info: {'phones': ['1752835982196', '504496530959', '8832122115', '9849519527', '444468980536', '184624508233022', '+917997998383', '7815901716', '1752835982189', '1752835982242', '+917997995727', '1752835982249', '1752835981569', '1316721752', '917815926834', '154614221219641', '1752835981549', '1752835982'], 'address': '.trigger_popup {\n                    transform: rotate(90deg) !important;\n                    position: fixed; \n                    top: 39%; \n                   right:-46px;\n                    z-index: 999;\n                    cursor: pointer;\n                    background-color: #b8292f;\n                    border-color: #b8292f;\n                    border-radius: 5px;\n                    border-bottom-right-radius: 0;\n                    border-bottom-left-radius: 0; \n                    padding: 10px 12px;\n                 

In [2]:
import requests
from bs4 import BeautifulSoup

resp = requests.get("https://www.kluniversity.in/")
soup = BeautifulSoup(resp.text, "html.parser")
# e.g. list all notification titles
for li in soup.select("ul.notifications li a"):
    print(li.get_text(strip=True), li["href"])