In [89]:
import requests
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


# Start a Chrome session and load the course listing page.
driver = webdriver.Chrome()
driver.get('https://talentedge.com/browse-courses')

# Collect the anchors matched by the 'div.know-more-p a' selector.
a_tags = driver.find_elements(By.CSS_SELECTOR, 'div.know-more-p a')

# Keep the link target of the first ten anchors only.
href_values = list(map(lambda anchor: anchor.get_attribute('href'), a_tags[:10]))

# One accumulator list per output column; the scraping loop appends to these.
fees_inr = []
description = []
prerequisites_eligibility_criteria = []
what_you_will_learn = []
faculty_names = []
faculty_designations = []
course_titles = []
skills = []
institute_names = []
durations = []
working_professionals_info = []
starting_dates = []
contents = []

def scrape_course_details(url, timeout=30):
    """Download a course page and collect its duration, effort and start-date texts.

    Two page layouts are probed: a ``duration-of-course`` list, and the
    ``apply-before-date`` / ``months`` / ``left-cer`` blocks. Everything found
    in either layout is collected, in that order.

    Args:
        url: Address of the course detail page.
        timeout: Seconds ``requests`` waits for the server. Without a timeout
            a stalled connection would block the whole run indefinitely.

    Returns:
        dict with the keys ``"starting_dates"``, ``"working_professional_info"``
        and ``"durations"``; each value is a (possibly empty) list of strings.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    starting_dates = []
    working_professional_info = []
    durations = []

    # NOTE: the HTTP status is not checked here; an error page simply yields
    # empty lists because none of the selectors below match.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Scrape duration and working professional information
    duration_div = soup.find('div', class_='duration-of-course')
    if duration_div:
        ul_tag = duration_div.find('ul')
        li_elements = ul_tag.find_all('li') if ul_tag else []
        if li_elements:
            # First <li>: first <p> is the duration, second <p> the effort text.
            p_tags = li_elements[0].find_all('p')
            if p_tags:
                durations.append(p_tags[0].text.strip())
            if len(p_tags) > 1:
                working_professional_info.append(p_tags[1].text.strip())
        if len(li_elements) > 1:
            # Second <li>: its first <p> is taken as the start date.
            start_date_p = li_elements[1].find('p')
            if start_date_p:
                starting_dates.append(start_date_p.text.strip())

    # Scrape start date
    for div in soup.find_all('div', class_='apply-before-date'):
        starting_dates.extend(
            heading.get_text(strip=True)
            for heading in div.find_all('h2', class_='no')
        )

    # Scrape duration (alternative method)
    for div in soup.find_all('div', class_='months pb-1 pt-2'):
        durations.extend(
            span.get_text(strip=True) + " month"
            for span in div.find_all('span', class_='monthsOfCourse')
        )

    # Scrape working professional information related to "Effort"
    for div in soup.find_all('div', class_='left-cer'):
        for ul in div.find_all('ul', style='list-style-type: disc !important;'):
            for li in ul.find_all('li'):
                text = li.get_text(strip=True)
                if 'Effort' in text:
                    working_professional_info.append(text)

    # Return the scraped data
    return {
        "starting_dates": starting_dates,
        "working_professional_info": working_professional_info,
        "durations": durations
    }

# Iterate through each href and extract the data.
#
# Every iteration appends exactly one value to every column list, so that
# position i of each list always describes href_values[i]. Pages that cannot
# be fetched get a NaN placeholder row instead of being skipped.
#
# NOTE: each page is downloaded twice per course, once here and once inside
# scrape_course_details(), which performs its own request.
_REQUEST_TIMEOUT = 30  # seconds to wait for a course page before giving up


def _append_blank_row():
    """Append NaN to every column list for a course whose page is unavailable."""
    for column in (
        fees_inr, description, prerequisites_eligibility_criteria,
        what_you_will_learn, faculty_names, faculty_designations,
        course_titles, skills, institute_names, durations,
        working_professionals_info, starting_dates, contents,
    ):
        column.append(np.nan)


try:
    for href in href_values:
        try:
            response = requests.get(href, timeout=_REQUEST_TIMEOUT)
            page_ok = response.status_code == 200
            # scrape_course_details() issues its own request, so it can fail too.
            course_details = scrape_course_details(href) if page_ok else None
        except requests.RequestException:
            page_ok = False
        if not page_ok:
            _append_blank_row()
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Duration, effort and start date come from scrape_course_details
        durations.append(" | ".join(course_details['durations']) if course_details['durations'] else np.nan)
        working_professionals_info.append(" | ".join(course_details['working_professional_info']) if course_details['working_professional_info'] else np.nan)
        starting_dates.append(" | ".join(course_details['starting_dates']) if course_details['starting_dates'] else np.nan)

        # Extract payment amounts and GST information.
        # All amounts found on the page are merged into ONE entry so the fees
        # column stays aligned with the other columns.
        fee_values = []
        for payment in soup.find_all('div', class_='program-details-total-pay-amt-right'):
            text = payment.get_text(separator=" ", strip=True)
            clean_text = text.replace("INR", "").replace("USD", "").replace(" ", "")
            if "+GST" in clean_text:
                # Drop whatever follows the GST marker.
                clean_text = clean_text.split("+GST")[0] + "+GST"
            else:
                # Drop a trailing "Due by: ..." note (spaces were removed above).
                clean_text = clean_text.split("Dueby:")[0]
            fee_values.append(clean_text)
        fees_inr.append(" | ".join(fee_values) if fee_values else np.nan)

        # Extract course title: banner <h1>, then banner <h2>, then the
        # 'pl-title' heading used by the other page layout.
        title = None
        title_div = soup.find('div', class_='el-ln-banner-left')
        if title_div:
            for tag_name in ('h1', 'h2'):
                heading = title_div.find(tag_name)
                heading_text = heading.text.strip() if heading else None
                if heading_text:
                    title = heading_text
                    break
        if not title:
            fallback_heading = soup.find('h1', class_='pl-title')
            if fallback_heading:
                title = fallback_heading.text.strip()
        course_titles.append(title)

        # Extract descriptions
        descriptions = []
        for div in soup.find_all('div', class_='el-lap-section2-inner el-ln-p'):
            for p in div.find_all('p'):
                descriptions.append(p.get_text(strip=True))
        for div in soup.find_all('div', class_='desc_less'):
            p_tag = div.find('p')
            if p_tag:
                descriptions.append(p_tag.get_text(strip=True))
        description.append(" | ".join(descriptions) if descriptions else None)

        # Extract prerequisites and eligibility criteria
        prerequisites_eligibility = []
        for div in soup.find_all('div', class_='tab-content current'):
            ul_tag = div.find('ul')
            if ul_tag:
                for li in ul_tag.find_all('li'):
                    prerequisites_eligibility.append(li.get_text(strip=True))
        for div in soup.find_all('div', class_='eligible-right-top-list'):
            p_tag = div.find('p')
            if p_tag:
                prerequisites_eligibility.append(p_tag.get_text(strip=True))
        prerequisites_eligibility_criteria.append(" | ".join(prerequisites_eligibility))

        # Extract 'What You Will Learn'
        learning_points = []
        for div in soup.find_all('div', class_='col-md-8'):
            ul_tag = div.find('ul', style='list-style-type: disc;')
            if ul_tag:
                for li in ul_tag.find_all('li'):
                    learning_points.append(li.get_text(strip=True))
        for ul_div in soup.find_all('div', class_='pl-deeper-undstnd to_flex_ul'):
            ul_tag = ul_div.find('ul')
            if ul_tag:
                for li in ul_tag.find_all('li'):
                    learning_points.append(li.get_text(strip=True))
        what_you_will_learn.append(" | ".join(learning_points))

        # Extract faculty names and designations
        names_found = []
        designations_found = []
        # In the 'el-fec-right' layout the name is read from <p> and the
        # designation from <h4>.
        for div in soup.find_all('div', class_='el-fec-right'):
            name_tag = div.find('p')
            designation_tag = div.find('h4')
            if name_tag:
                names_found.append(name_tag.get_text(strip=True))
            if designation_tag:
                designations_found.append(designation_tag.get_text(strip=True))
        # In the 'best-fdetail' layout the tags are the other way round.
        for div in soup.find_all('div', class_='best-fdetail'):
            name_tag = div.find('h4', class_='best-fname')
            designation_tag = div.find('p')
            if name_tag:
                names_found.append(name_tag.get_text(strip=True))
            if designation_tag:
                designations_found.append(designation_tag.get_text(strip=True))
        faculty_names.append(" | ".join(names_found))
        faculty_designations.append(" | ".join(designations_found))

        # Extract content (syllabus headings)
        content = []
        for div in soup.find_all('div', class_='el-lap-section4-repeater'):
            for tag in div.find_all('div', class_='accordionButton'):
                content.append(tag.get_text(strip=True))
        for div in soup.find_all('div', class_='sylab-tab-ul'):
            for ul in div.find_all('ul', class_='nav nav-tabs syl-ul'):
                for li in ul.find_all('li'):
                    a_tag = li.find('a')
                    if a_tag:
                        content.append(a_tag.get_text(strip=True))
        contents.append(" | ".join(content))

        # Extract skills
        skill_set = []
        for div in soup.find_all('div', class_='el-lap-hr-r'):
            for strong in div.find_all('strong'):
                skill_set.append(strong.get_text(strip=True))
        skills_div = soup.find('div', class_='key-skills-sec')
        skills_ul = skills_div.find('ul') if skills_div else None
        if skills_ul:
            skills_combined = ", ".join(li.text.strip() for li in skills_ul.find_all('li'))
            if skills_combined:
                skill_set.append(skills_combined)
        skills.append(", ".join(skill_set))

        # Extract institute names
        institute_names_temp = []
        for div in soup.find_all('div', class_='col-md-8'):
            for tag in div.find_all('div', class_='el-lap-section6-inner'):
                headings = tag.find_all('div', class_='el-lap-sec-hd white')
                # Sections without a heading are skipped: a NaN placeholder
                # here would make the str.join() below raise TypeError.
                if headings:
                    # [6:] drops the first six characters of the heading
                    # (presumably an "About " prefix; verify against the page).
                    institute_names_temp.append(headings[0].get_text(strip=True)[6:])
        institute_heading = soup.find('h4', class_='about-ititle')
        if institute_heading:
            institute_names_temp.append(institute_heading.text.strip())
        institute_names.append(" | ".join(institute_names_temp))
finally:
    # Always release the browser, even when scraping fails part-way through.
    driver.quit()


# Number of rows in the final table: at least 10 (the number of course links
# requested), but never fewer than the longest column list. pad_list() only
# extends lists, and pd.DataFrame() raises ValueError on columns of unequal
# length, so a fixed 10 would crash as soon as any list grew beyond it.
max_length = max(
    10,
    *(
        len(column)
        for column in (
            fees_inr, description, prerequisites_eligibility_criteria,
            what_you_will_learn, faculty_names, faculty_designations,
            course_titles, skills, institute_names, durations,
            working_professionals_info, starting_dates, contents,
            href_values,
        )
    ),
)

# Padding helper
def pad_list(lst, length):
    """Return a new list holding ``lst`` followed by NaN fillers up to ``length`` items.

    A list that already has ``length`` or more items comes back as an
    unchanged copy; nothing is ever truncated.
    """
    shortfall = length - len(lst)
    filler = [np.nan for _ in range(shortfall)]
    return lst + filler

# Pad each list to the maximum length so every column has the same row count
fees_inr = pad_list(fees_inr, max_length)
description = pad_list(description, max_length)
prerequisites_eligibility_criteria = pad_list(prerequisites_eligibility_criteria, max_length)
what_you_will_learn = pad_list(what_you_will_learn, max_length)
faculty_names = pad_list(faculty_names, max_length)
faculty_designations = pad_list(faculty_designations, max_length)
course_titles = pad_list(course_titles, max_length)
skills = pad_list(skills, max_length)
institute_names = pad_list(institute_names, max_length)
durations = pad_list(durations, max_length)
working_professionals_info = pad_list(working_professionals_info, max_length)
starting_dates = pad_list(starting_dates, max_length)
contents = pad_list(contents, max_length)
Course_link = pad_list(href_values, max_length)

# Assemble the padded columns into one table, one row per course link.
df = pd.DataFrame({
    'Course Title': course_titles,
    'Fees (INR)': fees_inr,
    'Description': description,
    'Prerequisites & Eligibility': prerequisites_eligibility_criteria,
    'What You Will Learn': what_you_will_learn,
    'Faculty Names': faculty_names,
    'Faculty Designations': faculty_designations,
    'Skills': skills,
    'Institute Name': institute_names,
    'Duration': durations,
    'Working Professionals Info': working_professionals_info,
    'Starting Date': starting_dates,
    'Content': contents,
    'Course_link': Course_link,
})

# Replace missing values (NaN) with None. The result must be assigned back:
# the conversion returns a new DataFrame and leaves the original untouched,
# so calling it without assignment has no effect. Casting to object first
# lets columns that would otherwise be float hold None.
df = df.astype(object).where(df.notna(), None)


print(df)



{'Fees (INR)': ['\n1226438\n+GST', '\n15000', '\n105932\n+GST', '\n1287', '\n127119\n+GST', '\n0', '\n740741\n+GST', '\n9055'], 'Description': ['Go beyond the boundaries of your job, without quitting it. Be a sought after specialist with the same doctorate degree awarded on campus. Earn the ‘Dr’ title after successfully completing the program.\xa0The Doctor of Business Administration (DBA) program is designed for professionals who want to further their career with the most advanced business degree progrRead More', 'This program deals with the\xa0collection and management of talent data and its effective application. The course helps to develop the HR analytical skills to handle the data-driven insights and improve the workforce. The managerial skills acquired during this program\xa0helps to take informed decisions and promote employee experience. It is targeted for HR practitionersRead More', 'Talentedge is proud to offer Data Analytics 360 Certificate Program for Indian Executives in 