In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import random
import time
import re
import os
from selenium.common.exceptions import TimeoutException

# Attach to an already-open Chrome through its debugger address instead of
# launching a new browser (expects Chrome to be listening on 127.0.0.1:9222).
options = Options()
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Create the WebDriver. A connection failure propagates unchanged; the success
# message is printed only once the session object exists.
driver = webdriver.Chrome(options=options)
print("成功连接到 Chrome 浏览器！")

In [None]:

# Display titles of the courses to scrape. This list is index-aligned with
# `course_links`: the main loop looks up course_titles[i] for course_links[i]
# and uses the sanitized title as the output directory / file name.
course_titles = [
 'Statistics for Data Analysis',
 'Excel MO-201 Exam Prep',
 'Excel Pro Tips: Visualization',
 'Excel Pro Tips: Analytics',
 'Excel Pro Tips: Formulas',
 'Excel Pro Tips: Productivity',
 'Excel Pro Tips: Formatting',
 'Excel Charts & Graphs',
 'Intro to Alteryx Designer',
 'Launching Your Data Career',
 'Machine Learning 2: Classification',
 'Machine Learning 4: Unsupervised Learning',
 'Acing the Analyst Interview',
 'Cloud Basics for Data Professionals',
 'Building a World-Class Portfolio',
 'Finding Your Path in Data']


# Enrollment URLs, one per course, opened in order by the main loop with
# driver.get(). Must stay index-aligned with `course_titles` (same length,
# same order), because the title for a link is looked up by position.
course_links = [
 'https://thinkificenroller.mavenanalytics.io/course/33468c00-4ab7-4e5b-a5ba-9729e73590ac?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/d2dc6ff3-ccda-41a1-bf50-3bcc6bcabcb0?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/5f85edab-ecdc-47cf-b3a7-9e5ad953bcb4?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/9bb95bb7-38bb-43ec-b7d3-2660a9b38347?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/8adc05a7-1f0f-441b-aca2-380f8a3235b3?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/9aee1658-ca1c-4b97-8259-bea7ffb2e7e3?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/6634b138-8b89-4d90-97bc-0f710509bd16?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/9014b0fd-3403-4b06-9421-cbe09b49b746?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/cf4eb33e-99e7-4daa-894b-d72e5d84393b?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/30ad9678-de66-49ea-8e62-5eea9d9f4571?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/496739ae-a7d5-4f04-9e28-9c1e894851b9?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/2adde790-f588-4915-b3cb-5196b64faccc?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/05e431dd-74bb-4898-b904-86327b7d2f46?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/4c87bb7f-6dbb-498f-888f-0eab40118ffe?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/9bfe87a3-f16b-4fee-b084-e3b9f0730044?error_callback=https://app.mavenanalytics.io/learning-plan',
 'https://thinkificenroller.mavenanalytics.io/course/5a440b5f-7806-4ea6-bdd8-72af12bb715c?error_callback=https://app.mavenanalytics.io/learning-plan']

成功连接到 Chrome 浏览器！


In [None]:
def sanitize_filename(title):
    """Return `title` made safe for use as a file or directory name.

    Every colon is rewritten as " -" (so "A: B" becomes "A - B"); the
    characters \\ / * ? " < > | are then removed and surrounding whitespace
    is trimmed.
    """
    with_dashes = re.sub(':', ' -', title)
    stripped_of_reserved = re.sub(r'[:\\/*?"<>|]', '', with_dashes)
    return stripped_of_reserved.strip()


def final_output(path, name, url):
    """Format one download entry as two lines.

    The first line is "#O," followed by the target directory; the second is
    "<name>.mp4,<url>".
    """
    directory_line = f"#O,{path}"
    file_line = f"{name}.mp4,{url}"
    return "\n".join((directory_line, file_line))


def click_element(target_id, tag_name=None):
    """Click the element with DOM id `target_id` using the global `driver`.

    Args:
        target_id: Value of the element's ``id`` attribute.
        tag_name: Optional tag the located element must have (e.g. "a" or
            "div"). If given and the element's tag differs, nothing is clicked.

    Returns:
        True if the element was located within 5 s, scrolled into view,
        clicked, and an element matching ``[aria-current="step"]`` was present
        within 5 s afterwards. False on a tag mismatch or on any error raised
        along the way (timeout, stale element, dead session, ...).

    Only ``Exception`` subclasses are converted to False; KeyboardInterrupt and
    SystemExit propagate so a running cell can still be interrupted.
    """
    try:
        target = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, target_id))
        )
        if tag_name and target.tag_name != tag_name:
            return False
        actions = ActionChains(driver)
        # Scroll first so the mouse move below targets an on-screen element.
        driver.execute_script("arguments[0].scrollIntoView(true);", target)
        actions.move_to_element(target).click().perform()
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[aria-current="step"]'))
        )
        time.sleep(random.uniform(1, 2))  # random delay so clicks are not issued too quickly
        return True
    except Exception:
        # Best-effort click: callers retry on False. A bare `except:` here
        # would also swallow Ctrl-C / kernel interrupts, so it is narrowed.
        return False

def click_by_xpath(xpath, timeout=10):
    """Click the element located by `xpath` using the global `driver`.

    Args:
        xpath: XPath expression identifying the element.
        timeout: Maximum number of seconds to wait for the element to be
            present in the DOM before giving up.

    Returns:
        True if the element was found, scrolled into view and clicked;
        False if anything failed (a diagnostic message is printed).
    """
    try:
        # Honour the caller-supplied timeout rather than a fixed wait.
        target = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        actions = ActionChains(driver)
        driver.execute_script("arguments[0].scrollIntoView(true);", target)
        actions.move_to_element(target).click().perform()
        return True
    except Exception as e:
        # Distinguish a dead browser session from an ordinary lookup failure.
        if "invalid session id" in str(e):
            print("WebDriver 会话无效，请检查浏览器是否仍在运行或重新连接。")
        else:
            print(f"未找到元素，错误信息: {e}")
        return False


def analyze_course_page(max_retry=3, wait_sec=3):
    """
    Build a table of the chapters and lessons on the page currently loaded in `driver`.

    Chapter containers whose text mentions one of the excluded keywords
    (orientation, assessments, feedback, next step) are skipped. Every other
    chapter contributes one "chapter" row followed by one row for each of its
    links labelled "Video" (type "video") or "Download" (type "resource").
    Titles are prefixed with their position and sanitized for use as file names.
    The "output" and "course_name" columns are left empty here.

    Any exception, including finding no chapters at all, is printed and the
    parse is retried after `wait_sec` seconds, up to `max_retry` times. If all
    attempts fail, an empty DataFrame with the same columns is returned.
    """
    columns = ["id", "title", "type", "output", "course_name"]
    excluded_keywords = (
        "Orientation", "Benchmark Assessment", "Final Assessment", "Course Feedback", "Next Step"
    )
    chapter_class = "course-player__chapters-item _chapters-item_1tqvoe ember-view ui-accordion ui-widget ui-helper-reset"

    for attempt_no in range(1, max_retry + 1):
        try:
            table = pd.DataFrame(columns=columns)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # The course name is not scraped from the page; the caller takes it
            # from the external course_titles list instead.

            kept_chapters = []
            for block in soup.find_all('div', class_=chapter_class):
                if all(word not in block.text for word in excluded_keywords):
                    kept_chapters.append(block)
            if len(kept_chapters) == 0:
                raise ValueError("未找到章节目录")

            for chapter_no, block in enumerate(kept_chapters):
                heading = block.find('h2').text.strip()
                chapter_title = f"{chapter_no}. {sanitize_filename(heading)}"
                table.loc[len(table)] = [block['id'], chapter_title, "chapter", None, None]

                for position, anchor in enumerate(block.find_all('a')):
                    # One stripped entry per text line of the link (plus empty
                    # strings); the first entry is the lesson title.
                    parts = [piece.strip() for piece in re.findall(r'(.*)', anchor.text.strip())]
                    if "Video" in parts:
                        row_type = "video"
                    elif "Download" in parts:
                        row_type = "resource"
                    else:
                        continue
                    row_title = f"{position}. {sanitize_filename(parts[0])}"
                    table.loc[len(table)] = [anchor['id'], row_title, row_type, None, None]
            return table
        except Exception as e:
            print(f"Error extracting course page (attempt {attempt_no}/{max_retry}): {e}")
            time.sleep(wait_sec)

    # Every attempt failed: hand back an empty table with the expected columns.
    return pd.DataFrame(columns=columns)

    

def monitor_and_click(current_id, current_type, current_path, current_title, timeout=20):
    """
    Click one chapter/lesson entry and extract what it exposes, retrying until `timeout`.

    Args:
        current_id: DOM id of the entry to click.
        current_type: "chapter", "video" or "resource".
        current_path: Directory associated with the entry (only echoed in the
            line printed for a video).
        current_title: Title of the entry, used in printed messages.
        timeout: Overall time budget in seconds for clicking and extracting.

    Returns:
        * "video": the first Wistia delivery URL found in the first iframe,
          with its ``.m3u8`` suffix replaced by ``.mp4``.
        * "resource": list of hrefs of the download buttons, or None if the
          page shows none after a successful click.
        * "chapter": None as soon as the chapter has been clicked once.
        * None (after printing a timeout message) when the budget runs out.
    """
    expected_tag = "a" if current_type != "chapter" else "div"
    m3u8_pattern = r'(https://embed-ssl\.wistia\.com/deliveries/[a-f0-9]+)\.m3u8'

    start_time = time.time()
    while time.time() - start_time < timeout:
        # Click the entry; on failure wait briefly and try again.
        if not click_element(current_id, tag_name=expected_tag):
            time.sleep(1)
            continue

        if current_type == "chapter":
            # A chapter has nothing to extract. Returning here avoids clicking
            # it over and over until the timeout and then reporting a timeout.
            return None

        if current_type == "video":
            iframes = driver.find_elements(By.TAG_NAME, "iframe")
            if iframes:
                matches = []
                try:
                    driver.switch_to.frame(iframes[0])
                    matches = re.findall(m3u8_pattern, driver.page_source)
                except Exception as e:
                    # Keep retrying, but say why this attempt produced nothing.
                    print(f"读取视频 iframe 失败，将重试: {e}")
                finally:
                    # Always leave the iframe so the next click targets the main document.
                    driver.switch_to.default_content()
                if matches:
                    url = matches[0] + ".mp4"
                    print(f"#O,{current_path}\n{current_title}.mp4,{url}")
                    return url
        elif current_type == "resource":
            try:
                buttons = driver.find_elements(By.CSS_SELECTOR, 'a._button--default_142a8m._button--link_142a8m')
                links = []
                for button in buttons:
                    download_url = button.get_attribute("href")
                    if download_url:
                        links.append(download_url)
                # Unlike the video branch, a page without download links is
                # not retried: the result is returned right away.
                return links if links else None
            except Exception as e:
                print(f"读取下载链接失败，将重试: {e}")
        time.sleep(2)
    print(f"监听超时: {current_title}")
    return None

In [None]:


# Main driver: for every course link, open the course, build its chapter/lesson
# table, click through each entry to collect video URLs and resource links,
# and write the results (CSV plus two text files) into a per-course directory.
# A failure anywhere inside one course is printed and the loop moves on.
for course_idx, course in enumerate(tqdm(course_links, desc="课程进度")):
    try:  

        
        driver.get(course)
        # Give the course content / chapter list time to appear.
        time.sleep(20)  # fixed wait for the page to load
            
        # Titles come from the parallel course_titles list, not from the page.
        course_name = sanitize_filename(course_titles[course_idx])

        df = analyze_course_page()
        if df.empty:
            print(f"课程 {course_idx+1} 页面无内容，跳过")
            continue

        # NOTE(review): absolute, user-specific output root — adjust per machine.
        root_dir = "C:/Users/75299/Downloads/"
        
        course_dir = os.path.join(root_dir, course_name)
        os.makedirs(course_dir, exist_ok=True)
        df["path"] = None

        # 4. Create one directory per chapter and give every video/resource row
        #    that follows a chapter row (up to the next chapter) that directory.
        chapter_mask = df['type'] == 'chapter'
        for idx in df[chapter_mask].index:
            chapter_dir = os.path.join(course_dir, df.at[idx, 'title'])
            os.makedirs(chapter_dir, exist_ok=True)
            next_idx = idx + 1
            while next_idx < len(df) and df.at[next_idx, 'type'] in ['video', 'resource']:
                df.at[next_idx, 'path'] = chapter_dir
                next_idx += 1

        # 5. Process every chapter/lesson row, with a nested progress bar.
        for index in tqdm(range(len(df)), desc="章节/课时进度", leave=False):
            current_id = df.iloc[index]['id']
            current_type = df.iloc[index]['type']
            current_title = df.iloc[index]['title']
            current_path = df.iloc[index]['path']

            result = monitor_and_click(current_id, current_type, current_path, current_title)
            if result:
              # Videos are stored as a formatted two-line download entry;
              # resources as their link(s), one per line.
              if current_type == "video":
                  df.at[index, 'output'] = final_output(current_path, current_title, result)
              elif current_type == "resource":
                  df.at[index, 'output'] = "\n".join(result) if isinstance(result, list) else result
            time.sleep(random.uniform(1, 2))  # pause after each extraction
            # Checkpoint the table every 10 rows so progress survives a failure.
            if index % 10 == 0:
              df.to_csv(os.path.join(course_dir, f"{course_name}.csv"), index=False, encoding='utf-8-sig')
            
            

        # 6. Save this course's data.
        df.to_csv(os.path.join(course_dir, f"{course_name}.csv"), index=False, encoding='utf-8-sig')
        print(f"课程 {course_name} 处理完成")

        # Rows without an extracted output are dropped from the text files.
        video_outputs = df[df['type'] == 'video']['output'].dropna()
        resource_outputs = df[df['type'] == 'resource']['output'].dropna()

        with open(os.path.join(course_dir, f"{course_name}_video.txt"), "w", encoding="utf-8") as f:
            for line in video_outputs:
                f.write(line.strip() + "\n")

        with open(os.path.join(course_dir, f"{course_name}_resource.txt"), "w", encoding="utf-8") as f:
            for line in resource_outputs:
                f.write(line.strip() + "\n")

        # 7. Go back to the home page (header link, addressed by absolute XPath).
        click_by_xpath("/html/body/div[1]/div[2]/div[3]/div[1]/header/div/div/a")
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@href='/learning-plan']"))
        )
        # 8. Return to the learning-plan page via the side navigation.
        click_by_xpath("/html/body/div[2]/div/div[2]/aside/nav/div[1]/div/a[2]")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-intercom-selector="leftNav__learningPlan"]'))
        )

    except Exception as e:
        # Report the failure and continue with the next course.
        print(f"课程 {course_idx+1} 处理失败: {e}")