In [1]:
# EPO publication-server ZIP link collector - environment setup.
# Installs a Chromium browser plus the Selenium and requests Python packages.
# NOTE: the class defined later in this notebook records ZIP URLs to a text
# file; it does not download the archives themselves.

# apt-get output (stdout and stderr) is discarded; pip runs in quiet mode (-q).
!apt-get update >/dev/null 2>&1
!apt-get install -y chromium-browser >/dev/null 2>&1
!pip install selenium requests -q

import os
import time
import requests
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [25]:
class FastEPODownloader:
    """Collect ZIP link URLs from the EPO publication server for one year/week.

    A headless Chrome session opens the publication-server search page, selects
    "During" / year / week in the search form, and then walks the result pages.
    Every ``.zip`` link found is appended to ``self.url_list`` and written, one
    URL per line, to ``<download_folder>/<year>_<week>.txt``.

    Despite the class name, nothing is downloaded: ``download_zip`` only
    records the URL.

    Attributes:
        year, week: search parameters, always stored as strings.
        download_folder: directory that receives the per-week URL files.
        driver: the active Selenium WebDriver, or ``None`` when no browser
            session is open.
        total_downloads: initialised to 0; not updated by any method here.
        url_list: every URL recorded by this instance, across all runs.
    """

    # Upper bound on the number of result pages visited in a single run; it
    # only guards against an endless pagination loop.
    MAX_PAGES = 4000000

    def __init__(self, year='2025', week='30'):
        """Store the search parameters and make sure the output folder exists.

        Args:
            year: publication year (converted with ``str``).
            week: publication week (converted with ``str``).
        """
        self.year = str(year)
        self.week = str(week)
        self.download_folder = '/content/drive/MyDrive/Epo_patent'
        os.makedirs(self.download_folder, exist_ok=True)
        self.driver = None
        self.total_downloads = 0
        self.url_list = []

    def setYearWeek(self, year, week):
        """Change the year/week that the next ``run()`` will search for."""
        self.year = str(year)
        self.week = str(week)

    def setup_driver(self):
        """Start a headless Chrome WebDriver and store it in ``self.driver``.

        Returns:
            True once the driver has been created. Any exception raised by
            ``webdriver.Chrome`` propagates to the caller.
        """
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        # NOTE(review): fill_and_search() depends on page JavaScript, so the
        # '--disable-javascript' switch below presumably has no effect in this
        # Chrome build - confirm before relying on any of these switches.
        options.add_argument('--disable-images')
        options.add_argument('--disable-javascript')
        options.add_argument('--disable-css')
        options.add_argument('--disable-plugins')
        options.add_argument('--disable-extensions')
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        self.driver = webdriver.Chrome(options=options)
        return True

    def fill_and_search(self):
        """Open the search page, pick "During" / year / week and click search.

        The injected script drives the form asynchronously (each dropdown is
        clicked, then its option is looked up 300 ms later) and its completion
        is not reported back to Python, so this method relies on the fixed
        sleeps below rather than on an explicit wait.

        Returns:
            Always True; failures inside the page are only logged to the
            browser console.
        """
        self.driver.get('https://data.epo.org/publication-server/?lg=en')
        time.sleep(2)

        print(f"Start Searching for Year: {self.year}, Week: {self.week} ...")
        self.driver.execute_script("""
            const yearValue = arguments[0];
            const weekValue = arguments[1];
            const publicationDate = "During"

            async function clickDropdownAndSelect(labelPart, value) {
                return new Promise(resolve => {
                    const dropdown = document.querySelector('[aria-label*="' + labelPart + '"]');
                    if (!dropdown) {
                        console.warn(labelPart + ' dropdown not found');
                        return resolve(false);
                    }
                    dropdown.click();
                    setTimeout(() => {
                        const options = Array.from(document.querySelectorAll('[role="option"], li, div'));
                        const target = options.find(el => el.textContent.trim() === value);
                        if (target) {
                            target.click();
                            console.log('✅ ' + labelPart + ' set to ' + value);
                            resolve(true);
                        } else {
                            console.warn('❌ ' + labelPart + ' value "' + value + '" not found');
                            resolve(false);
                        }
                    }, 300);
                });
            }

            (async function run() {
                await clickDropdownAndSelect('Publication date', publicationDate);
                await clickDropdownAndSelect('Year', yearValue);
                await clickDropdownAndSelect('Week', weekValue);


                const searchBtn = document.querySelector('button[data-testid="search_button"], button[type="submit"]');
                if (searchBtn) searchBtn.click();
            })();
        """, self.year, self.week)

        print(f"🔍 Searching...")
        # Fixed wait that covers the three 300 ms dropdown steps plus the
        # search request itself.
        time.sleep(3)
        return True

    def get_all_zip_links(self):
        """Return every ``.zip`` link on the current page.

        Returns:
            Whatever the in-page script yields: a sequence of
            ``{'url': ..., 'filename': ...}`` entries, where ``filename`` is
            the last path segment of the URL without its query string, with
            ``.zip`` appended if it does not already end that way.
        """
        zip_data = self.driver.execute_script("""
            var zipLinks = [];
            var elements = document.querySelectorAll('a[href*=".zip"]');

            for (var i = 0; i < elements.length; i++) {
                var href = elements[i].href;
                if (href && href.includes('.zip')) {
                    var filename = href.split('/').pop().split('?')[0];
                    if (!filename.endsWith('.zip')) filename += '.zip';
                    zipLinks.push({url: href, filename: filename});
                }
            }
            return zipLinks;
        """)
        return zip_data

    def download_zip(self, url, filename):
        """Record ``url`` in ``self.url_list``; no download is performed.

        Args:
            url: the ZIP URL to remember.
            filename: accepted for interface compatibility; unused.

        Returns:
            Always True.
        """
        self.url_list.append(url)
        return True

    def go_next_page(self):
        """Click the first visible, enabled "next" link or button on the page.

        An element qualifies when its text contains "next" or equals ">" or
        "›", or when its href or aria-label contains "next".

        Returns:
            The script's result: true if an element was clicked, false when
            no candidate was found.
        """
        return self.driver.execute_script("""
            var nextElements = document.querySelectorAll('a, button');
            for (var i = 0; i < nextElements.length; i++) {
                var el = nextElements[i];
                var text = el.textContent.toLowerCase().trim();
                var href = el.href || '';
                var ariaLabel = (el.getAttribute('aria-label') || '').toLowerCase();

                if (el.offsetParent !== null && !el.disabled && !el.classList.contains('disabled')) {
                    if (text.includes('next') || text === '>' || text === '›' ||
                        href.includes('next') || ariaLabel.includes('next')) {
                        el.click();
                        return true;
                    }
                }
            }
            return false;
        """)

    def run(self):
        """Search the configured year/week and save every ZIP URL found.

        The URLs are written to ``<download_folder>/<year>_<week>.txt`` (the
        file is truncated first) and also appended to ``self.url_list``, which
        keeps growing across runs of the same instance.

        Returns:
            The number of URLs recorded by *this* call, or 0 if the driver or
            the search could not be set up.
        """
        print("🚀 Ultra-Fast EPO ZIP Downloader Starting...", flush=True)

        if not self.setup_driver():
            return 0

        output_file = os.path.abspath(os.path.join(self.download_folder, f"{self.year}_{self.week}.txt"))

        # url_list is shared by every run of this instance, so remember where
        # this run starts in order to report a per-run count.
        already_recorded = len(self.url_list)

        try:
            if not self.fill_and_search():
                return 0

            page = 1

            with open(output_file, "w") as f:  # Open file once, write incrementally
                while page <= self.MAX_PAGES:
                    zip_links = self.get_all_zip_links()

                    if not zip_links:
                        # An empty first page gets exactly one more look
                        # before giving up; an empty later page ends the crawl.
                        if page > 1:
                            break
                        page += 1
                        continue

                    print(f"Page {page}: {len(zip_links)} files", flush=True)

                    for link in zip_links:
                        self.download_zip(link['url'], link['filename'])  # appends to self.url_list
                        f.write(link['url'] + "\n")  # write URL per line immediately

                    if not self.go_next_page():
                        break

                    time.sleep(1)
                    page += 1

            print(f"\n✅ Complete! Saved URLs to:\n{output_file}\n", flush=True)
            return len(self.url_list) - already_recorded

        finally:
            # Detach the handle before quitting so a closed browser session is
            # never left behind in self.driver.
            driver, self.driver = self.driver, None
            if driver:
                driver.quit()

In [26]:
# Bounds of the crawl; how each bound is interpreted (inclusive or exclusive)
# is decided by the loop that consumes them.
year_start, year_end = 1979, 2025
week_start, week_end = 1, 52

# A single downloader instance, initially pointed at the first year/week.
downloader = FastEPODownloader(week=week_start, year=year_start)

In [27]:
# Walk every (year, week) combination and collect that week's ZIP URLs.
for year in range(year_start, year_end):  # year_end is exclusive: 2025 not included, so 2024 is last
    # week_end is inclusive, hence the + 1 (a bare range() would stop at week 51).
    # NOTE(review): some calendar years have a week 53 - confirm whether the
    # publication server ever uses it before widening week_end.
    for week in range(week_start, week_end + 1):  # Weeks 1 to 52
        print(f"\n📅 Running for Year: {year}, Week: {week:02d}")
        downloader.setYearWeek(year, week)
        try:
            downloader.run()
        except Exception as e:
            # Best effort: report the failing week and carry on with the next one.
            print(f"❌ Error in Year {year}, Week {week}: {e}")



📅 Running for Year: 1979, Week: 01
🚀 Ultra-Fast EPO ZIP Downloader Starting...
Start Searching for Year: 1979, Week: 1 ...
🔍 Searching...

✅ Complete! Saved URLs to:
/content/drive/MyDrive/Epo_patent/1979_1.txt


📅 Running for Year: 1979, Week: 02
🚀 Ultra-Fast EPO ZIP Downloader Starting...
Start Searching for Year: 1979, Week: 2 ...
🔍 Searching...
Page 1: 20 files
Page 2: 20 files
Page 3: 20 files
Page 4: 20 files
Page 5: 20 files
Page 6: 20 files
Page 7: 20 files
Page 8: 20 files
Page 9: 20 files
Page 10: 20 files
Page 11: 12 files

✅ Complete! Saved URLs to:
/content/drive/MyDrive/Epo_patent/1979_2.txt


📅 Running for Year: 1979, Week: 03
🚀 Ultra-Fast EPO ZIP Downloader Starting...
Start Searching for Year: 1979, Week: 3 ...
🔍 Searching...

✅ Complete! Saved URLs to:
/content/drive/MyDrive/Epo_patent/1979_3.txt


📅 Running for Year: 1979, Week: 04
🚀 Ultra-Fast EPO ZIP Downloader Starting...
Start Searching for Year: 1979, Week: 4 ...
🔍 Searching...
Page 1: 20 files
Page 2: 20 files



KeyboardInterrupt: 