
Can't log in to ti.com using scrapy-playwright #119

@yswtrue

Description

This is pure Playwright code, and it logs in to ti.com without problems.

import asyncio

from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async

async def playwright_ti_jiaocha():
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            executable_path="/usr/bin/google-chrome-stable",
            chromium_sandbox=False,
        )
        context = await browser.new_context(
            locale="zh-CN",
            ignore_https_errors=True,
        )
        page = await context.new_page()
        await stealth_async(
            page,
            StealthConfig(
                chrome_load_times=False,
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
            ),
        )
        await page.add_init_script(
            """
            // Override navigator.languages. The original snippet referenced an
            // undefined `opts` object, which would throw inside the getter, so
            // the configured list is returned directly here.
            Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
                get: () => ['zh-Hans', 'zh'],
            });
            """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        account_info = {
            "username": "hua@sz-guangyidz.cn",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        # Fill input[name="username"]
        await page.fill('input[name="username"]', username)
        # Click "Next" to go to the password step
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        # Click input[name="password"]
        await page.click('input[name="password"]')
        # Fill input[name="password"]
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        while True:
            await asyncio.sleep(10)
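
For reference, the coroutine above can be driven with a minimal asyncio entry point (a sketch; the function name comes from the listing above):

import asyncio

if __name__ == "__main__":
    asyncio.run(playwright_ti_jiaocha())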

But with Scrapy it can't log in. It opens the login page and fills in the correct account info, but after the Log in click it returns to the login page again, with both Chromium and Firefox. This is the code:

import asyncio
import logging
from urllib.parse import urlunparse
from urllib.request import _parse_proxy

import scrapy
from playwright.async_api import Page
from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)

class TiJiaoCha(scrapy.Spider):
    name = "ti_jiaocha_spyder"
    allowed_domains = [
        "ti.com",  # also matches subdomains such as www.ti.com and login.ti.com
        "www.ti.com",
        "login.ti.com",
        "*.ti.com",  # note: wildcards are not supported by Scrapy's offsite filter
        "www.tij.co.jp",
        "www.ti.com.cn",
    ]
    headers = {}
    proxy: str = None

    def start_requests(self):
        self.headers = {
            "User-Agent": None,
            "Connection": "close",
        }
        logger.info("proxy: %s", self.proxy)
        url = "https://www.baidu.com/"
        account_info = {
            "username": "hua@sz-guangyidz.cn",
            "password": "Aa2238117",
        }
        proxy_url = proxy_type = user = password = hostport = None
        if self.proxy:
            proxy_type, user, password, hostport = _parse_proxy(self.proxy)
            proxy_url = urlunparse((proxy_type or "", hostport, "", "", "", ""))
        yield scrapy.Request(
            url=url,
            dont_filter=True,
            callback=self.check_login,
            headers=self.headers,
            meta={
                "account_info": account_info,
                "playwright": True,
                # "playwright_context": f"{account_info['username']}context",
                "playwright_page_goto_kwargs": {
                    # "wait_until": "domcontentloaded",
                },
                "playwright_include_page": True,
                "playwright_context_kwargs": {
                    # "java_script_enabled": False,
                    # "headless": False,
                    "locale": "zh-CN",
                    "ignore_https_errors": True,
                    # "user_data_dir": f"data/{account_info['username']}",
                    "proxy": {
                        "server": proxy_url,
                        "username": user or "",
                        "password": password or "",
                    }
                    if self.proxy
                    else {
                        "server": "localhost:8080",
                        "username": "",
                        "password": "",
                    },
                },
            },
            # meta={
            #     "proxy": self.proxy if self.proxy else None,
            # },
        )

        # yield PlaywrightRequest(
        #     url="https://www.mouser.com/",
        #     headers=self.headers,
        #     meta={
        #         "proxy": self.proxy if self.proxy else None,
        #     },
        #     dont_filter=True,
        #     callback=self.start_search,
        # )
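
    # Note (assumption about scrapy-playwright behavior, worth verifying):
    # the "playwright_context_kwargs" meta is only applied when the named
    # context is first created. Without a "playwright_context" key the request
    # runs in the "default" context, so if that context already exists these
    # kwargs (locale, ignore_https_errors, proxy) are silently ignored, and
    # the session can differ from the pure-Playwright run above.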

    async def check_login(self, response: HtmlResponse):
        page: Page = response.meta["playwright_page"]
        # await stealth_async(page)
        await stealth_async(
            page,
            StealthConfig(
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
                chrome_load_times=False,
            ),
        )
        await page.add_init_script(
            """
            // Same navigator.languages override as in the pure-Playwright
            // version; the undefined `opts` reference is replaced by the list.
            Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
                get: () => ['zh-Hans', 'zh'],
            });
            """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        account_info = {
            "username": "hua@sz-guangyidz.cn",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        # Fill input[name="username"]
        await page.fill('input[name="username"]', username)
        # Click "Next" to go to the password step
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        # Click input[name="password"]
        await page.click('input[name="password"]')
        # Fill input[name="password"]
        await page.fill('input[name="password"]', password)
        # Recorded step, translated from the Chinese UI: click the Log in button on the password form
        # with page.expect_navigation(url="https://www.ti.com.cn/product/cn/TPS7H4002-SP?keyMatch=TPS7H4002-SP&tisearch=search-everything&usecase=GPN&login-check=true"):
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        while True:
            await asyncio.sleep(10)
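
Returning to the login page usually means the cookies set by the login request were not kept. A small diagnostic (a sketch; `page` and `logger` are the names used in check_login above) is to dump the context cookies right after the login click in both setups and compare them:

        # Hypothetical diagnostic: run this after the Log in click in both the
        # pure-Playwright and the scrapy-playwright versions, then diff the output.
        cookies = await page.context.cookies()
        for cookie in cookies:
            logger.info(
                "cookie after login: %s (domain=%s)", cookie["name"], cookie["domain"]
            )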


import scrapy
from billiard import Process
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor, is_asyncio_reactor_installed


class UrlCrawlerScript(Process):
    def __init__(self, spider, accounts=None, proxy_provider="net_not"):
        Process.__init__(self)
        settings = get_project_settings()
        settings["TELNETCONSOLE_ENABLED"] = False
        # settings["DOWNLOADER_MIDDLEWARES"] = {
        #     # utils_tasks.ScraperResponseMiddleware: 1000,
        #     # "scrapy_deltafetch.DeltaFetch": 100,
        # }
        settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = 90000
        settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] = None
        settings["DOWNLOAD_HANDLERS"] = {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
        settings["CONCURRENT_REQUESTS"] = 40
        # settings["PLAYWRIGHT_MAX_CONTEXTS"] = 10
        # settings["PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"] = 1
        # settings["CONCURRENT_ITEMS"] = 1
        # settings["REACTOR_THREADPOOL_MAXSIZE"] = 1
        settings["ROBOTSTXT_OBEY"] = False
        settings["PLAYWRIGHT_BROWSER_TYPE"] = "firefox"
        settings["PLAYWRIGHT_LAUNCH_OPTIONS"] = {
            "headless": False,
            # "channel": "chrome",
            # "executable_path": "/usr/bin/google-chrome-stable",
            # "chromium_sandbox": False,
            # "devtools": True,
        }
        settings["DELTAFETCH_ENABLED"] = False
        block_list = [
            "https://www.ti.com/akam",
            "https://www.ti.com.cn/akam",
            "https://cm.g.doubleclick.net",
            "https://try.abtasty.com",
            "https://connect.facebook.net",
            "https://www.googletagmanager.com",
            "https://img.en25.com",
            "https://collect.tealiumiq.com",
            "https://cdn.decibelinsight.net",
            "https://s.adroll.com",
            "https://analytics.supplyframe.com",
            "https://www.tij.co.jp/akam",
            "https://visitor-service.tealiumiq.com",
            "https://try.abtasty.com",
            "https://metrics.brightcove.com",
            "https://t.supplyframe.com",
            # "https://www.gstatic.cn/recaptcha",
            "https://maps.googleapis.com/maps/api/mapsjs",
            "https://js-agent.newrelic.com",
        ]

        def should_abort_request(req):
            # Blocking is disabled for now; the checks below are kept for
            # reference (they were unreachable after the early return).
            return False
            # for block_url in block_list:
            #     if req.url.startswith(block_url):
            #         logger.info(f"Aborting request: {req.url}")
            #         return True
            # if req.resource_type in ("image", "font"):
            #     logger.info(f"Aborting {req.resource_type} request: {req.url}")
            #     return True
        settings["PLAYWRIGHT_ABORT_REQUEST"] = should_abort_request

        settings["TWISTED_REACTOR"] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

        self.accounts = accounts
        self.proxy_provider = proxy_provider
        self.crawler = CrawlerRunner(settings)
        self.spider = spider

    def run(self):
        install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        print(f"Is asyncio reactor installed: {is_asyncio_reactor_installed()}")

        # Import the reactor only after it has been installed.
        from twisted.internet import reactor

        params = {}
        self.crawler.crawl(self.spider, params=params)
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # blocks here until all crawling jobs are finished

# `accounts` and `proxy_provider` come from the surrounding script (not shown here).
crawler = UrlCrawlerScript(
    TiJiaoCha, accounts=accounts, proxy_provider=proxy_provider
)
crawler.start()
crawler.join()
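
A cross-check that sidesteps the manual reactor bookkeeping is CrawlerProcess, which handles reactor installation from the TWISTED_REACTOR setting itself (a sketch, assuming the same settings as in __init__ above):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings["TWISTED_REACTOR"] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

process = CrawlerProcess(settings)
process.crawl(TiJiaoCha)
process.start()  # blocks until the crawl finishes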
