Closed
Labels: support (Support questions)
Description
This is pure Playwright code, and it logs in to ti.com without problems:
import asyncio

from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async


async def playwright_ti_jiaocha():
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            executable_path="/usr/bin/google-chrome-stable",
            chromium_sandbox=False,
        )
        context = await browser.new_context(
            locale="zh-CN",
            ignore_https_errors=True,
        )
        page = await context.new_page()
        await stealth_async(
            page,
            StealthConfig(
                chrome_load_times=False,
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
            ),
        )
        await page.add_init_script(
            """
            Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
                get: () => ['zh-Hans', 'zh']
            })
            """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        account_info = {
            "username": "hua@sz-guangyidz.cn",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        # Fill input[name="username"]
        await page.fill('input[name="username"]', username)
        # Click "Next"
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        # Click input[name="password"]
        await page.click('input[name="password"]')
        # Fill input[name="password"]
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        while True:
            await asyncio.sleep(10)

But with Scrapy it cannot log in. It opens the login page and fills in the correct account info, but after clicking Log in it is sent back to the login page again, even when using Chrome or Firefox.
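To narrow down where it breaks, a minimal diagnostic sketch is to dump the context cookies right before and right after the Log in click and compare the two runs. The `dump_cookies` helper below is only for illustration; `page` is assumed to be the `playwright_page` taken from `response.meta`, as in the spider code that follows:

import json

async def dump_cookies(page, label):
    # Print the names of the cookies currently held by the page's browser context.
    cookies = await page.context.cookies()
    names = sorted(c["name"] for c in cookies)
    print(f"{label}: {len(cookies)} cookies -> {json.dumps(names)}")

# Usage inside check_login, around the Log in click:
#     await dump_cookies(page, "before login click")
#     async with page.expect_navigation():
#         await page.locator("ti-button:has-text('Log in')").click()
#     await dump_cookies(page, "after login click")

If the session cookies appear after the click in the pure Playwright run but never in the scrapy-playwright one, the login request itself is being rejected; if they appear in both, the session is being lost between requests instead.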
Here is the Scrapy code:
import asyncio
import logging
from urllib.parse import urlunparse
from urllib.request import _parse_proxy

import scrapy
from playwright.async_api import Page
from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)


class TiJiaoCha(scrapy.Spider):
    name = "ti_jiaocha_spyder"
    allowed_domains = [
        "ti.com",
        "www.ti.com",
        "login.ti.com",
        "*.ti.com",
        "www.tij.co.jp",
        "www.ti.com.cn",
    ]
    headers = {}
    proxy: str = None

    def start_requests(self):
        self.headers = {
            "User-Agent": None,
            "Connection": "close",
        }
        account_info = {
            "username": "hua@sz-guangyidz.cn",
            "password": "Aa2238117",
        }
        logger.info("proxy: %s", self.proxy)
        url = "https://www.baidu.com/"
        proxy_url = proxy_type = user = password = hostport = None
        if self.proxy:
            proxy_type, user, password, hostport = _parse_proxy(self.proxy)
            proxy_url = urlunparse((proxy_type or "", hostport, "", "", "", ""))
        yield scrapy.Request(
            url=url,
            dont_filter=True,
            callback=self.check_login,
            headers=self.headers,
            meta={
                "account_info": account_info,
                "playwright": True,
                # "playwright_context": f"{account_info['username']}context",
                "playwright_page_goto_kwargs": {
                    # "wait_until": "domcontentloaded",
                },
                "playwright_include_page": True,
                "playwright_context_kwargs": {
                    # "java_script_enabled": False,
                    # "headless": False,
                    "locale": "zh-CN",
                    "ignore_https_errors": True,
                    # "user_data_dir": f"data/{account_info['username']}",
                    "proxy": {
                        "server": proxy_url,
                        "username": user or "",
                        "password": password or "",
                    }
                    if self.proxy
                    else {
                        "server": "localhost:8080",
                        "username": "",
                        "password": "",
                    },
                },
            },
            # meta={
            #     "proxy": self.proxy if self.proxy else None,
            # },
        )
        # yield PlaywrightRequest(
        #     url="https://www.mouser.com/",
        #     headers=self.headers,
        #     meta={
        #         "proxy": self.proxy if self.proxy else None,
        #     },
        #     dont_filter=True,
        #     callback=self.start_search,
        # )

    async def check_login(self, response: HtmlResponse):
        page: Page = response.meta["playwright_page"]
        # await stealth_async(page)
        await stealth_async(
            page,
            StealthConfig(
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
                chrome_load_times=False,
            ),
        )
        await page.add_init_script(
            """
            Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
                get: () => ['zh-Hans', 'zh']
            })
            """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        account_info = {
            "username": "hua@sz-guangyidz.cn",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        # Fill input[name="username"]
        await page.fill('input[name="username"]', username)
        # Click "Next"
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        # Click input[name="password"]
        await page.click('input[name="password"]')
        # Fill input[name="password"]
        await page.fill('input[name="password"]', password)
        # Click text=a365039311@gmail.com Change Password eye eye-off Caps Lock is on Forgot password? Log in >> button
        # with page.expect_navigation(url="https://www.ti.com.cn/product/cn/TPS7H4002-SP?keyMatch=TPS7H4002-SP&tisearch=search-everything&usecase=GPN&login-check=true"):
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        while True:
            await asyncio.sleep(10)
import scrapy
from billiard import Pool, Process
from scrapy import signals
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
from twisted.internet import asyncioreactor


class UrlCrawlerScript(Process):
    def __init__(self, spider, accounts=None, proxy_provider="net_not"):
        Process.__init__(self)
        settings = get_project_settings()
        settings["TELNETCONSOLE_ENABLED"] = False
        # settings["DOWNLOADER_MIDDLEWARES"] = {
        #     # utils_tasks.ScraperResponseMiddleware: 1000,
        #     # "scrapy_deltafetch.DeltaFetch": 100,
        # }
        settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = 90000
        settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] = None
        settings["DOWNLOAD_HANDLERS"] = {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
        settings["CONCURRENT_REQUESTS"] = 40
        # settings["PLAYWRIGHT_MAX_CONTEXTS"] = 10
        # settings["PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"] = 1
        # settings["CONCURRENT_ITEMS"] = 1
        # settings["REACTOR_THREADPOOL_MAXSIZE"] = 1
        settings["ROBOTSTXT_OBEY"] = False
        settings["PLAYWRIGHT_BROWSER_TYPE"] = "firefox"
        settings["PLAYWRIGHT_LAUNCH_OPTIONS"] = {
            "headless": False,
            # "channel": "chrome",
            # "executable_path": "/usr/bin/google-chrome-stable",
            # "chromium_sandbox": False,
            # "devtools": True,
        }
        settings["DELTAFETCH_ENABLED"] = False
        block_list = [
            "https://www.ti.com/akam",
            "https://www.ti.com.cn/akam",
            "https://cm.g.doubleclick.net",
            "https://try.abtasty.com",
            "https://connect.facebook.net",
            "https://www.googletagmanager.com",
            "https://img.en25.com",
            "https://collect.tealiumiq.com",
            "https://cdn.decibelinsight.net",
            "https://s.adroll.com",
            "https://analytics.supplyframe.com",
            "https://www.tij.co.jp/akam",
            "https://visitor-service.tealiumiq.com",
            "https://try.abtasty.com",
            "https://metrics.brightcove.com",
            "https://t.supplyframe.com",
            # "https://www.gstatic.cn/recaptcha",
            "https://maps.googleapis.com/maps/api/mapsjs",
            "https://js-agent.newrelic.com",
        ]

        def should_abort_request(req):
            # for block_url in block_list:
            #     if req.url.startswith(block_url):
            #         logger.info(f"Aborting request: {req.url}")
            #         return True
            # NOTE: this early return disables the resource-type checks below
            return False
            if req.resource_type == "image" or req.resource_type == "font":
                logger.info(f"Aborting {req.resource_type} request: {req.url}")
                return True

        settings["PLAYWRIGHT_ABORT_REQUEST"] = should_abort_request
        settings[
            "TWISTED_REACTOR"
        ] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        self.accounts = accounts
        self.proxy_provider = proxy_provider
        self.crawler = CrawlerRunner(settings)
        self.spider = spider

    def run(self):
        scrapy.utils.reactor.install_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        )
        is_asyncio_reactor_installed = (
            scrapy.utils.reactor.is_asyncio_reactor_installed()
        )
        print(f"Is asyncio reactor installed: {is_asyncio_reactor_installed}")
        from twisted.internet import reactor

        params = {}
        self.crawler.crawl(self.spider, params=params)
        # self.crawler.start()
        # self.crawler.join()
        # Thread(target=self.crawler.start).start()
        # reactor.run()
        # tp = reactor.getThreadPool()
        # tp.adjustPoolsize(maxthreads=10)
        # reactor.addSystemEventTrigger("before", "shutdown", self.crawler.stop)
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # the script will block here until all crawling jobs are finished


crawler = UrlCrawlerScript(
    TiJiaoCha, accounts=accounts, proxy_provider=proxy_provider
)
crawler.start()
crawler.join()