/
scraper.py
50 lines (33 loc) · 1.38 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from playwright.sync_api import Playwright, sync_playwright, Page
def write_to_file(filename: str, data: str) -> None:
    """Write *data* to *filename*, overwriting any existing file.

    Used to save the raw HTML of a scraped page (e.g. ``tom_cruise.html``).
    UTF-8 is forced explicitly: Wikipedia serves UTF-8 HTML, and relying on
    the platform default encoding breaks on Windows (cp1252).
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)
def run(playwright: Playwright) -> None:
    """Scrape each celebrity's Wikipedia article and save its raw HTML.

    For every name in the hard-coded list: open a fresh page, search on
    wikipedia.org, follow the typeahead suggestion, and write the resulting
    page HTML to a file named like ``tom_cruise.html``.

    Fix: the original leaked the page, context, and browser if any
    navigation or click raised mid-loop; cleanup now runs in ``finally``.
    """
    celebrity_names = ["Tom Cruise", "Johnny Depp", "Tom Holland", "Scarlett Johansson"]
    # headless=False keeps the browser window visible for debugging the flow.
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    try:
        for celebrity in celebrity_names:
            print(f"Processing {celebrity}")
            # One fresh page per celebrity so state never bleeds between runs.
            page = context.new_page()
            try:
                page.goto("https://www.wikipedia.org/")
                page.click('input[name="search"]')
                page.fill('input[name="search"]', celebrity)
                # Click the second <div> inside the typeahead suggestion link;
                # Playwright auto-waits for the navigation this triggers.
                page.click("#typeahead-suggestions a >> :nth-match(div, 2)")
                # File names like tom_cruise.html (lowercase, spaces -> underscores).
                filename = "_".join(celebrity.lower().split()) + ".html"
                write_to_file(filename, page.content())
            finally:
                page.close()
            print(f"Processing completed for {celebrity}")
    finally:
        # Always release browser resources, even on a mid-loop failure.
        context.close()
        browser.close()
# Guard the entry point so importing this module does not launch a browser.
if __name__ == "__main__":
    with sync_playwright() as playwright:
        run(playwright)