/
scraper.py
86 lines (77 loc) 路 3.1 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
'''
Functions that scrape respective news websites and call selected APIs
'''
import os
from sys import executable
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
def configureSelenium():
    """Build a headless Chrome webdriver configured for a Heroku-style dyno.

    Reads GOOGLE_CHROME_BIN (Chrome binary path) and CHROMEDRIVER_PATH
    from the environment.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``driver.quit()`` when finished.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    # Flags required to run Chrome inside a container/dyno sandbox.
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    # `options=` replaces the deprecated `chrome_options=` keyword
    # (the alias was removed in Selenium 4; `options=` works from 3.8+).
    driver = webdriver.Chrome(
        executable_path=os.environ.get("CHROMEDRIVER_PATH"),
        options=chrome_options,
    )
    return driver
def _featuredStories(soup, category):
    """Map story title -> href for 'Featured Stories' anchors of *category*.

    Later duplicates of a title overwrite earlier ones, matching the
    original per-loop dict-assignment behavior.
    """
    attrs = {
        "data-list-type": "Featured Stories",
        "data-content-category": category,
    }
    return {a["data-content-title"]: a["href"] for a in soup.find_all('a', attrs)}


def scrapeStar():
    """Scrape the front page of The Star Online for featured stories.

    Returns:
        dict with keys 'Featured Stories Nation' and 'Featured Stories
        Asean', each mapping a story title to its URL.
    """
    staronlineURL = 'https://www.thestar.com.my/'
    staronline = requests.get(staronlineURL)
    soup = BeautifulSoup(staronline.content, 'html.parser')
    # The two sections differed only by category string; the shared
    # extraction now lives in _featuredStories.
    return {
        'Featured Stories Nation': _featuredStories(soup, 'News/Nation'),
        'Featured Stories Asean': _featuredStories(soup, 'AseanPlus/Aseanplus News'),
    }
def getArticleStar(url: str) -> str:
    """Fetch a Star Online article page and return its body text.

    Args:
        url: Full URL of the article page.

    Returns:
        The article's paragraph text joined with single spaces, or ''
        when the page has no element with id='story-body' (layout change
        or non-article URL).
    """
    article = requests.get(url)
    soup = BeautifulSoup(article.content, 'html.parser')
    body = soup.find(id='story-body')
    if body is None:
        # Fail soft instead of raising AttributeError on .findChildren.
        return ''
    # find_all('p') replaces findChildren(), which returned *every*
    # descendant tag and therefore repeated text from nested elements.
    return ' '.join(p.text for p in body.find_all('p'))
#page dynamically generated, have to use selenium
def starFootball():
    """Scrape football headlines from The Star's sport page via Selenium.

    Returns:
        dict mapping headline (with any wire-service 'Soccer-' prefix
        stripped) to article URL; {} when scraping fails.
    """
    # If on Heroku (both env vars set), configure Selenium for the dyno.
    if os.environ.get("GOOGLE_CHROME_BIN") and os.environ.get("CHROMEDRIVER_PATH"):
        driver = configureSelenium()
    else:
        driver = webdriver.Chrome()
    URL = 'https://www.thestar.com.my/sport'
    try:
        driver.get(URL)
        print(driver.title)
        footballNews = {}
        stories = driver.find_elements_by_xpath(
            "//a[@data-content-category = 'Sport/Football']")
        for story in stories:
            title = story.get_attribute('data-content-title')
            # Wire-service headlines arrive as 'Soccer-<headline>'.
            if 'Soccer-' in title:
                title = title.split('Soccer-')[1]
            # Keep the first link seen for each distinct headline.
            if title not in footballNews:
                footballNews[title] = story.get_attribute('href')
        return footballNews
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        print('Something Went Wrong')
        return {}
    finally:
        # Always release the browser process (it was leaked on every call).
        driver.quit()
def malaysiakini():
    """Stub scraper for the malaysiakini.com front page (not implemented).

    Records the site URL and the CSS class names identifying each front-page
    section; no request is made yet, so the function returns None.
    """
    URL = 'https://www.malaysiakini.com/'
    # Section label -> CSS class of the element carrying that section.
    classNames = {
        'Top Story': 'jsx-4226912739 title',
        'Top Stories': 'jsx-3163722522',
        'Featured': 'jsx-2425286463 tabPanelTitle',
    }
def mkiniOpinionPieces():
    """Stub scraper for Malaysiakini opinion columns (not implemented).

    Records the listing URL and the tag/class pairs identifying a column's
    title and summary; no request is made yet, so the function returns None.
    """
    URL = 'https://www.malaysiakini.com/en/latest/columns'
    classNames = {
        'Title': {
            'tag': 'h3',
            'class': 'jsx-196449950',
        },
        'Summary': {
            'tag': 'div',
            # Fixed: the original value carried a stray trailing double
            # quote ('jsx-196449950 summary"') that could never match a
            # real CSS class.
            'class': 'jsx-196449950 summary',
        },
    }