list_links.py
#!/usr/bin/env python3.6
"""
List all links on all pages of a GeeksforGeeks tag.

By default, a JSON file is generated, which can be edited by hand.

Usage: list_links [options] <URL>
"""
import os
import sys
import json
from collections import OrderedDict

import requests
import pyquery
from bs4 import BeautifulSoup

# Directory where the generated JSON files are written.
ROOT_JSON = "JSON/"

# Require the tag URL on the command line; print the usage string otherwise.
if len(sys.argv) < 2:
    print(__doc__.strip(), file=sys.stderr)
    sys.exit(1)

URL = sys.argv[1].rstrip('/')
TAG = URL.split('/')[-1].title()
FNAME = ROOT_JSON + f"{TAG}.json"


def abort(msg):
    print(msg, file=sys.stderr)
    sys.exit(1)


def print_titles(content):
    # Debug helper: print the bold headings in a post body (not used by the __main__ flow).
    for title in content.find_all('strong'):
        print(title.text.strip())


def fetch_post_links(urls, filename=None, combined=False):
    """Collect post titles and links from the given listing pages.

    With combined=True all links go into a single flat mapping; otherwise
    they are grouped by listing page.
    """
    if isinstance(urls, str):
        urls = [urls]
    links = OrderedDict()
    for url in urls:
        soup = BeautifulSoup(requests.get(url).text, "lxml")
        content = soup.find('div', id="content")
        topic = OrderedDict()
        # Each post on a listing page is an <h2 class="entry-title"> wrapping a link.
        for ques in content.find_all("h2", class_="entry-title"):
            link = ques.find("a")
            print(link['href'].strip())
            topic[link.text.strip()] = link['href'].strip()
        if combined:
            links.update(topic)
        else:
            topic_name = url.split('/')[-2].title()
            links[topic_name] = topic
    if not filename:
        print(json.dumps(links, indent=4))
    else:
        with open(filename, "w") as out:
            json.dump(links, out, indent=4)
        print()
        print("Links written to:", filename)


def unique_links(filename):
    # Drop entries whose link already appears under another title.
    with open(filename) as inp:
        data = json.load(inp, object_pairs_hook=OrderedDict)
    uniq = OrderedDict()
    for title, link in data.items():
        if link not in uniq.values():
            uniq[title] = link
    with open(filename, "w") as out:
        json.dump(uniq, out, indent=4)


def list_pages(root_url):
    """Return the URLs of every paginated listing page under root_url."""
    links = []
    num_pages = 0
    try:
        pq = pyquery.PyQuery(url=root_url)
    except:  # noqa
        abort("URL doesn't exist.")
    # The last word of the ".pages" element text is the total page count.
    if pq('.pages'):
        num_pages = int(pq('.pages')[0].text.split()[-1])
    if not num_pages:
        num_pages = 1
        print("Only 1 page!")
    for page in range(1, num_pages + 1):
        links.append(root_url + f"/page/{page}/")
    return links


if __name__ == '__main__':
    if os.path.isfile(FNAME):
        unique_links(FNAME)
        abort("JSON file already exists: " + FNAME)
    else:
        page_links = list_pages(URL)
        fetch_post_links(page_links, FNAME, combined=True)
        unique_links(FNAME)