# plurascrape_post.py
from plura_dl.scrapeutils import (
store_dict_as_json,
lookaround_tags
)
import json, os, re, requests, codecs, time
# Absolute path of the folder that contains this script.
SCRIPTPATH = os.path.dirname(os.path.abspath(__file__))

# Course listing used as input, and the description-enriched copy that this
# script reads back and rewrites; both sit in the "data" folder next to the
# script.
json_in, JSON_OUTPUT = (
    os.path.join(SCRIPTPATH, "data", file_name)
    for file_name in ("courses.json", "courses_d.json")
)
JSON_FILE = os.path.abspath(json_in)
def grab_and_update_course_data():
    """Merge the current course listing into the previously stored one.

    Reads ``JSON_FILE`` (course id -> course entry) and ``JSON_OUTPUT`` (the
    mapping stored by an earlier run, whose entries may carry a
    ``"description"``). Each course from ``JSON_FILE`` replaces its stored
    entry while keeping the description already stored for that same course.
    A course without a stored description gets ``""``.

    Stored courses that are absent from ``JSON_FILE`` are left untouched.

    Returns:
        tuple: ``(course_ids, json_output)`` - the ids found in ``JSON_FILE``
        in file order, and the merged mapping.
    """
    with open(JSON_FILE, 'rt') as f:
        json_load = json.load(f)

    try:
        with open(JSON_OUTPUT, 'rt') as f:
            json_output = json.load(f)
    except FileNotFoundError:
        # Nothing stored yet (first run): start from an empty mapping instead
        # of crashing before any course has been processed.
        json_output = {}

    course_ids = list(json_load)
    for course in course_ids:
        # Resolve the description per course. The old bare ``except: pass``
        # left the variable holding the previous course's text (or unbound on
        # the first course) whenever the stored entry was missing.
        previous = json_output.get(course)
        if isinstance(previous, dict):
            description = previous.get("description", "")
        else:
            description = ""
        json_output[course] = json_load[course]
        json_output[course]["description"] = description
    return course_ids, json_output
def filter_descriptions(course_ids, json_load):
    """Return the ids whose entry in ``json_load`` has an empty description.

    Only an exact empty string counts as missing; the order of
    ``course_ids`` is preserved in the result.
    """
    return [
        course_id
        for course_id in course_ids
        if json_load[course_id]["description"] == ""
    ]
def main():
    """Scrape the missing course descriptions and store the merged data.

    Loads the merged course mapping, keeps only the courses whose description
    is still empty, downloads each course page and takes the first line after
    the ``<h6>Description</h6>`` heading that holds a complete ``<p>...</p>``
    pair as the description. Progress is printed per course, and the updated
    mapping is written to ``JSON_OUTPUT`` at the end.

    A course whose page cannot be fetched is reported and skipped, so one bad
    URL does not discard the descriptions already collected in this run.
    """
    course_ids, json_load = grab_and_update_course_data()
    course_ids = filter_descriptions(course_ids, json_load)

    start = time.time()
    count_desciptions = 0
    for i, course_id in enumerate(course_ids, start=1):
        url = json_load[course_id]["url"]
        try:
            # Without a timeout a single stalled server blocks the whole run.
            r = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print("Iteration:", i, "failed for", url, "-", err)
            continue

        scan_next = False
        for line in r.text.split('\n'):
            if scan_next:
                if re.search(r'<p>', line) and re.search(r'</p>', line):
                    match = lookaround_tags(r'<p>', r'</p>').search(line)
                    if match is None:
                        # Both tags are present but not as a matching pair;
                        # keep scanning instead of failing on ``.group()``.
                        continue
                    json_load[course_id]["description"] = match.group()
                    count_desciptions += 1
                    break
            # Checked after the paragraph test so the heading line itself is
            # never taken as the description.
            if re.search(r'<h6>Description</h6>', line):
                scan_next = True
        print("Iteration:", i, "performed in:", time.time() - start, "seconds.")

    print("Sucessfully loaded desciptions:", count_desciptions)
    print("")
    store_dict_as_json(json_load, JSON_OUTPUT)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()