#!/usr/bin/python2
import HTMLParser
import subprocess
import tempfile
import os
import sys
import re
# Matches a gist URL and captures the gist id (the path segment after the
# host).  The dots are escaped: the original pattern used bare '.', so an
# unrelated host such as "gistXgithubYcom" would also match.
GIST_ID = re.compile(r'^.*/gist\.github\.com/([a-z0-9]+)[^a-z0-9]*.*$')


def get_gist_id(gist_url):
    """Return the gist id extracted from *gist_url*.

    Returns None when the URL does not look like a gist.github.com URL.
    """
    m = GIST_ID.match(gist_url)
    if m:
        return m.group(1)
    return None
def to_starttag(tag, attrs):
    """Render (tag, attrs) back into an HTML start tag string.

    *attrs* is a list of (name, value) pairs as produced by HTMLParser.
    Fixes over the original: a tag with no attributes no longer gets a
    trailing space ('<br >'), and a valueless attribute (value is None,
    e.g. <option selected>) is emitted as the bare name instead of
    name="None".
    """
    parts = [tag]
    for name, value in attrs:
        if value is None:
            parts.append(name)  # boolean/valueless attribute
        else:
            parts.append('%s="%s"' % (name, value))
    return '<%s>' % ' '.join(parts)
def to_endtag(tag):
    """Render the HTML end tag for *tag*."""
    return '</' + tag + '>'
class GistScraper(HTMLParser.HTMLParser):
    """HTML parser that keeps only the markup found inside the first
    <article> element, or inside any element whose class attribute
    contains the substring "data", and discards the rest of the page.
    """

    def __init__(self, *args):
        HTMLParser.HTMLParser.__init__(self, *args)
        # Tag name that triggered collection; None while not collecting.
        self.collect = None
        # Markup fragments gathered so far.
        self.data = []

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if not self.collect:
            css_class = dict(attrs).get('class', '')
            if tag == 'article' or css_class.find('data') >= 0:
                self.collect = tag  # start collecting at this element
        if self.collect:
            self.data.append(to_starttag(tag, attrs))

    def handle_endtag(self, tag):
        if not self.collect:
            return
        tag = tag.lower()
        self.data.append(to_endtag(tag))
        if tag == self.collect:
            # Closing tag matches the one that started collection: stop.
            self.collect = None

    def handle_data(self, data):
        if self.collect:
            self.data.append(data)

    def scrapped_data(self):
        """Return all collected markup joined into one string."""
        return ''.join(self.data)
class Gistbl(object):
    """Mirrors a gist: keeps a local git clone of the gist repository and
    writes a scraped HTML rendering of its gist.github.com page.
    """

    def __init__(self, repo_base, htdocs, gitcmd='git', curlcmd='curl'):
        # repo_base: directory that holds the local gist clones.
        # htdocs:    directory the scraped HTML pages are written to.
        # gitcmd / curlcmd: external commands used for clone/pull and download.
        self.repo_base = repo_base
        self.htdocs = htdocs
        self.gitcmd = gitcmd
        self.curlcmd = curlcmd

    def clone_or_merge_repo(self, repo_id):
        """Clone the gist if it is not present locally, otherwise pull.

        Returns git's exit status (0 on success).
        """
        repo_dir = self.repo_dir(repo_id)
        if os.path.exists(repo_dir):
            return self.merge_repo(repo_dir)
        return self.clone_repo(repo_id)

    def repo_dir(self, repo_id):
        """Local checkout directory for *repo_id*."""
        return os.path.join(self.repo_base, repo_id)

    def htdoc(self, repo_id):
        """Target path of the scraped HTML page for *repo_id*."""
        return os.path.join(self.htdocs, repo_id + '.html')

    def repo_url(self, repo_id):
        """Git clone URL of the gist."""
        return 'https://gist.github.com/%s.git' % repo_id

    def page_url(self, repo_id):
        """Web page URL of the gist."""
        return 'https://gist.github.com/%s/' % repo_id

    def merge_repo(self, repo_dir):
        """Run `git pull origin` inside *repo_dir*; return its exit status."""
        p = subprocess.Popen([self.gitcmd, 'pull', 'origin'], cwd=repo_dir)
        p.communicate()
        return p.returncode

    def clone_repo(self, repo_id):
        """Clone the gist into repo_base; return git's exit status."""
        repo_url = self.repo_url(repo_id)
        p = subprocess.Popen([self.gitcmd, 'clone', repo_url],
                             cwd=self.repo_base)
        p.communicate()
        return p.returncode

    def download(self, page_url):
        """Download *page_url* with curl into a fresh temporary directory.

        Returns the path of the downloaded file; the caller is responsible
        for removing the file and its directory.
        """
        target_dir = tempfile.mkdtemp(prefix='gistbl-')
        target_path = os.path.join(target_dir, 'index.html')
        p = subprocess.Popen([self.curlcmd, '-s', '-o', target_path, page_url])
        p.communicate()
        return target_path

    def scrape(self, repo_id):
        """Download the gist page, extract the interesting markup and write
        it to htdocs/<repo_id>.html.

        Uses context managers for both files and removes the temporary
        download afterwards — the original leaked one temp dir per call.
        """
        page_url = self.page_url(repo_id)
        file_path = self.download(page_url)
        print('opening: %s' % file_path)
        try:
            with open(file_path, 'r') as src_file:
                src_doc = src_file.read()
        finally:
            # Clean up the temp dir created by download().
            if os.path.exists(file_path):
                os.remove(file_path)
            os.rmdir(os.path.dirname(file_path))
        scraper = GistScraper()
        scraper.feed(src_doc)
        htdoc = self.htdoc(repo_id)
        print('writing: %s' % htdoc)
        with open(htdoc, 'w') as target_file:
            target_file.write(scraper.scrapped_data())
def main(argv):
    """Command-line entry point: gistbl gist-url [repo-base] [htdocs].

    Clones or updates the gist's repository, then scrapes its web page
    into htdocs.  Exits with status 1 on a usage error, an unparsable
    gist URL, or a git failure (the original ignored git's exit status
    and scraped a possibly missing/stale checkout).
    """
    if len(argv) <= 1:
        print("Usage: %s gist-url [repo-base] [htdocs]" % argv[0])
        sys.exit(1)
    gist_url = argv[1]
    repo_base = argv[2] if len(argv) > 2 else os.path.abspath('.')
    htdocs = argv[3] if len(argv) > 3 else os.path.abspath('.')
    repo_id = get_gist_id(gist_url)
    if repo_id is None:
        print("invalid gist-url. cannot get gist id from the url")
        sys.exit(1)
    gist = Gistbl(repo_base, htdocs)
    if gist.clone_or_merge_repo(repo_id) != 0:
        print("git failed for gist %s" % repo_id)
        sys.exit(1)
    gist.scrape(repo_id)


if __name__ == '__main__':
    main(sys.argv)