#!/usr/bin/env python3
"""fdscrape, an F-droid source code scraper

Usage:
    fdscrape.py [options] DOWNLOAD_PATH
    fdscrape.py (-h | --help)
    fdscrape.py --version

Options:
    --no-extract   Don't extract the source tarball.
    --ratings      Scrape Google Play ratings (currently broken).
    --require-apk  Skip apps whose APK isn't available.
    --require-src  Skip apps whose source tarball isn't available.
    -h --help      Show this screen.
    --version      Display version.

fdscrape is written by Quint Guvernator and licensed under the GPLv3.
"""
VERSION = "0.1.0"
FDROID_BROWSE_URL = "https://f-droid.org/repository/browse/"

from docopt import docopt
from subprocess import check_call
from time import sleep
from urllib.error import URLError, HTTPError
import datetime
import json
import os, sys
import pathlib
import shutil
import urllib.request

from bs4 import BeautifulSoup
def bs(*args, **kwargs):
    return BeautifulSoup(*args, "html.parser", **kwargs)
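# Pinning "html.parser" keeps parsing on the stdlib parser, so results don't
# vary depending on whether an optional parser like lxml happens to be
# installed.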
def getFile(url, filename):
    print("\tDownloading {} to {}".format(filename.name, filename.parent))
    while True:
        try:
            with urllib.request.urlopen(url, timeout=10) as response, filename.open('xb') as outFile:
                shutil.copyfileobj(response, outFile)
            print("\tDone.")
            return
        except URLError:
            sleep(5)
            continue
        except FileExistsError:
            print("\tPath {} already exists, skipping...".format(filename))
            return
        except KeyboardInterrupt:
            print("\tUser killed program, removing partially downloaded file...")
            os.remove(filename.as_posix())
            print("\tDone. Exiting...")
            sys.exit(1)
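# Note: if a download dies partway through (e.g. a timeout after the file was
# already created), the partial file is left behind, so the retry hits
# FileExistsError and skips it rather than re-downloading.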
def getArchive(url, filename):
    getFile(url, filename)
    print("\tExtracting {}".format(filename))
    check_call(["tar", "xf", str(filename), "-C", str(filename.parent)])

    # the tarball is expected to unpack to exactly one top-level directory,
    # which is then renamed to "src"
    theDir = [ str(x) for x in filename.parent.glob("*") if x.is_dir() ]
    if len(theDir) != 1:
        raise OSError("Expected exactly one directory in the extracted archive!")
    theDir = theDir[0]

    check_call(["mv", theDir, str(filename.parent / "src")])
    check_call(["rm", str(filename)])
    print("\tDone.")
def prefixFromLink(s):
    repoPrefix = "https://f-droid.org/repository/browse/?fdid="
    repoSuffix = "&fdpage="
    s = s.replace(repoPrefix, '', 1)

    # get rid of page numbers
    if repoSuffix in s:
        s = s.rsplit(repoSuffix, maxsplit=1)[0]

    return s
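# e.g. prefixFromLink("https://f-droid.org/repository/browse/"
#                     "?fdid=org.example.app&fdpage=2")
# returns "org.example.app" (the package name here is an illustrative example).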
def safeSoup(url):
    '''Fetch a URL and parse it, retrying forever on network errors.'''
    while True:
        try:
            with urllib.request.urlopen(url, timeout=10) as r:
                return bs(r)
        except URLError:
            sleep(5)
def getAppLinks(url):
    '''Gets all the app links on a page. Returns a tuple of the links as a
    list, the app names as a list, the package names as a list, and the next
    page's URL as a string (or None when there are no more pages), e.g.

    (["http://app1.example.org/", "http://app2.example.com/"],
     ["app1", "app2"],
     ["org.mozilla.app1", "com.ableton.app2"],
     "http://next-page.example.com")

    or

    (["http://app1.example.org/", "http://app2.example.com/"],
     ["app1", "app2"],
     ["org.mozilla.app1", "com.ableton.app2"],
     None)
    '''
    soup = safeSoup(url)
    appBlocks = soup(id="appheader")
    appNames = [ b.find('p', recursive=False).find("span").string for b in appBlocks ]
    appLinks = [ b.parent.get('href') for b in appBlocks ]
    appPrefixes = [ prefixFromLink(l) for l in appLinks ]

    nextLink = soup.find('a', text="next>")
    if nextLink is not None:
        nextLink = nextLink.get('href')
        print("Next page: {}".format(nextLink))

    return (appLinks, appNames, appPrefixes, nextLink)
def getLink(url, text):
    '''Return the href of the first anchor on the page with matching text, or
    None if there is no match.'''
    link = safeSoup(url).find('a', text=text)
    if link is None:
        return None
    return link.get("href")
def combineDictionaries(*dictionaries) -> dict:
    '''Combine dictionaries by summing the values of shared keys.'''
    keys = set()
    for d in dictionaries:
        keys = keys.union(d.keys())

    newDict = dict()
    for k in keys:
        values = [ d.get(k, 0) for d in dictionaries ]
        newDict[k] = sum(values)
    return newDict
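# e.g. combineDictionaries({"crash": 2, "lag": 1}, {"crash": 1})
# returns {"crash": 3, "lag": 1}.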
def decodeSi(si: str) -> int:
    '''Decode a string with an SI suffix, like "3.5M", into an integer.'''
    prefixes = ['k', 'm', 'g', 't']
    prefixes = { prefix: 10**(3 * (i + 1)) for i, prefix in enumerate(prefixes) }

    si = si.lower()
    for prefix, multiplier in prefixes.items():
        num, prefix, _ = si.partition(prefix)
        if not prefix:
            continue
        num = num.replace(',', '')
        num = float(num)
        return int(num * multiplier)
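# e.g. decodeSi("3.5M") returns 3500000 and decodeSi("12k") returns 12000.
# Note that the function implicitly returns None when no SI suffix matches.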
def getPlayInfobox(boxTitle: str, soup) -> str:
    '''Pull the text of the named infobox from a Google Play page.'''
    stat = soup.find("div", class_="title", text=lambda x: boxTitle in x)
    stat = stat.parent.find(class_="content")
    # TODO: handle the potential AttributeError here when the box is missing
    return stat.text.strip()
def reviewPhrases(text) -> dict:
    '''Returns a dict counting how often each key phrase occurs in the text.'''
    keys = ["incompatible", "uninstall", "crash", "slow", "lag", "black screen",
            "white screen", "blank screen"]

    # s.count(sub) counts occurrences of sub in s
    return { k.replace(' ', '-'): text.lower().count(k) for k in keys }
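# e.g. reviewPhrases("It crashed twice. Crash on startup.") gives
# {"crash": 2, "slow": 0, ...} with hyphenated keys like "black-screen".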
def getPlayStats(package):
    prefix = "https://play.google.com/store/apps/details?id="
    try:
        with urllib.request.urlopen(prefix + package, timeout=10) as r:
            soup = bs(r)
    except HTTPError:
        return None

    hist = soup.find("div", class_="rating-histogram")

    # if we don't have ratings, tell the calling function to abort
    if hist is None:
        return None

    # pull ratings from the soup
    ratings = hist(class_="rating-bar-container")
    ratings = [ r.find("span", class_="bar-number").text for r in ratings ]
    ratings = [ int(r.replace(',', '')) for r in ratings ]
    ratings.reverse() # one-to-five stars in increasing order

    ratingCount = sum(ratings)

    # if we don't have ratings, tell the calling function to abort
    if ratingCount == 0:
        return None

    # dict maps weight (how many stars) to count (how many people rated at
    # this weight)
    ratingWeights = { i + 1: num for i, num in enumerate(ratings) }

    # calculate the mean from the weighting dictionary above
    theSum = sum( weight * count for weight, count in ratingWeights.items() )
    theMean = theSum / ratingCount

    # make a str:int statistics dictionary
    stats = { "star_{}".format(k): v for k, v in ratingWeights.items() }
    stats["play_star_mean"] = theMean
    stats["play_star_count"] = ratingCount

    # count the number of characters in the description as a control variable
    description = soup.find("div", class_="id-app-orig-desc")
    description = description.text
    stats["play_description_length"] = len(description)

    # this doesn't work because it's injected into the page with jQuery
    '''
    # how many users +1'd this app on Google Play?
    plusOnes = soup.find(text=lambda x: "Recommend this on Google" in x)
    plusOnes = plusOnes.partition(' ')[0] #FIXME
    plusOnes = plusOnes.partition('+')[2]
    stats["play_google_plus"] = plusOnes
    '''

    # what sort of contact information does the developer provide?
    contact = soup.find("div", class_="title", text=lambda x: "Contact Developer" in x)
    contact = contact.parent.find(class_=["content", "contains-text-link"])
    availability = lambda x: "unavailable" if x is None else "available"

    web = contact.find("a", text=lambda x: "Visit Developer's Website" in x)
    stats["play_developer_web"] = availability(web)

    email = contact.find("a", text=lambda x: "Email Developer" in x)
    stats["play_developer_email"] = availability(email)

    privacy = contact.find("a", text=lambda x: "Privacy Policy" in x)
    stats["play_developer_privacy"] = availability(privacy)

    # application binary size
    rawSize = getPlayInfobox("Size", soup)
    if "Varies with device" in rawSize:
        stats["play_size"] = None
    else:
        stats["play_size"] = decodeSi(rawSize)

    # content rating
    stats["play_content_rating"] = getPlayInfobox("Content Rating", soup)

    # days since the last update
    dateFormat = "%B %d, %Y"
    lastUpdated = getPlayInfobox("Updated", soup)
    lastUpdated = datetime.datetime.strptime(lastUpdated, dateFormat).date()
    stats["play_updated"] = (datetime.date.today() - lastUpdated).days

    # category: use the backend link because the frontend text on the Web
    # display isn't specific enough and may change
    catLink = soup.find("span", itemprop="genre").parent
    catLink = catLink.get("href")

    # the category is the last part of the category link
    catLink = catLink.rpartition('/')[2]
    stats["play_category"] = catLink

    # text reviews
    reviews = soup.find("div", class_="reviews")
    reviews = reviews.find("div", class_="all-reviews")
    if reviews is not None:
        reviews = reviews("div", class_="review-body")
        reviews = [ " ".join(review.stripped_strings) for review in reviews ]
        phrases = [ reviewPhrases(review) for review in reviews ]
        phrases = combineDictionaries(*phrases)
        wordCount = sum( len(review.split(" ")) for review in reviews )
        stats["review_words"] = wordCount
        for phrase, count in phrases.items():
            stats["review_frequency_" + phrase] = count / wordCount
    else:
        print("No reviews!")

    return stats
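# The selectors above (rating-histogram, id-app-orig-desc, etc.) target the
# Google Play markup as it existed when this was written; the usage text flags
# --ratings as currently broken, presumably because that markup has changed.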
def getAllApps(downloadPath, url=FDROID_BROWSE_URL, ratings=True, require_apk=False, require_src=False, extract=True):
    page = 0
    nextUrl = url
    while nextUrl is not None:
        page += 1
        print("Scraping app index, page {}...".format(page))
        appLinks, names, packages, nextUrl = getAppLinks(nextUrl)
        print("Got page {}.".format(page))
        print('')

        print("Downloading source of all available apps on page {}...".format(page))
        for appLink, name, package in zip(appLinks, names, packages):
            print('')
            print('"{}"'.format(name))

            # skip if a directory with the same package name already exists
            downloadFilename = pathlib.Path(downloadPath) / package
            if downloadFilename.exists():
                print("\tPath {} already exists, skipping download...".format(downloadFilename))
                continue

            # fetch the google play rating so it can be saved to a file
            # (rating.json) in the same path
            if ratings:
                print("\tLooking for Google Play rating (as {})...".format(package))
                rating = getPlayStats(package)
                if rating is None:
                    print("\tCouldn't find rating data on the Google Play store.")
                    print("\tSkipping download...")
                    continue

            print("\tMaking a directory for the application: {}...".format(downloadFilename))
            downloadFilename.mkdir()

            if ratings:
                ratingFilename = downloadFilename / "rating.json"
                print("\tSaving rating to file ({})...".format(ratingFilename))
                with ratingFilename.open('x') as f:
                    json.dump(rating, f)
                print("\tDone.")
                print('')

            print("\tGetting remote link to apk...")
            apkLink = getLink(appLink, "download apk")
            if apkLink is None:
                print("\tNo APK available for \"{}\" from f-droid.org.".format(name))
                print("\tConsider visiting the f-droid detail page manually at:")
                print("\t\t{}".format(appLink))
                print("\tand looking for the link to the apk.")
                print('')
                if require_apk:
                    print("\tSkipping...")
                    continue

            print("\tGetting remote link to source...")
            srcLink = getLink(appLink, "source tarball")
            if srcLink is None:
                print("\tNo source code available for \"{}\" from f-droid.org.".format(name))
                print("\tConsider visiting the f-droid detail page manually at:")
                print("\t\t{}".format(appLink))
                print("\tand looking for the link to the source tarball.")
                print('')
                if require_src:
                    print("\tSkipping...")
                    continue

            # build a filesystem-safe filename from the app name
            safename = ''
            for c in name:
                if c.isalnum():
                    safename += c.lower()
                elif c.isspace():
                    safename += '_'

            # download the apk, if a link was found
            if apkLink is not None:
                getFile(apkLink, downloadFilename / (safename + ".apk"))

            # download the source, if a link was found
            if srcLink is not None:
                (getArchive if extract else getFile)(srcLink, downloadFilename / (safename + ".tar.gz"))

        print("Page {} complete.".format(page))
        print('')
    print("Downloaded {} pages of apps to {}".format(page, downloadPath))
if __name__ == "__main__":
    args = docopt(__doc__, version=VERSION)

    downloadPath = pathlib.Path(args["DOWNLOAD_PATH"])
    try:
        downloadPath.mkdir()
    except FileExistsError:
        pass

    getAllApps(downloadPath,
               ratings=args['--ratings'],
               require_apk=args['--require-apk'],
               require_src=args['--require-src'],
               extract=not args['--no-extract'])
    print("\nExiting...")