Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1 #2

Merged
merged 4 commits into from
Feb 2, 2020
Merged

#1 #2

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 22 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,28 @@ Python based Google Play Crawler

## Example of crawled data:

App name: Amazon Shopping <br>
Installs Range: 50,000,000 - 100,000,000<br>
Rating Value: 4.268221378326416<br>
Rating Count: 449554<br>
Reviews Count: 449,554<br>
Rating: 5 <br>
Rating count: 281,323<br>
Rating: 4 <br>
Rating count: 86,924<br>
Rating: 3 <br>
Rating count: 34,203<br>
Rating: 2 <br>
Rating count: 14,772<br>
Rating: 1 <br>
Rating count: 32,332<br>
Author Name: michelle slaughter<br>
Review Date: 2 June 2016<br>
Reviewer Link: /store/apps/details?id=com.amazon.mShop.android.shopping&reviewId=Z3A6QU9xcFRPSEdKRjFBeEJjVFNZMHdfLUptRnprTkhnOGpacWhaSzhUb1NOa29Ca3lYeEtxZi1PeXZZUXVtdVlieExuaG1wbmtSRF83emZLeE1iRXg3dVE<br>
Reviewer Ratings: 3<br>
Review Title: Alright<br>
Review Body: Alright The app itself is great. This is my first user on Amazon after being a longtime user of Ebay. Just not too happy with delivery as promised. I ordered 2 instock items directly from Amazon on may 30th, says will ship today (june 2) and have it by the 6th..has yet to be shipped. Funny thing is the next day the 31st I ordered 2 items from a 3rd party on Amazon and they shipped yesterday. Sneaky part tho offering 30 days free of Prime, have to give credit card. I did, mine is expired so they cancelled free trial Full Review <br>
Developer Reply: <br>
App name: Flipkart Online Shopping App<br>
Installs Range: 100,000,000+<br>
Rating Value: 4.455704689025879<br>
Reviews Count: 7689096<br>
Rating: 5<br>
Rating count: 5,003,740<br>
Rating: 4<br>
Rating count: 1,836,612<br>
Rating: 3<br>
Rating count: 465,871<br>
Rating: 2<br>
Rating count: 114,706<br>
Rating: 1<br>
Rating count: 268,167<br>
<br>
review:1<br>
Author Name: <br>
Review Date:<br>
Reviewer Ratings:<br>
Review Body: <br>
Developer Reply: <br>


## Requirements

Expand Down
187 changes: 99 additions & 88 deletions crawl_play_store.py
Original file line number Diff line number Diff line change
@@ -1,88 +1,99 @@
import time
from bs4 import BeautifulSoup
import sys, io
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.proxy import *

# @author Ranjeet Singh <ranjeetsingh867@gmail.com>
# Modify it according to your requirements

# Number of times the "next reviews" button is clicked for each app.
no_of_reviews = 1000

# Maps every code point above the Basic Multilingual Plane to U+FFFD.
# NOTE(review): nothing in this script reads it -- presumably intended for
# str.translate() on consoles that cannot render astral characters; confirm.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')

wait = WebDriverWait(driver, 10)


# Append your app store urls here
urls = ["https://play.google.com/store/apps/details?id=com.flipkart.android&hl=en",
        "https://play.google.com/store/apps/details?id=com.amazon.mShop.android.shopping"]

# Crawl each app page: print the app summary, the rating histogram, then
# every review that is present after paging through the reviews widget.
# The try/finally guarantees the browser is closed even when a lookup fails.
try:
    for url in urls:

        driver.get(url)
        page = driver.page_source

        soup_expatistan = BeautifulSoup(page, "html.parser")

        expatistan_table = soup_expatistan.find("div", class_="id-app-title")
        print("App name: ", expatistan_table.string)

        expatistan_table = soup_expatistan.find("div", itemprop="numDownloads")
        print("Installs Range: ", expatistan_table.string)

        expatistan_table = soup_expatistan.find("meta", itemprop="ratingValue")
        print("Rating Value: ", expatistan_table['content'])

        expatistan_table = soup_expatistan.find("meta", itemprop="ratingCount")
        print("Rating Count: ", expatistan_table['content'])

        expatistan_table = soup_expatistan.find("span", class_="reviews-num")
        print("Reviews Count: ", expatistan_table.string)

        soup_histogram = soup_expatistan.find("div", class_="rating-histogram")
        rating_bars = soup_histogram.find_all('div', class_="rating-bar-container")

        for rating_bar in rating_bars:
            print("Rating: ", rating_bar.find("span").text)
            print("Rating count: ", rating_bar.find("span", class_="bar-number").string)

        next_button = driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]')

        for _ in range(no_of_reviews):
            try:
                next_button.click()
            except Exception:
                # Deliberate best effort: when the click fails (for instance
                # the button is not interactable yet) pause and carry on.
                time.sleep(5)

        reviews_div = driver.find_element_by_xpath('//div[@data-load-more-section-id="reviews"]').get_attribute("innerHTML")
        soup_expatistan = BeautifulSoup(reviews_div, "html.parser")

        expand_pages = soup_expatistan.find_all("div", class_="single-review")

        for expand_page in expand_pages:
            # Text is printed as str; encoding to UTF-8 first would make
            # Python 3 print the bytes repr (b'...') instead of the text.
            print("Author Name: ", expand_page.find("span", class_="author-name").string)
            print("Review Date: ", expand_page.find("span", class_="review-date").string)
            print("Reviewer Link: ", expand_page.find("a", class_="reviews-permalink")['href'])
            # The star rating is read from the aria-label of the element that
            # follows the rating container; only its digits are kept.
            reviewer_ratings = expand_page.find("div", class_="review-info-star-rating").find_next()['aria-label']
            reviewer_ratings = ''.join(x for x in reviewer_ratings if x.isdigit())
            print("Reviewer Ratings: ", reviewer_ratings)
            print("Review Title: ", str(expand_page.find("span", class_="review-title").string))
            print("Review Body: ", expand_page.find("div", class_="review-body").text)
            # The developer reply, when present, is looked up under the
            # review's parent element.
            developer_reply = expand_page.find_parent().find("div", class_="developer-reply")
            if developer_reply is not None:
                print("Developer Reply: ", developer_reply.text)
            else:
                print("Developer Reply: ", "")
finally:
    driver.quit()


# coding: utf-8

# In[2]:

import time
from bs4 import BeautifulSoup
import sys, io
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.proxy import *

# @author Ranjeet Singh <ranjeetsingh867@gmail.com>
# Modify it according to your requirements

# NOTE(review): not read anywhere in this version of the script; the number
# of reviews collected is governed by the fixed scroll loop below.
no_of_reviews = 1000

# Maps every code point above the Basic Multilingual Plane to U+FFFD.
# NOTE(review): nothing in this script reads it -- presumably intended for
# str.translate() on consoles that cannot render astral characters; confirm.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
driver = webdriver.Chrome(r"C:\Users\user\Anaconda3\Scripts\chromedriver.exe")

wait = WebDriverWait(driver, 10)


# Append your app store urls here
urls = ["https://play.google.com/store/apps/details?id=com.flipkart.android&hl=en"]

# Crawl each app page: print the app summary and rating histogram, then open
# the "all reviews" view, scroll to load more entries and print each review.
# The try/finally guarantees the browser is closed even when a lookup fails.
try:
    for url in urls:

        driver.get(url)

        page = driver.page_source

        soup_expatistan = BeautifulSoup(page, "html.parser")

        expatistan_table = soup_expatistan.find("h1", class_="AHFaub")
        print("App name: ", expatistan_table.string)

        # The install range is taken positionally: the fifth span.htlgb on
        # the page. NOTE(review): this breaks if the page layout shifts.
        expatistan_table = soup_expatistan.find_all("span", class_="htlgb")[4]
        print("Installs Range: ", expatistan_table.string)

        expatistan_table = soup_expatistan.find("meta", itemprop="ratingValue")
        print("Rating Value: ", expatistan_table['content'])

        expatistan_table = soup_expatistan.find("meta", itemprop="reviewCount")
        print("Reviews Count: ", expatistan_table['content'])

        soup_histogram = soup_expatistan.find("div", class_="VEF2C")
        rating_bars = soup_histogram.find_all('div', class_="mMF0fd")

        for rating_bar in rating_bars:
            print("Rating: ", rating_bar.find("span").text)
            print("Rating count: ", rating_bar.find("span", class_="L2o20d").get('title'))

        # open all reviews (separate name so the loop variable stays intact)
        reviews_url = url + '&showAllReviews=true'
        driver.get(reviews_url)
        time.sleep(5)  # wait dom ready
        for _ in range(1, 10):
            # scroll to the bottom to load other reviews
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(1)
        page = driver.page_source

        soup_expatistan = BeautifulSoup(page, "html.parser")
        expand_pages = soup_expatistan.find_all("div", class_="d15Mdf")
        # Incremented only after a review prints completely, so a skipped
        # review does not consume a number.
        counter = 1
        for expand_page in expand_pages:
            try:
                print("\n===========\n")
                print("review:" + str(counter))
                print("Author Name: ", expand_page.find("span", class_="X43Kjb").text)
                print("Review Date: ", expand_page.find("span", class_="p2TkOb").text)
                # Reviewer link: no matching element was found in this markup.
                # The star rating is read from the aria-label of the element
                # that follows the rating container; text after the first
                # "(" is dropped and only the digits are kept.
                reviewer_ratings = expand_page.find("div", class_="pf5lIe").find_next()['aria-label']
                reviewer_ratings = reviewer_ratings.split('(')[0]
                reviewer_ratings = ''.join(x for x in reviewer_ratings if x.isdigit())
                print("Reviewer Ratings: ", reviewer_ratings)
                # Review title: no matching element was found in this markup.
                print("Review Body: ", expand_page.find("div", class_="UD7Dzf").text)
                # The developer reply, when present, is looked up under the
                # review's parent element.
                developer_reply = expand_page.find_parent().find("div", class_="LVQB0b")
                if developer_reply is not None:
                    print("Developer Reply: \n", developer_reply.text)
                else:
                    print("Developer Reply: ", "")
                counter += 1
            except Exception as err:
                # Best effort per review: a malformed entry must not stop the
                # crawl, but report it instead of hiding it. Catching
                # Exception (not a bare except) lets Ctrl-C still interrupt.
                print("Skipping review {}: {!r}".format(counter, err), file=sys.stderr)
finally:
    driver.quit()