# App Reviews with Selenium Web Scraper

- Selenium: https://selenium-python.readthedocs.io/
- Beautiful Soup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
from datetime import datetime
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Start a Chrome session and open the Sensor Tower review-history page
# (reviews tab) for the Robinhood iOS app.
reviews_url = 'https://sensortower.com/ios/US/robinhood-markets-inc/app/robinhood-investing-for-all/938003185/review-history?selected_tab=reviews'

driver = webdriver.Chrome()
driver.get(reviews_url)

In [3]:
def get_page():
    """Parse the review table currently rendered in the browser.

    Reads ``driver.page_source`` from the module-level Selenium ``driver``
    and converts every ``tbody tr`` row into a dict with the keys
    ``Country``, ``Date``, ``Rating``, ``Review`` and ``Version``.

    ``Rating`` holds the raw inline ``style`` attribute of the row's
    ``.gold`` element (e.g. ``'width: 99%;'``), or ``None`` when the element
    or attribute is absent. ``Review`` is the text of the
    ``.break-wrap-review`` element, falling back to the whole cell's text
    when that element is absent.

    Rows with fewer than five ``<td>`` cells are skipped instead of raising
    ``IndexError``.

    Returns:
        list[dict]: one dict per parsed row, in document order.
    """
    doc = BeautifulSoup(driver.page_source, features='html.parser')

    reviews = []
    for row in doc.select("tbody tr"):
        cells = row.select("td")
        if len(cells) < 5:
            # Not a complete review row; indexing cells[0..4] would fail.
            continue

        # select_one returns None when nothing matches, so guard before use.
        stars = cells[2].select_one('.gold')
        review_node = cells[3].select_one('.break-wrap-review')
        if review_node is None:
            review_node = cells[3]

        reviews.append({
            'Country': cells[0].text.strip(),
            'Date': cells[1].text.strip(),
            'Rating': stars.get('style') if stars is not None else None,
            'Review': review_node.text.strip(),
            'Version': cells[4].text.strip()
        })
    return reviews

In [4]:
# Walk through the paginated review table, collecting the rows of each page
# until the "next page" button is disabled.
all_reviews = []
wait = WebDriverWait(driver, 5, poll_frequency=0.5)
next_button_xpath = "//div[@class='filter-block top']//div[contains(@class, 'pagination')]/button[2]"

while True:
    # Wait (up to 5 s, polling every 0.5 s) for the loading overlay to go away
    # before reading the page source.
    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, '.ajax-loading-cover')))

    results = get_page()
    all_reviews.extend(results)

    # Second button of the top pagination block is treated as "next page".
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed from recent Selenium releases.
    next_button = driver.find_element(By.XPATH, next_button_xpath)

    # Stop when the button becomes disabled (no further pages).
    if next_button.get_attribute('disabled'):
        break
    next_button.click()
    # Fixed pause after the click; presumably gives the loading overlay time
    # to appear before the next invisibility wait -- TODO confirm.
    time.sleep(3)

In [5]:
# Build the review table from the collected row dicts. The column list is
# given explicitly so the frame keeps its five expected columns even when
# all_reviews is empty (otherwise later dataset.Rating access would fail).
dataset = pd.DataFrame(all_reviews, columns=['Country', 'Date', 'Rating', 'Review', 'Version'])

In [6]:
# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window.
driver.quit()

In [7]:
# (rows, columns) of the scraped review table.
dataset.shape

(27139, 5)

In [9]:
# Inspect the raw star-bar width styles (HTML star percentage) before converting them to numbers

dataset.Rating.value_counts()

width: 19%;    14900
width: 99%;     8623
width: 79%;     1549
width: 39%;     1099
width: 59%;      968
Name: Rating, dtype: int64

In [10]:
# Map each star-bar width style to its 1-5 star rating; values not listed in
# the mapping are left as they are.
rating_by_style = {
    'width: 19%;': 1,
    'width: 39%;': 2,
    'width: 59%;': 3,
    'width: 79%;': 4,
    'width: 99%;': 5,
}
dataset['Rating'] = dataset['Rating'].replace(rating_by_style)

In [11]:
# Summary statistics for the numeric columns of the review table.
dataset.describe()

Unnamed: 0,Rating
count,27139.0
mean,2.554
std,1.834617
min,1.0
25%,1.0
50%,1.0
75%,5.0
max,5.0


In [12]:
# Preview the first 100 rows to eyeball the scraped values.
dataset.head(100)

Unnamed: 0,Country,Date,Rating,Review,Version
0,US,03/18/2021,1,"Knowing my shares aren’t real, means RH IS MAN...",-
1,US,03/18/2021,1,This company is currently under investigation ...,-
2,US,03/18/2021,1,"They sell your data to MM, halt trading when i...",-
3,US,03/18/2021,1,Easy and simple to use but for the love of god...,-
4,US,03/18/2021,5,Easy to learn & Use,-
...,...,...,...,...,...
95,US,03/17/2021,1,Be careful with this app... worst to trade,9.6.0
96,US,03/17/2021,1,Robinhood let me buy stock of my choice and wh...,9.6.0
97,US,03/17/2021,1,this is the worst broker ever. during 2021 sto...,9.6.0
98,US,03/17/2021,5,"Easy interface, just beginning but still mobil...",9.5.0


In [13]:
# Parse the scraped date strings into pandas datetimes and store them back
# in the same column.
parsed_dates = pd.to_datetime(dataset['Date'])
dataset['Date'] = parsed_dates

In [14]:
# Show the first five rows to check the Date column after conversion.
dataset.head()

Unnamed: 0,Country,Date,Rating,Review,Version
0,US,2021-03-18,1,"Knowing my shares aren’t real, means RH IS MAN...",-
1,US,2021-03-18,1,This company is currently under investigation ...,-
2,US,2021-03-18,1,"They sell your data to MM, halt trading when i...",-
3,US,2021-03-18,1,Easy and simple to use but for the love of god...,-
4,US,2021-03-18,5,Easy to learn & Use,-


In [15]:
# Prefix the export with the current local time (YYYYMMDD_HHMM). The variable
# is named `timestamp` rather than `time` so it does not shadow the `time`
# module imported at the top, which the scraping loop needs for time.sleep().
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
dataset.to_csv(timestamp + "ios_app.csv", index=False)