# Introduction
***
Collect data from an external resource (a website) using Python. <br>
URL Target: <br>
https://www.indeed.com/cmp/Google/reviews?fcountry=ALL&start= <br>
We will collect employee review attributes: rating, review title, and review description (pros and cons are not extracted by the code below).

# Install Required Packages

# Include Libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as BS

# Start Scraping

In [2]:
def _nested_text(review, outer_tag, outer_attrs, inner_tag):
    """Return the text of the first ``inner_tag`` inside the matching ``outer_tag``, or None if either is missing."""
    try:
        return review.find(outer_tag, outer_attrs).find(inner_tag).text
    except AttributeError:
        # ``find`` returns None when nothing matches, so the chained attribute
        # access raises AttributeError for an absent element. Only that case
        # is treated as "field missing"; anything else propagates.
        return None


def scraping(full_url):
    """Parse one reviews page into a DataFrame with one row per review.

    Parameters
    ----------
    full_url : response-like object
        Despite the name, this is not a URL string: it must expose the raw
        page body as ``.content`` (the scraping loop passes the result of
        ``requests.get``).

    Returns
    -------
    pandas.DataFrame
        Columns ``rating``, ``rating_title`` and ``rating_descriptions``.
        A field that cannot be located in a review is stored as ``None``.
        The frame is empty when the page contains no review containers.
    """
    page_content = BS(full_url.content, 'lxml')
    # ``find_all`` is the current spelling; ``findAll`` is a deprecated alias.
    containers = page_content.find_all('div', {'itemprop': 'review'})

    data = []
    for item in containers:
        rating = _nested_text(item, 'div', {'itemprop': 'reviewRating'}, 'button')
        rating_title = _nested_text(item, 'h2', {'data-testid': 'title'}, 'span')
        review_descriptions = _nested_text(item, 'span', {'itemprop': 'reviewBody'}, 'span')
        if review_descriptions is not None:
            # Carriage returns inside the review body become sentence breaks.
            review_descriptions = review_descriptions.replace('\r', '. ')

        data.append([rating, rating_title, review_descriptions])

    return pd.DataFrame(
        columns=['rating', 'rating_title', 'rating_descriptions'], data=data
    )

In [3]:
base_url = 'https://www.indeed.com/cmp/Google/reviews?fcountry=ALL&start='

# Request the pages at start=20, 40, 60, 80 and 100 (five requests). The
# number of reviews returned per page is decided by the site.
# NOTE(review): the page at start=0 is never requested - confirm this is intended.
page_frames = []
for num_reviews in range(20, 101, 20):
    full_url = base_url + str(num_reviews)
    get_url = rq.get(full_url, timeout=5)
    # Fail loudly on a 4xx/5xx answer instead of silently parsing an error
    # page into zero reviews.
    get_url.raise_for_status()
    partial_review_df = scraping(get_url)
    page_frames.append(partial_review_df)

# Concatenate once at the end rather than growing a DataFrame inside the loop.
all_reviews_df = pd.concat(page_frames, ignore_index=True)


In [4]:
# Show the combined reviews gathered by the scraping loop. As the last
# expression in the cell, the DataFrame is rendered as the cell's output.
all_reviews_df

Unnamed: 0,rating,rating_title,rating_descriptions
0,5.0,Great culture great benefit,"Great company to work with, hoping to stay for..."
1,1.0,Run!!!! Never work here their turnover rate is...,Google the name is great but management is hor...
2,5.0,Excelente,"El trabajo es muy bueno, saben dividir entre e..."
3,4.0,Buena,Es una empresa muy buena con un buen ambiente ...
4,3.0,"Wonderful place, great people, poor management",The GCCs were a great place to work before COV...
...,...,...,...
100,3.0,Meh,Bureaucratic. Managers can be condescending. ...
101,4.0,Butler Service,I really enjoy coding at this amazing company....
102,5.0,fun place to work at,i learned to code better and allot of people a...
103,5.0,Great place,Love it here. Awesome company to work for. Opp...


In [5]:
# Save the reviews data into a CSV file (without the index column).
output_path = Path('dataset/reviews_dataset.csv')
# ``to_csv`` does not create missing directories, so make sure ``dataset/``
# exists; this keeps the cell working on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
all_reviews_df.to_csv(output_path, index=False)