In [1]:
import pickle as pkl
from getpass import getpass
from time import sleep

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def parse(html):
    """Extract the reviewer's job-line text from a review page.

    Looks up the first ``<span>`` whose class is
    ``common__EiReviewDetailsStyle__newUiJobLine`` and returns its text.

    Args:
        html: Page markup handed to BeautifulSoup.

    Returns:
        The text of the matching span, or ``None`` (after printing a
        message) when the page contains no such span.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Find the span element that carries the job line
    job_line = soup.find('span', class_='common__EiReviewDetailsStyle__newUiJobLine')
    if job_line is None:
        # No matching span: report it and return None explicitly instead of
        # hiding every possible exception behind a bare ``except``.
        print("Error parsing for location")
        return None
    return job_line.text

In [2]:
# Read the token without echoing it, so the secret never ends up in the
# notebook's saved cell output.
API_TOKEN = getpass("Enter your API token: ")

In [3]:
# Accumulates the review records returned by every downloaded page.
dataset = []

### Pool the reviews
This code pools the reviews and creates a separate CSV file for each company. We keep only *US-based*, *English* reviews.

In [4]:
url = "https://wextractor.com/api/v1/reviews/indeed"

##Specify page numbers
POSTS_PER_PAGE = 20
N_PAGES = 125  # how many result pages to request
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the loop forever
offset_range = [i * POSTS_PER_PAGE for i in range(0, N_PAGES)]  # starting offset of each page

for offset in tqdm(offset_range, desc="Downloading data"):
    # NOTE: `params` is read again by later cells (params["id"] names the
    # output files), so the name must stay defined after the loop.
    params = {
        "id": "Postmates",
        "auth_token": API_TOKEN,
        "offset": offset,
        "country": 'US'
    }

    # Make the API request; a single failed page should not abort the whole run
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("Error:", exc)
        continue

    # Process the response
    if response.status_code == 200:
        data = response.json()
        ### Add to the list (tolerate a payload without a "reviews" entry)
        dataset.extend(data.get("reviews") or [])

    else:
        print("Error:", response.status_code)


Downloading data: 100%|██████████| 125/125 [04:22<00:00,  2.10s/it]


In [5]:
# Persist the downloaded review records so the download need not be repeated.
pickle_path = "indeed_%s_reviews.pkl" % params["id"]
with open(pickle_path, "wb") as pickle_file:
    pkl.dump(dataset, pickle_file)

### Convert to Dataframe

In [6]:
# One row per unique review id, indexed by that id.
df = pd.DataFrame(dataset).drop_duplicates(subset=['id']).set_index('id')
# Keep English reviews only. The explicit copy makes the filtered frame
# independent, so adding a column below does not raise SettingWithCopyWarning.
df = df[df["language"] == "en"].copy()
# "state" is the text after the last comma of the location, trimmed and
# lower-cased. The vectorised .str accessor leaves missing locations as NaN
# instead of crashing on a non-string value.
df["state"] = df["location"].str.split(",").str[-1].str.strip().str.lower()
df.to_csv("indeed_%s_reviews.csv" %params["id"])

In [7]:
# Show the filtered review table as this cell's output.
df

Unnamed: 0_level_0,title,text,rating,language,reviewer,location,cons,pros,url,datetime,reviewer_employee_type,job_work_and_life_balance_rating,compensation_and_benefits_rating,job_security_and_advancement_rating,management_rating,job_culture_rating,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1h3thou16k27a802,A very good place to work before the merger.,This was a very good job for both IT and drive...,5,en,Sr. Sales Account Executive,Ohio,Merger and mass layoffs,"Good pay, remote work",https://indeed.com/cmp/Postmates/reviews/a-ver...,2023-06-26T00:00:00,Former Employee,5,5,1,1,1,ohio
1h2uffrq2ln2p800,"Not many issues at this job, decent place to w...","Not a bad side job, but when you add the gas p...",3,en,Independent Contractor,"Las Vegas, NV",The change in payouts with almost no warning,Decent schedule and an easy reach customer sup...,https://indeed.com/cmp/Postmates/reviews/not-m...,2023-06-14T00:00:00,Former Employee,3,3,3,4,4,nv
1h2qg5bq8281h000,Meh,Nothing too crazy about the company. Mostly us...,3,en,Delivery Driver,"Kingman, AZ",,,https://indeed.com/cmp/Postmates/reviews/meh?i...,2023-06-13T00:00:00,Former Employee,1,1,1,1,1,az
1h2pcpfq0ir13800,Fine,"Not the best, flexible but little support. Ult...",2,en,Courier,"Los Angeles, CA",,,https://indeed.com/cmp/Postmates/reviews/fine?...,2023-06-12T00:00:00,Former Employee,0,0,0,0,0,ca
1h21f2d63ipap800,Be your boss set your own hours don’t get bett...,I loved working For postmates . I loved being...,5,en,Delivery Driver,"Chesapeake, VA",,,https://indeed.com/cmp/Postmates/reviews/be-yo...,2023-06-03T00:00:00,Former Employee,0,0,0,0,0,va
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192lng23oak8g8k6,Courier Services,Courier Services. Lots of driving and discover...,3,en,Courier Services,California,,,https://indeed.com/cmp/Postmates/reviews/couri...,2014-09-25T00:00:00,Current Employee,5,1,2,2,3,california
1929gco5pak8gcul,"Nice idea, but you won't make $20/per hour on ...",I am currently an independent contractor with ...,2,en,Courier,"Chicago, IL",Inconsistent income and average pay that is lo...,Independent contractor so can set your own hou...,https://indeed.com/cmp/Postmates/reviews/nice-...,2014-09-20T00:00:00,Current Employee,2,2,4,3,4,il
18u01k0du5n8g8o2,creative,We link people with products they want in less...,5,en,Independent Contractor,"Washington, DC",,,https://indeed.com/cmp/Postmates/reviews/creat...,2014-07-28T00:00:00,Current Employee,5,5,5,5,5,dc
18jqqakap5n8gf7f,"Fun, friendly, and wonderful team work.",I normally work on my own hours which is on th...,5,en,Independent Contractor,"Washington, DC",On call,good exercise and wonderful team supports,https://indeed.com/cmp/Postmates/reviews/fun-f...,2014-03-24T00:00:00,Current Employee,0,0,0,0,0,dc


#### Archive

In [14]:
# Create cloudscraper instance
# Or: scraper = cloudscraper.CloudScraper() # CloudScraper inherits from requests.Session
scraper = cloudscraper.create_scraper()
# On a first run nothing has been scraped yet: create the column so the
# resume check below does not fail with a KeyError.
if "raw_meta" not in df.columns:
    df["raw_meta"] = ""
# Start scraping (total= lets tqdm show a real progress bar and ETA)
for i, row in tqdm(df.iterrows(), total=len(df), desc="Scraping locations"):
    ###### in case of error #################
    # aka if we already have the data -> skip
    if row["raw_meta"] != "":
        continue
    #########################################
    try:
        # The row already holds the url; the timeout keeps one stalled
        # request from blocking the whole run.
        page = scraper.get(row["url"], timeout=30)
    except requests.RequestException as exc:
        # Leave raw_meta empty so that re-running the cell retries this row.
        print("Error:", exc)
        continue
    loc = parse(page.content)
    print(loc)
    df.loc[i, "raw_meta"] = loc


Scraping locations: 954it [11:08,  1.87s/it]

Feb 20, 2022 - Coordinator in Warsaw, Masovia


Scraping locations: 955it [11:10,  1.73s/it]

Feb 20, 2022 - Team Leader in Angeles, Pampanga


Scraping locations: 956it [11:12,  1.80s/it]

Feb 20, 2022 - Community Operations Manager in Sydney


Scraping locations: 957it [11:13,  1.75s/it]

Feb 20, 2022 - Team Lead in Phoenix, AZ


Scraping locations: 958it [11:14,  1.59s/it]

Feb 20, 2022 - Regional Head of Operations in Amsterdam


Scraping locations: 959it [11:16,  1.55s/it]

Feb 20, 2022 - Delivery Driver 


Scraping locations: 960it [11:18,  1.42it/s]

Feb 20, 2022 - Delivery Driver 





In [27]:
def get_location(x):
    """Extract the location from a scraped job line.

    Job lines look like ``"Feb 20, 2022 - Team Lead in Phoenix, AZ"``; the
    location is whatever follows the last standalone word "in".

    Args:
        x: Raw job-line text. Non-string values (``None``, NaN) are accepted.

    Returns:
        The stripped location, or ``None`` when ``x`` is not a string, has no
        " in " separator, or has nothing after the separator.
    """
    if not isinstance(x, str):
        return None
    # Split on the whole word " in ", not the bare substring "in": otherwise
    # "Washington, DC" is cut down to "gton, DC" and a title such as
    # "Admin Assistant" is mistaken for a line that carries a location.
    _, separator, location = x.rpartition(" in ")
    if not separator:
        return None
    return location.strip() or None
# Derive a location for every row from its scraped job line, then save the frame.
locations = df.apply(lambda record: get_location(record["raw_meta"]), axis=1)
df["location"] = locations
df.to_csv("glassdoor_2500_3500_with_loc.csv")

id
66362518               None
66359165               None
66344375    Los Angeles, CA
66335091       New York, NY
66329753               None
                 ...       
60008846             Sydney
60005224        Phoenix, AZ
59999562          Amsterdam
59999533               None
59996398               None
Name: location, Length: 960, dtype: object