In [1]:
import pickle as pkl
from getpass import getpass
from time import sleep

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def parse(html):
    """Return the text of the review's job-line span, or None if it is absent.

    The page is parsed with ``html.parser`` and searched for the first
    ``<span>`` whose class is ``common__EiReviewDetailsStyle__newUiJobLine``.

    Args:
        html: Page markup, passed straight to ``BeautifulSoup``.

    Returns:
        The ``.text`` of the matching span, or ``None`` when no such span
        exists (in which case "Error parsing for location" is printed).
    """
    soup = BeautifulSoup(html, 'html.parser')
    job_line = soup.find('span', class_='common__EiReviewDetailsStyle__newUiJobLine')
    # An explicit None check replaces the former bare `except:`, which also
    # swallowed KeyboardInterrupt and hid unrelated errors.
    if job_line is None:
        print("Error parsing for location")
        return None
    return job_line.text

In [2]:
# Prompt without echoing: input() would leave the typed token visible in the
# cell output that is saved with the notebook.
API_TOKEN = getpass("Enter your API token: ")

In [3]:
# Accumulator for the raw review records; starts out empty.
dataset = list()

### Pool the reviews
This code pools the reviews for one company at a time and writes a separate CSV file for each company. We keep only *US-based* and *English* reviews.

In [4]:
url = "https://wextractor.com/api/v1/reviews/indeed"

# Company whose reviews are downloaded; later cells read it back via params["id"].
COMPANY_ID = "Postmates"

## Specify page numbers
POSTS_PER_PAGE = 20
N_PAGES = 125
# Seconds to wait for the API; without a timeout a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT = 30
offset_range = [i * POSTS_PER_PAGE for i in range(N_PAGES)]  # offsets 0, 20, 40, ...

for offset in tqdm(offset_range, desc="Downloading data"):
    params = {
        "id": COMPANY_ID,
        "auth_token": API_TOKEN,
        "offset": offset,
        "country": 'US'
    }

    # Make the API request; a network failure on one page is reported and
    # skipped so the pages already collected are not lost.
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("Error:", exc)
        continue

    # Process the response
    if response.status_code != 200:
        print("Error:", response.status_code)
        continue

    data = response.json()
    reviews = data.get("reviews")
    if reviews is None:
        # Formerly a KeyError that aborted the whole download.
        print("Error: no 'reviews' in response at offset", offset)
        continue

    ### Add to the list
    dataset.extend(reviews)


Downloading data: 100%|██████████| 125/125 [04:22<00:00,  2.10s/it]


In [87]:
# Persist the raw review list; the file name embeds the company id
# taken from the last request's parameters.
pickle_path = "indeed_%s_reviews.pkl" % params["id"]
with open(pickle_path, "wb") as pickle_file:
    pkl.dump(dataset, pickle_file)

### Convert to DataFrame

In [88]:
# One row per unique review id, indexed by that id.
df = pd.DataFrame(dataset).drop_duplicates(subset=['id']).set_index('id')
# Keep English reviews only; .copy() gives an independent frame to add columns to.
df = df[df["language"] == "en"].copy()
# "state" is the lower-cased text after the last comma of "location".
# A location without a comma (e.g. "Chicago") is kept whole, so the column is
# not guaranteed to hold a state code. The vectorised .str accessor leaves
# missing locations as NaN instead of raising, and also works on an empty frame.
df["state"] = df["location"].str.split(",").str[-1].str.strip().str.lower()
df.to_csv("indeed_%s_reviews.csv" %params["id"])

In [83]:
# Preview the review frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0_level_0,title,text,rating,language,reviewer,location,cons,pros,url,datetime,reviewer_employee_type,job_work_and_life_balance_rating,compensation_and_benefits_rating,job_security_and_advancement_rating,management_rating,job_culture_rating,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1h58962o0k7ja800,Fun place,Grub hub a fun place to work for a little extr...,4,en,Service Advisor,"Central Islip, NY",,,https://indeed.com/cmp/Grubhub/reviews/fun-pla...,2023-07-13T00:00:00,Former Employee,0,0,0,0,0,ny
1h4otgcarj21i801,Fun Side Job - Easy Cash for Fun but Not to Pa...,I was able to use this as a side job on top of...,3,en,Driver/Delivery,"Denver, CO",,,https://indeed.com/cmp/Grubhub/reviews/fun-sid...,2023-07-07T00:00:00,Former Employee,0,0,0,0,0,co
1h4knlsnhgfqq800,Fun job where you can make a good amount of mo...,What is the best part of working at the compan...,5,en,Delivery Driver,"Elyria, OH",,,https://indeed.com/cmp/Grubhub/reviews/fun-job...,2023-07-05T00:00:00,Current Employee,0,0,0,0,0,oh
1h4hqi9djk27a800,Cool until you don’t comply with vaxx,I liked working here up until the point the I ...,3,en,Senior Elite Restaurant Care Specialist,Chicago,,,https://indeed.com/cmp/Grubhub/reviews/cool-un...,2023-07-04T00:00:00,Former Employee,0,0,0,0,0,chicago
1h409dkphj5mt801,on the go job with lots of new learning experi...,"It has it's stressful moments, orders can ...",3,en,Delivery Driver,"Salem, OR",a lot of technical errors with the app,discount on gas,https://indeed.com/cmp/Grubhub/reviews/on-the-...,2023-06-28T00:00:00,Current Employee,5,1,4,3,5,or
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17v303tro5n8g80j,Fun workplace,the company had plenty of things to offer to k...,3,en,Customer Service Specialist lll,"Chicago, IL",,Free lunch at end of week. Gift card giveaways.,https://indeed.com/cmp/Grubhub/reviews/fun-wor...,2013-07-09T00:00:00,Current Employee,1,3,2,2,3,il
17trbgolladsg8t4,"Overall, a really great place to work",Great environment and people; Great benefits p...,4,en,Customer Service Specialist,"Chicago, IL",,,https://indeed.com/cmp/Grubhub/reviews/overall...,2013-06-24T00:00:00,Current Employee,2,5,3,5,3,il
17q85drf4b84g9jh,"Friendly Work Envornment, Great Training and A...",People are very friendly and nobody has a prob...,5,en,"Aliso Viejo, customer Service Rep",Aliso Viejo CA,Anybody is lucky to work here. I do need more ...,All around great company,https://indeed.com/cmp/Grubhub/reviews/friendl...,2013-05-10T00:00:00,Current Employee,5,5,5,5,5,aliso viejo ca
17m7a84d7adsgc77,Fun place to work filled with good people and ...,Good co-workers and fun environment. At Grubhu...,4,en,Graphic Designer,Chicago IL,,,https://indeed.com/cmp/Grubhub/reviews/fun-pla...,2013-03-21T00:00:00,Former Employee,4,4,3,4,4,chicago il


#### Archive

In [14]:
# Create cloudscraper instance
# (Or: scraper = cloudscraper.CloudScraper() -- CloudScraper inherits from requests.Session)
scraper = cloudscraper.create_scraper()

# A frame that has never been scraped has no "raw_meta" column; create it
# empty so the "already scraped -> skip" check below cannot raise KeyError.
if "raw_meta" not in df.columns:
    df["raw_meta"] = ""

# Start scraping. `total` lets tqdm show a percentage and ETA, which it
# cannot infer from the iterrows() generator.
for i, row in tqdm(df.iterrows(), total=len(df), desc="Scraping locations"):
    ###### in case of error #################
    # aka if we already have the data -> skip
    if row["raw_meta"] != "":
        continue
    #########################################
    page = scraper.get(row["url"])
    # parse() returns None when the page has no job-line span; that None is
    # stored as-is, so the row counts as done on a re-run.
    loc = parse(page.content)
    print(loc)
    df.loc[i, "raw_meta"] = loc


Scraping locations: 954it [11:08,  1.87s/it]

Feb 20, 2022 - Coordinator in Warsaw, Masovia


Scraping locations: 955it [11:10,  1.73s/it]

Feb 20, 2022 - Team Leader in Angeles, Pampanga


Scraping locations: 956it [11:12,  1.80s/it]

Feb 20, 2022 - Community Operations Manager in Sydney


Scraping locations: 957it [11:13,  1.75s/it]

Feb 20, 2022 - Team Lead in Phoenix, AZ


Scraping locations: 958it [11:14,  1.59s/it]

Feb 20, 2022 - Regional Head of Operations in Amsterdam


Scraping locations: 959it [11:16,  1.55s/it]

Feb 20, 2022 - Delivery Driver 


Scraping locations: 960it [11:18,  1.42it/s]

Feb 20, 2022 - Delivery Driver 





In [27]:
def get_location(x):
    """Extract the location that follows the last " in " of a metadata line.

    Example: "Feb 20, 2022 - Team Lead in Phoenix, AZ" -> "Phoenix, AZ".

    Args:
        x: Scraped metadata text. Non-string values (None, NaN) are accepted.

    Returns:
        The stripped text after the last " in ", or None when ``x`` is not a
        string, contains no " in " separator, or has nothing after it.
    """
    if not isinstance(x, str):
        return None
    # Match the whole word " in ": testing for "in " and splitting on "in"
    # mangled any location containing those letters ("Berlin" -> "",
    # "Washington, DC" -> "gton, DC") and matched titles like "Admin Assistant".
    if " in " not in x:
        return None
    # Split once from the right so an " in " inside the job title is ignored.
    location = x.rsplit(" in ", 1)[-1].strip()
    return location or None
# Derive each row's location from its scraped metadata text, then save the frame.
extracted_locations = df.apply(lambda row: get_location(row["raw_meta"]), axis=1)
df["location"] = extracted_locations
output_csv = "glassdoor_2500_3500_with_loc.csv"
df.to_csv(output_csv)

id
66362518               None
66359165               None
66344375    Los Angeles, CA
66335091       New York, NY
66329753               None
                 ...       
60008846             Sydney
60005224        Phoenix, AZ
59999562          Amsterdam
59999533               None
59996398               None
Name: location, Length: 960, dtype: object