# Glassdoor Reviews WebScraper

### Created by Shaik Nawazuddin

Importing all the necessary Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

Specifying User-Agent

Search the web for 'my user agent' and paste the string it reports as the value of the 'User-Agent' key below.

In [2]:
# HTTP headers passed to requests.get(); the User-Agent string is that of a
# desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/105.0.0.0 Safari/537.36'
    ),
}

Creating a temporary list to collect the review data scraped from each page

In [3]:
# Accumulator for the scraped reviews: one dict is appended per review.
temp = []

Specify the last page number

In [4]:
# First page number to scrape.
start_page = 1

print('Please enter the last page number')
last_page = int(input())

# A last page below start_page would give an empty range: nothing would be
# scraped and the failure would only surface later, so reject it here.
if last_page < start_page:
    raise ValueError(
        'The last page number must be at least ' + str(start_page)
        + ', got ' + str(last_page)
    )

# range() excludes its stop value, so add one to include the last page.
end_page = last_page + 1

Please enter the last page number
5


Example:

https://www.glassdoor.co.in/Reviews/Glassdoor-Reviews-E100431.htm

If the link looks like the one above, take only the 'https://www.glassdoor.co.in/Reviews/Glassdoor-Reviews-E100431' part (everything before '.htm') as first_half. I'd recommend leaving second_half unchanged unless a specific filter has been applied.

In [5]:
# Kindly specify the correct URL link in the right format without including the page number as shown in the example
first_half = 'https://www.glassdoor.co.in/Reviews/Glassdoor-Reviews-E100431'
second_half = '.htm?filter.iso3Language=eng'

# Seconds to wait for the server. Without a timeout requests.get() can block
# indefinitely and the Timeout handler below would never be reached.
REQUEST_TIMEOUT = 30
# Seconds to pause after a connection problem or timeout before the next page.
ERROR_PAUSE = 30


def _class_text(node, class_name):
    """Return the text of the first element under `node` having CSS class
    `class_name`, or None when no such element exists."""
    found = node.find(class_=class_name)
    return None if found is None else found.get_text()


def _selected_text(node, selector):
    """Return the text of the first element under `node` matching the CSS
    `selector`, with tabs removed and line breaks turned into spaces.
    Returns None when nothing matches."""
    matches = node.select(selector)
    if not matches:
        return None
    return matches[0].text.replace('\t', '').replace('\r', ' ').replace('\n', ' ')


for page in range(start_page, end_page):
    print('Page number is: ', page)

    # Fetch the page. On any failure skip to the next page: there is no
    # response to parse, and re-parsing the previous page's response would
    # append duplicate rows.
    try:
        html = requests.get(
            first_half + '_P' + str(page) + second_half,
            verify=True,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        html.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print('HTTP Error: ', errh)
        continue
    except requests.exceptions.ConnectionError as errc:
        print('Error Connecting: ', errc)
        time.sleep(ERROR_PAUSE)
        continue
    except requests.exceptions.Timeout as errt:
        print('Timeout Error: ', errt)
        time.sleep(ERROR_PAUSE)
        continue
    except requests.exceptions.RequestException as err:
        print('Something else: ', err)
        continue

    bsobj = BeautifulSoup(html.text, 'html.parser')
    reviews = bsobj.find_all(class_='gdReview')

    for review in reviews:
        # Each field is None when the corresponding element is missing.
        desc = _class_text(review, 'reviewLink')

        # The element text is split into date and job title at the first
        # hyphen only, so hyphenated job titles are kept whole.
        # Both stay None when the element or the hyphen is missing.
        review_date = None
        reviewer_job_title = None
        date_job_title = _class_text(review, 'authorJobTitle')
        if date_job_title is not None and '-' in date_job_title:
            date_part, _, title_part = date_job_title.partition('-')
            review_date = date_part.strip()
            reviewer_job_title = title_part.strip()

        location = _class_text(review, 'authorLocation')
        rate = _class_text(review, 'ratingNumber')
        # Always bound for this review, so a missing element can neither raise
        # NameError nor reuse the previous review's value.
        state = _class_text(review, 'pt-xsm')
        pros = _selected_text(review, 'span[data-test="pros"]')
        cons = _selected_text(review, 'span[data-test="cons"]')

        data = {
                'Title': desc,
                'Date': review_date,
                'Job Title': reviewer_job_title,
                'Location' : location,
                'Rating': rate,
                'State and Experience': state,
                'Pros' : pros,
                'Cons': cons
            }
        temp.append(data)

Page number is:  1
Page number is:  2
Page number is:  3
Page number is:  4
Page number is:  5


Loading the temp data into a DataFrame

In [6]:
# Build the frame with explicit columns so that an empty `temp` (nothing was
# scraped) still yields a frame with the expected columns instead of a KeyError.
df_temp = pd.DataFrame(
    temp,
    columns=['Title', 'Date', 'Job Title', 'Location', 'Rating',
             'State and Experience', 'Pros', 'Cons'],
)

# Split 'State and Experience' at the first ', ' only (n=1), so one value with
# an extra comma cannot break the split for every row. reindex() guarantees
# both result columns exist even when no value contains ', '.
state_parts = (
    df_temp['State and Experience']
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)

# assign() returns a new frame, so df_temp is not mutated and re-running this
# cell gives the same result.
df = df_temp.assign(**{
    'State of Reviewer': state_parts[0],
    'Experience': state_parts[1],
})[['Title', 'Date', 'Job Title', 'Location', 'Rating', 'State of Reviewer', 'Experience', 'Pros', 'Cons']]
df

Unnamed: 0,Title,Date,Job Title,Location,Rating,State of Reviewer,Experience,Pros,Cons
0,Glassdoor leads with transparency!,"Jul 12, 2022",Senior Customer Success Manager,Melbourne,5.0,Current Employee,more than 3 years,- I genuinely enjoy my day-to-day job & cowork...,- There can be delays due to internal system e...
1,Great company,"Jul 19, 2022",Technical Support,,5.0,Current Employee,,You can build a good network here that always ...,Nothing i can think of
2,"Very good company, many opportunities available","May 25, 2021",Anonymous Employee,,5.0,Current Employee,,"Good company, friendly , great exposure, happy...",No cons to be specified
3,Crazy amazing people.,"Jan 15, 2018",Research Associate,,5.0,Current Employee,,"Now for an organisation running the show ,it i...","You don't wanna learn , not for you."
4,Wonderful Internship experience,"Mar 26, 2017",,,4.0,Former Employee,,Open organization flexible timings. Company ta...,None as such everyone enjoys here
5,Get What You Give,"Aug 26, 2022",Enterprise Employer Branding Specialist,,5.0,Current Employee,less than 1 year,The people here are everything; I've never met...,You have to wear a lot of hats in this role. T...
6,Great Collaborative Environment,"Aug 16, 2022",Engineer,,5.0,Current Employee,less than 1 year,"I love working here, it's the best job I've ha...",People are very focused on their work here so ...
7,Meaningful mission and authentic people,"Aug 29, 2022",Lead Product Designer,"Los Angeles, CA",4.0,Current Employee,,Everyone is really bought into the mission.,Comp and pay are a bit on the lower end of the...
8,Great Culture - but sales rep are not prepared...,"Aug 18, 2022","Account Manager, Upper Mid Market",,2.0,Current Employee,more than 1 year,Glassdoor is one of the best companies I have ...,"But in regards to overall sales experience, I ..."
9,Great place to work at!,"Aug 3, 2022",Anonymous Employee,,5.0,Current Employee,,I miss the days working at the Mill Valley off...,not data driven; hard to grow a career here


Exporting the DataFrame to an excel file

In [7]:
# Write the final frame to an Excel workbook, leaving out the index column.
df.to_excel(excel_writer='Glassdoor Reviews.xlsx', index=False)