In [1]:
import os
import time

import numpy as np
import pandas as pd

import ssl
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

## User Input

In [2]:
# Landing page of the company's reviews; per-page URLs are derived from it.
baseURL = "https://www.glassdoor.co.in/Reviews/Cognizant-Technology-Solutions-Reviews-E8014.htm"
# CSV file that scraped rows are appended to.
outputFile = 'mydata.csv'
# Short identifier stored in the 'company' field of every row.
companyName = 'CTS'
# Pause between consecutive page requests, in seconds.
waitTime = 30
# Columns of the output CSV, in the order they are written.
columnNames = [
    'pageNo',
    'company',
    'datetime',
    'summary',
    'overallRating',
    'WorkLifeBalance',
    'CultureValues',
    'CareerOpportunities',
    'CompensationBenefits',
    'SeniorManagement',
    'pros',
    'cons',
    'location',
    'jobtitle',
    'recommend',
    'outlook',
    'CEO',
]

In [3]:
# Maps the sub-rating label text found in a review to the output column name.
subRatingTitle = dict([
    ('Work/Life Balance', 'WorkLifeBalance'),
    ('Culture & Values', 'CultureValues'),
    ('Career Opportunities', 'CareerOpportunities'),
    ('Compensation and Benefits', 'CompensationBenefits'),
    ('Senior Management', 'SeniorManagement'),
])

In [4]:
# url = "https://www.glassdoor.co.in/Reviews/Cognizant-Technology-Solutions-Reviews-E8014_P21.htm?sort.sortType=RD&sort.ascending=false"
# hdr = {'User-Agent': 'Mozilla/5.0'}
# req = Request(url, headers=hdr)
# gcontext = ssl.SSLContext()
# page = urlopen(req, context=gcontext)
# soup = BeautifulSoup(page, "html.parser")

In [5]:
# reviews = soup.find_all('li', attrs={'class':'empReview'})
# review = reviews[2]

In [6]:
# recommends = review.find_all('i', attrs={'class':'sqLed middle sm mr-xsm green'})
# recommends[1].next_sibling.text

In [7]:
def download_page(url, timeout=60):
    """Fetch ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        BeautifulSoup tree built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the request cannot be completed (other
            ``OSError`` subclasses, e.g. a read timeout, are also possible).
    """
    # Send a browser-like User-Agent header with the request.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)
    # A bare ssl.SSLContext() is deprecated and verifies neither the
    # certificate nor the hostname; the default context checks both.
    gcontext = ssl.create_default_context()
    # Parse inside the with-block so the response is read before it is closed.
    with urlopen(req, context=gcontext, timeout=timeout) as page:
        soup = BeautifulSoup(page, "html.parser")
    return soup

In [9]:
def process_content(review):
    """Extract the fields of one review element into a flat dict.

    Elements that are absent from the markup leave their field blank instead
    of raising, so a single unusual review does not abort the whole page.

    Args:
        review: BeautifulSoup element holding a single review.

    Returns:
        dict with the keys 'company', 'location', 'jobtitle', 'recommend',
        'outlook', 'CEO', 'summary', 'datetime', 'overallRating', 'pros' and
        'cons', plus one key per recognised sub-rating present in the review.
    """
    # Start from blank defaults for the optional fields.
    content = {'company':companyName,
              'location':'',
              'jobtitle':'',
              'recommend':'',
              'outlook':'',
              'CEO':''}

    # The first link in the review carries the summary text.
    anchor = review.a
    content['summary'] = anchor.text if anchor is not None else ''
    stamp = review.time
    content['datetime'] = stamp.get('datetime', '') if stamp is not None else ''

    overallRating = review.find('span', attrs = {'class':'value-title'})
    content['overallRating'] = overallRating.get('title', '') if overallRating is not None else ''

    subRatings = review.find('div', attrs = {'class':'subRatings module stars__StarsStyles__subRatings'})
    if subRatings is not None:
        for rate in subRatings.find_all('div', attrs = {'class':'minor'}):
            column = subRatingTitle.get(rate.text)
            if column is None:
                # Label that has no column mapping: skip it.
                continue
            try:
                # The rating value is the 'title' of the node right after the label.
                content[column] = rate.next_sibling['title']
            except (KeyError, TypeError):
                # No sibling, a text-node sibling, or a sibling without a title.
                continue

    # First matching paragraph is the pros text, second is the cons text.
    proscons = review.find_all('p', attrs = {'class':'mt-0 mb-xsm v2__EIReviewDetailsV2__bodyColor v2__EIReviewDetailsV2__lineHeightLarge'})
    content['pros'] = proscons[0].text if len(proscons) > 0 else ''
    content['cons'] = proscons[1].text if len(proscons) > 1 else ''

    location = review.find('span', attrs={'class':'authorLocation'})
    if location is not None:
        content['location'] = location.text

    jobtitle = review.find('span', attrs={'class':'authorJobTitle middle reviewer'})
    if jobtitle is not None:
        content['jobtitle'] = jobtitle.text

    # NOTE(review): only the green indicators are selected here; if the site
    # uses other colours for other answers they are not captured - confirm.
    for recommend in review.find_all('i', attrs={'class':'sqLed middle sm mr-xsm green'}):
        # The label sits in the node following the indicator; a missing or
        # text-less sibling yields '' and matches none of the branches below.
        recommendText = getattr(recommend.next_sibling, 'text', '')
        if 'CEO' in recommendText:
            content['CEO'] = recommendText
        elif 'Outlook' in recommendText:
            content['outlook'] = recommendText
        elif 'Recommend' in recommendText:
            content['recommend'] = recommendText

    return content

In [10]:
#Define a function that is reusable. We need to scrape many pages of many companies
def scraper(url, pageNo):
    """Download one review page and return its reviews as a DataFrame.

    Args:
        url: Address of the review page to fetch.
        pageNo: Page number recorded in the 'pageNo' field of every row.

    Returns:
        pandas.DataFrame with one row per review element found on the page
        (empty when none are found).
    """
    print("URL: ", url)
    soup = download_page(url)
    review_items = soup.find_all('li', attrs={'class':'empReview'})
    if not review_items:
        # Nothing matched: dump the page so the cause can be inspected.
        print(soup)
    # Tag each parsed review with the page it came from.
    rows = [{**process_content(item), 'pageNo': pageNo} for item in review_items]
    return pd.DataFrame(rows)

In [11]:
def generate_URL(baseURL, pageNo):
    """Build the URL of review page ``pageNo`` from the company's base URL.

    Any query string on ``baseURL`` is dropped and a trailing ``.htm`` is
    removed only when it is actually present, so a base URL in a slightly
    different form is not silently truncated.

    Args:
        baseURL: Reviews landing-page URL, normally ending in ``.htm``.
        pageNo: Page number to request.

    Returns:
        str: ``<stem>_P<pageNo>.htm`` followed by the fixed sort query string.
    """
    extension = '.htm'
    # The function appends its own query string, so discard an existing one.
    stem = baseURL.split('?', 1)[0]
    if stem.endswith(extension):
        stem = stem[:-len(extension)]
    return stem + '_P' + str(pageNo) + ".htm?sort.sortType=RD&sort.ascending=false"

In [12]:
def update_stats(noRec, outputFile):
    """Print how many records were just added and how many the CSV now holds.

    Args:
        noRec: Number of records added by the latest page.
        outputFile: Path of the CSV file to count rows in.

    A missing or empty file is reported as 0 total records instead of raising;
    this happens when no page has produced any rows yet.
    """
    try:
        total = len(pd.read_csv(outputFile, encoding='utf-8'))
    except (FileNotFoundError, pd.errors.EmptyDataError):
        total = 0
    print("{} new records, {} total records".format(noRec, total))

In [13]:
def get_reviews(baseURL, startPage, endPage, outputFile):
    """Scrape a range of review pages and append the rows to a CSV file.

    Args:
        baseURL: Reviews landing-page URL that page URLs are built from.
        startPage: First page number to fetch.
        endPage: Last page number to fetch (inclusive).
        outputFile: Path of the CSV file to append to; the header row is
            written only when the file is empty.

    Pauses ``waitTime`` seconds between pages, but not after the last one.
    """
    for pageNo in range(startPage, endPage + 1):
        print('Processing Page No:', str(pageNo))
        url = generate_URL(baseURL, pageNo)
        result = scraper(url, pageNo)
        if len(result) > 0:
            # Fix the column set and order so every appended chunk lines up.
            result = pd.DataFrame(result, columns=columnNames)
            # to_csv cannot apply an encoding to an already-open text handle,
            # so UTF-8 is set on open(); newline='' leaves line endings to
            # the CSV writer (avoids doubled line breaks on Windows).
            with open(outputFile, 'a', encoding='utf-8', newline='') as f:
                # Position 0 in append mode means the file is still empty.
                result.to_csv(f, header=f.tell() == 0, index=False)
        update_stats(len(result), outputFile)
        print('Idealing')
        if pageNo != endPage: 
            time.sleep(waitTime)

## Main Function

In [None]:
# Scrape pages startPage..endPage (inclusive); adjust the range as required.
get_reviews(
    baseURL=baseURL,
    startPage=1,
    endPage=3,
    outputFile=outputFile,
)

Processing Page No: 1
URL:  https://www.glassdoor.co.in/Reviews/Cognizant-Technology-Solutions-Reviews-E8014_P1.htm?sort.sortType=RD&sort.ascending=false
9 new records, 9 total records
Idealing
