# Data Collection

Data collection is done by scraping the Trustpilot website to get feedback data.

In [1]:
import re
from bs4 import BeautifulSoup
import mechanize
import http.cookiejar as cookielib
import json
import pandas as pd
import time
from pandas.io.json import json_normalize

# Single cookie jar handed to every mechanize browser created by setup_soup_page.
cookiejar = cookielib.LWPCookieJar()

First, identify the Trustpilot review URL for each company.

In [2]:
# Every review page lives under the same Trustpilot UK prefix; only the
# company's domain segment differs.
_TRUSTPILOT_REVIEW_BASE = 'https://uk.trustpilot.com/review/'

# (company key, domain segment of its Trustpilot review page), in scrape order.
_competitor_domains = [
    ('Avado', 'avadolearning.com'),
    ('ICS_Learn', 'icslearn.co.uk'),
    ('DPG_plc', 'www.dpgplc.co.uk'),
    ('QA_ltd', 'qa.com'),
    ('Whitehat_Jr', 'whitehatjr.com'),
    ('Firebrand_Training', 'firebrand.training'),
    ('Udemy', 'udemy.com'),
    ('Baltic_Training', 'balticapprenticeships.com'),
    ('BPP', 'www.bpp.com'),
    ('Coursera', 'coursera.org'),
    ('Degreed', 'degreed.com'),
    ('Deloitte', 'deloitte.com'),
    ('edx', 'www.edx.org'),
    ('FutureLearn', 'www.futurelearn.com'),
    ('General_Assembly', 'generalassemb.ly'),
    ('LearnDirect', 'www.learndirect.com'),
    ('Linkedin_Learning', 'uk.linkedin.com'),
    ('PluralSight', 'pluralsight.com'),
    ('Udacity', 'udacity.com'),
]

# Maps each company key to the full URL of its first Trustpilot review page.
competitor_url = {
    company: _TRUSTPILOT_REVIEW_BASE + domain
    for company, domain in _competitor_domains
}

Set up the scraping helper functions.

In [3]:
def setup_soup_page(url):
    """Open ``url`` in a new mechanize browser and parse the returned page.

    The browser is given the module-level ``cookiejar`` and has its
    robots.txt handling switched off before the request is made.

    Args:
        url: Address of the page to load.

    Returns:
        tuple: ``(browser, soup)`` - the browser after opening ``url`` and the
        BeautifulSoup tree built from the response body with ``html.parser``.
    """
    browser = mechanize.Browser()
    browser.set_cookiejar(cookiejar)
    browser.set_handle_robots(False)
    browser.open(url)

    page_html = browser.response().read()
    return browser, BeautifulSoup(page_html, 'html.parser')
 
def get_reviews_from_page(soup):
    """Extract the review records embedded in a parsed Trustpilot page.

    The page is searched for the first ``<script>`` element whose
    ``data-business-unit-json-ld`` attribute is an empty string and which has
    text content. That text is parsed as JSON and the ``'review'`` entry of
    its first element is returned.

    Args:
        soup: Parsed page exposing ``find_all("script")``; each script must
            offer ``.get(attr)`` and ``.string`` (as BeautifulSoup tags do).

    Returns:
        The value stored under ``'review'`` in the first JSON element, or an
        empty list when the page has no such script, the script is empty, or
        the JSON holds no ``'review'`` entry. Returning ``[]`` lets callers
        ``extend`` their results without special-casing review-less pages.

    Raises:
        json.JSONDecodeError: If the script text is not valid JSON.
    """
    review_script_json = None

    for script in soup.find_all("script"):
        # The attribute must be present with an empty value; ``.string`` is
        # checked as well because it is None for a script without plain text.
        if script.get('data-business-unit-json-ld') == "" and script.string:
            review_script_json = script.string.strip()
            break

    # No usable script: previously this fell through to an unbound local.
    if not review_script_json:
        return []

    review_data = json.loads(review_script_json)
    if not review_data:
        return []
    return review_data[0].get('review', [])

def open_next_page(br):
    """Follow the link labelled 'Next page' on the browser's current page.

    Args:
        br: Browser object providing ``links()`` and ``follow_link(link)``.

    Returns:
        tuple | None: ``(br, soup)`` where ``soup`` is the BeautifulSoup tree
        of the page reached through the first 'Next page' link, or ``None``
        when the current page has no link with that exact text.
    """
    next_page_links = [link for link in br.links() if link.text == 'Next page']

    # No pagination link means the last page has been reached.
    if not next_page_links:
        return None

    page_html = br.follow_link(next_page_links[0]).read()
    return br, BeautifulSoup(page_html, 'html.parser')

For every company, scrape each review page in turn, then save the collected reviews to one CSV file per company.

In [4]:
# Scrape every review page of every company and write one CSV per company.
for key in competitor_url:
    # Review records gathered for this company across all of its pages.
    data = []
    br, soup = setup_soup_page(competitor_url[key])

    # A single loop handles the first page and every following page, so a
    # company with only one page still reaches the CSV export below.
    page_number = 1
    while True:
        data.extend(get_reviews_from_page(soup))
        print('{} Reviews | Page number {}'.format(key, page_number))

        # Pause for 5 seconds after every 20th page before requesting more.
        if page_number % 20 == 0:
            time.sleep(5)
        page_number += 1

        try:
            next_page = open_next_page(br)
        except Exception as err:
            # Best effort: a failed request ends pagination for this company,
            # but the pages already collected are kept and the cause is shown
            # instead of being reported as a normal end of pages.
            print('Stopped {} before page {}: {!r}'.format(key, page_number, err))
            break

        # open_next_page returns None when the page has no 'Next page' link.
        if next_page is None:
            print('No more pages')
            break
        br, soup = next_page

    if not data:
        print('No data:', key)
        continue

    df = pd.DataFrame(data)

    # These columns hold nested records; each is flattened into its own set of
    # '<column>_<field>' columns and the nested original is dropped.
    drop_columns = ['author', 'itemReviewed', 'publisher', 'reviewRating']
    df_dropped = df.drop(columns=drop_columns)

    # NOTE(review): json_normalize is imported from pandas.io.json, a location
    # pandas has deprecated in favour of pd.json_normalize - consider switching.
    df_normalize = [
        json_normalize(df[column]).add_prefix(column + '_')
        for column in drop_columns
    ]

    df = pd.concat([df_dropped] + df_normalize, axis=1).drop(
        columns=['reviewRating_worstRating', 'reviewRating_bestRating']
    )
    df.to_csv('../Dataset/competitors/trustpilot_reviews_{}.csv'.format(key))
    print(key)

Avado Reviews | Page number 1
Avado Reviews | Page number 2
Avado Reviews | Page number 3
Avado Reviews | Page number 4
Avado Reviews | Page number 5
Avado Reviews | Page number 6
Avado Reviews | Page number 7
Avado Reviews | Page number 8
Avado Reviews | Page number 9
Avado Reviews | Page number 10
Avado Reviews | Page number 11
Avado Reviews | Page number 12
Avado Reviews | Page number 13
Avado Reviews | Page number 14
Avado Reviews | Page number 15
Avado Reviews | Page number 16
Avado Reviews | Page number 17
Avado Reviews | Page number 18
Avado Reviews | Page number 19
Avado Reviews | Page number 20
Avado Reviews | Page number 21
Avado Reviews | Page number 22
Avado Reviews | Page number 23
Avado Reviews | Page number 24
Avado Reviews | Page number 25
Avado Reviews | Page number 26
Avado Reviews | Page number 27
Avado Reviews | Page number 28
Avado Reviews | Page number 29
Avado Reviews | Page number 30
Avado Reviews | Page number 31
Avado Reviews | Page number 32
Avado Reviews | P

ICS_Learn Reviews | Page number 140
ICS_Learn Reviews | Page number 141
ICS_Learn Reviews | Page number 142
ICS_Learn Reviews | Page number 143
ICS_Learn Reviews | Page number 144
ICS_Learn Reviews | Page number 145
ICS_Learn Reviews | Page number 146
ICS_Learn Reviews | Page number 147
ICS_Learn Reviews | Page number 148
ICS_Learn Reviews | Page number 149
ICS_Learn Reviews | Page number 150
ICS_Learn Reviews | Page number 151
ICS_Learn Reviews | Page number 152
ICS_Learn Reviews | Page number 153
ICS_Learn Reviews | Page number 154
ICS_Learn Reviews | Page number 155
ICS_Learn Reviews | Page number 156
ICS_Learn Reviews | Page number 157
ICS_Learn Reviews | Page number 158
ICS_Learn Reviews | Page number 159
ICS_Learn Reviews | Page number 160
ICS_Learn Reviews | Page number 161
ICS_Learn Reviews | Page number 162
ICS_Learn Reviews | Page number 163
ICS_Learn Reviews | Page number 164
ICS_Learn Reviews | Page number 165
ICS_Learn Reviews | Page number 166
ICS_Learn Reviews | Page num

Whitehat_Jr Reviews | Page number 25
Whitehat_Jr Reviews | Page number 26
Whitehat_Jr Reviews | Page number 27
Whitehat_Jr Reviews | Page number 28
Whitehat_Jr Reviews | Page number 29
Whitehat_Jr Reviews | Page number 30
Whitehat_Jr Reviews | Page number 31
Whitehat_Jr Reviews | Page number 32
Whitehat_Jr Reviews | Page number 33
Whitehat_Jr Reviews | Page number 34
Whitehat_Jr Reviews | Page number 35
Whitehat_Jr Reviews | Page number 36
Whitehat_Jr Reviews | Page number 37
Whitehat_Jr Reviews | Page number 38
Whitehat_Jr Reviews | Page number 39
Whitehat_Jr Reviews | Page number 40
Whitehat_Jr Reviews | Page number 41
Whitehat_Jr Reviews | Page number 42
Whitehat_Jr Reviews | Page number 43
Whitehat_Jr Reviews | Page number 44
Whitehat_Jr Reviews | Page number 45
Whitehat_Jr Reviews | Page number 46
Whitehat_Jr Reviews | Page number 47
Whitehat_Jr Reviews | Page number 48
Whitehat_Jr Reviews | Page number 49
Whitehat_Jr Reviews | Page number 50
Whitehat_Jr Reviews | Page number 51
W

edx Reviews | Page number 25
edx Reviews | Page number 26
edx Reviews | Page number 27
edx Reviews | Page number 28
edx Reviews | Page number 29
edx Reviews | Page number 30
edx Reviews | Page number 31
edx Reviews | Page number 32
edx Reviews | Page number 33
edx Reviews | Page number 34
edx Reviews | Page number 35
edx Reviews | Page number 36
edx Reviews | Page number 37
edx Reviews | Page number 38
edx Reviews | Page number 39
edx Reviews | Page number 40
edx Reviews | Page number 41
edx Reviews | Page number 42
edx Reviews | Page number 43
edx Reviews | Page number 44
edx Reviews | Page number 45
edx Reviews | Page number 46
edx Reviews | Page number 47
edx Reviews | Page number 48
edx Reviews | Page number 49
edx Reviews | Page number 50
edx Reviews | Page number 51
edx Reviews | Page number 52
edx Reviews | Page number 53
edx Reviews | Page number 54
edx Reviews | Page number 55
edx Reviews | Page number 56
edx Reviews | Page number 57
edx Reviews | Page number 58
edx Reviews | 

PluralSight Reviews | Page number 7
PluralSight Reviews | Page number 8
PluralSight Reviews | Page number 9
PluralSight Reviews | Page number 10
PluralSight Reviews | Page number 11
PluralSight Reviews | Page number 12
PluralSight Reviews | Page number 13
PluralSight Reviews | Page number 14
PluralSight Reviews | Page number 15
PluralSight Reviews | Page number 16
PluralSight Reviews | Page number 17
PluralSight Reviews | Page number 18
PluralSight Reviews | Page number 19
PluralSight Reviews | Page number 20
PluralSight Reviews | Page number 21
PluralSight Reviews | Page number 22
PluralSight Reviews | Page number 23
PluralSight Reviews | Page number 24
PluralSight Reviews | Page number 25
PluralSight Reviews | Page number 26
PluralSight Reviews | Page number 27
PluralSight Reviews | Page number 28
PluralSight Reviews | Page number 29
PluralSight Reviews | Page number 30
PluralSight Reviews | Page number 31
PluralSight Reviews | Page number 32
PluralSight Reviews | Page number 33
Plur

Feedback for all companies has been scraped and saved as individual CSV files.