In [None]:
class SC_Scraper:
    """Selenium/Chrome scraping helper: driver setup, scrolling, CSV output, review parsing.

    Creating an instance launches Chrome through ``./chromedriver``.  The
    object can be used as a context manager; leaving the ``with`` block calls
    :meth:`close`, which shuts the browser down and closes every CSV file
    opened through :meth:`csv_writer`.
    """

    def __init__(self, debug=False, name='sc-scraper', platform=None):
        """Store the configuration and start the Chrome driver.

        Args:
            debug: when false, Chrome is started with ``--headless``.
            name: name used for the instance logger (and for the log file
                created by the optional file logger helper).
            platform: pass ``'Mobile'`` to send an iPhone user-agent string;
                any other value leaves the default user agent.
        """
        self.name = name
        self.debug = debug
        self.store_identifier = None
        self.platform = platform
        # A plain named logger, so helpers that report through self.logger
        # work.  The file-based logger built by __get_logger is not attached
        # here; it writes under 'log/'.
        self.logger = logging.getLogger(self.name)
        # File objects handed out through csv_writer(); closed by close().
        self._csv_files = []
        self.driver = self.__get_driver()
        self.location = None
        self.query = None
        self.store_url_list = []
        self.store_desc_list = []
        self.store_url_available = []
        self.store_list_url = None
        self.num_of_stores = None
        self.store_df = None
        self.MAX_WAIT = 20
        self.break_flag = False
        self.header = [
            'query', 'location', 'store_id', 'store_str', 'store_score_total',
            'id_review', 'caption', 'relative_date', 'retrieval_date', 'rating',
            'username', 'n_review_user', 'n_photo_user', 'url_user',
        ]

    def change_header(self, h):
        """Replace the header row that csv_writer() writes into new files."""
        self.header = h

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Release the browser and CSV files; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the browser and every CSV file opened via csv_writer().

        Returns:
            True once the driver calls have completed.
        """
        try:
            self.driver.close()
            self.driver.quit()
        finally:
            # Close the files even if the driver shutdown raised.
            for handle in getattr(self, '_csv_files', []):
                handle.close()
            self._csv_files = []
        return True

    def __scroll(self):
        """Scroll the scrollable section box to its bottom via JavaScript."""
        scrollable_div = self.driver.find_element_by_css_selector('div.section-layout.section-scrollbox.scrollable-y.scrollable-show')
        self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)

    def __get_logger(self):
        """Build a DEBUG-level logger that writes to ``log/<name>.log``."""
        # create logger
        logger = logging.getLogger(self.name)
        logger.setLevel(logging.DEBUG)

        # create file handler and set level to debug
        fh = logging.FileHandler('log/'+self.name+'.log')
        fh.setLevel(logging.DEBUG)
        # create formatter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # add formatter to the file handler
        fh.setFormatter(formatter)

        # add the file handler to logger
        logger.addHandler(fh)

        return logger

    def __loaded_check_by_classname(self, class_name):
        """Wait up to 15 s for an element with *class_name*; log a warning on failure."""
        try:
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, class_name)))
        except Exception:
            # Best effort: a missing element is reported, not raised.
            self.logger.warning('Failed to move to page.')
        return

    def __get_driver(self, debug=False):
        """Create the Chrome driver from ``./chromedriver``.

        The ``debug`` parameter is not read; headless mode is decided by
        ``self.debug``.
        """
        options = Options()

        if not self.debug:
            options.add_argument("--headless")

        options.add_argument("--window-size=1366,768")
        options.add_argument("--disable-notifications")
        options.add_argument("--lang=en-GB")

        if self.platform == 'Mobile':
            options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1')

        input_driver = webdriver.Chrome('./chromedriver', options=options)

        return input_driver

    # util function to clean special characters
    def __filter_string(self, str):
        """Return *str* with carriage returns, newlines and tabs replaced by spaces."""
        # The parameter name is kept as-is for interface stability.
        strOut = str.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
        return strOut

    def scroll_10(self):
        """Scroll the section box to the bottom ten times, pausing 2 s after each scroll.

        Returns:
            True when the ten scrolls are done.
        """
        for _ in range(10):
            self.__scroll()
            time.sleep(2)

        return True

    def to_url(self, url):
        """Remember *url* in ``store_list_url`` and load it in the browser."""
        self.store_list_url = url
        self.driver.get(self.store_list_url)

    def csv_writer(self, path='data/', outfile='gm_reviews.csv'):
        """Open ``path + outfile`` for writing and return a csv writer for it.

        The file is truncated, the current ``self.header`` is written as the
        first row, and the file object is remembered so close() can close it.

        Args:
            path: prefix joined to *outfile* by plain string concatenation,
                so a directory needs its trailing separator.
            outfile: file name appended to *path*.

        Returns:
            A ``csv.writer`` bound to the opened file.

        Raises:
            OSError: if the directory cannot be created or the file cannot be
                opened.
        """
        target = path + outfile
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Line buffering hands every finished row to the OS right away, so a
        # long scrape does not lose buffered rows if close() is never reached.
        targetfile = open(target, mode='w', encoding='utf-8', newline='\n', buffering=1)

        open_files = getattr(self, '_csv_files', None)
        if open_files is None:
            open_files = self._csv_files = []
        open_files.append(targetfile)

        writer = csv.writer(targetfile, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(self.header)
        return writer

    def __sc_parse(self, review):
        """Extract one review into a dict.

        Args:
            review: node offering ``find``/``find_all`` and item access for
                attributes (presumably a BeautifulSoup tag -- confirm with
                the caller).

        Returns:
            dict with the keys id_review, caption, relative_date,
            retrieval_date, rating, username, n_review_user, n_photo_user
            and url_user.  ``caption`` is None when the review has no text
            node; the two counters fall back to 0 when the subtitle cannot
            be parsed.
        """
        item = {}
        id_review = review.find('button', class_='section-review-action-menu')['data-review-id']
        username = review.find('div', class_='section-review-title').find('span').text

        try:
            tmp_text = review.find('span', class_='section-review-text').text
            review_text = self.__filter_string(tmp_text)
        except Exception:
            # No text node (or unexpected markup): keep the review, drop the caption.
            review_text = None

        rating = float(review.find('span', class_='section-review-stars')['aria-label'].split(' ')[1])
        relative_date = review.find('span', class_='section-review-publish-date').text

        try:
            n_reviews_photos = review.find('div', class_='section-review-subtitle').find_all('span')[1].text
            # NOTE(review): this separator is the three characters U+00E3,
            # U+0083, U+00BB -- it looks like the UTF-8 bytes of a middle dot
            # carried over from a byte string; confirm against the real
            # subtitle text before changing it.
            metadata = n_reviews_photos.split('\xe3\x83\xbb')
            if len(metadata) == 3:
                n_photos = int(metadata[2].split(' ')[0].replace('.', ''))
            else:
                n_photos = 0

            idx = len(metadata)
            n_reviews = int(metadata[idx - 1].split(' ')[0].replace('.', ''))

        except Exception:
            n_reviews = 0
            n_photos = 0

        user_url = review.find('a')['href']

        item['id_review'] = id_review
        item['caption'] = review_text

        # depends on language, which depends on geolocation defined by Google Maps
        # custom mapping to transform into date should be implemented
        item['relative_date'] = relative_date

        # store datetime of scraping and apply further processing to calculate
        # correct date as retrieval_date - time(relative_date)
        item['retrieval_date'] = datetime.now()
        item['rating'] = rating
        item['username'] = username
        item['n_review_user'] = n_reviews
        item['n_photo_user'] = n_photos
        item['url_user'] = user_url

        return item
 
      

import os

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
import logging
import traceback
import pandas as pd
import csv

# Module-level settings.  None of these names is referenced in the visible
# part of this notebook (SC_Scraper keeps its own MAX_WAIT attribute).
GM_WEBPAGE = 'https://www.google.com/maps/'  # Google Maps URL
MAX_WAIT = 10  # presumably a wait limit in seconds -- TODO confirm
MAX_RETRY = 5  # presumably a retry cap -- TODO confirm
MAX_SCROLLS = 40  # presumably a scroll cap -- TODO confirm



In [2]:
def scroll():
    """Send END to the page's <html> element in two bursts of 25 key presses.

    Uses the module-level ``sc_scraper`` driver.  The element is looked up
    again before every key press, and the two bursts are separated by a
    two-second pause.
    """
    for burst in range(2):
        if burst:
            # Pause only between the bursts, not after the last one.
            time.sleep(2)
        for _ in range(25):
            sc_scraper.driver.find_element_by_tag_name('html').send_keys(Keys.END)


In [3]:

# Start the scraper; debug=False makes SC_Scraper launch Chrome headless.
sc_scraper = SC_Scraper(debug=False, platform=None)

In [4]:

# Load the Bilibili episode page, then pause for a fixed 20 seconds
# (presumably to let the page finish rendering -- confirm).
sc_scraper.to_url("https://www.bilibili.com/bangumi/play/ep302023")
time.sleep(20)

In [5]:
# Locate the <ul> inside the element with id "eplist_module"
# (presumably the episode list -- confirm against the page markup).
episode_block = sc_scraper.driver.find_element_by_xpath('//*[@id="eplist_module"]/div[2]/ul')

In [6]:
# Direct children of that <ul>, one WebElement per child.
single_episode_list = episode_block.find_elements_by_xpath("./*")

In [7]:
# Show how many child elements were found (the notebook displays the value).
len(single_episode_list)

15

In [8]:
# Swap the scraper's default header for the columns of the comment CSV files.
sc_scraper.change_header(['user_id','user_homepage','level','timestamp','text','like','episode'])

In [9]:
def _comment_row(comment, episode_idx):
    """Build one CSV row from a single comment element.

    The user link and level badge are optional and fall back to the string
    'None'.  The comment text, timestamp and like count are required: if one
    of them is missing the lookup raises IndexError and the crawl stops,
    exactly as the inline version did.
    """
    text = comment.find_elements_by_class_name("text")[0].text

    # Look the user element up once; every lookup is a WebDriver round trip.
    try:
        user = comment.find_elements_by_class_name("user")[0]
    except Exception:
        user = None

    user_homepage = 'None'
    level = 'None'
    if user is not None:
        # `except Exception` (not a bare except) keeps Ctrl-C working
        # during a long crawl.
        try:
            user_homepage = user.find_elements_by_xpath("./*")[0].get_attribute('href')
        except Exception:
            user_homepage = 'None'
        try:
            level = user.find_elements_by_class_name("level-link")[0].find_elements_by_xpath("./*")[0].get_attribute('class')
        except Exception:
            level = 'None'

    # The last path segment of the homepage URL is used as the user id.
    if isinstance(user_homepage, str) and user_homepage != 'None':
        user_id = user_homepage.split("/")[-1]
    else:
        user_id = 'None'

    info = comment.find_elements_by_class_name("info")[0]
    timestamp = info.find_elements_by_class_name("time")[0].text
    like = info.find_elements_by_class_name("like")[0].text

    return [user_id, user_homepage, level, timestamp, text, like, episode_idx]


# Crawl the comments of episodes 4..8: open each episode, read the number of
# comment pages, then walk the pages and write one CSV file per episode.
for episode_idx in range(4, 9):
    print("________________")
    print(episode_idx)

    single_episode_list[episode_idx].click()
    time.sleep(3)
    scroll()
    time.sleep(10)
    # The pager label wraps the page count in the characters "共" and "页".
    pager_text = sc_scraper.driver.find_element_by_xpath('//*[@id="comment_module"]/div[2]/div/div[2]/div[2]/span[1]').text
    page_number = int(pager_text.replace("共", '').replace("页", ''))

    # The file is opened here rather than through sc_scraper.csv_writer so
    # that the `with` block flushes and closes it when the episode is done
    # (or when an error interrupts the crawl).
    os.makedirs('data', exist_ok=True)
    with open(f'data/v0_episode_{episode_idx}.csv', mode='w', encoding='utf-8', newline='\n') as episode_file:
        comment_writer = csv.writer(episode_file, quoting=csv.QUOTE_MINIMAL)
        comment_writer.writerow(sc_scraper.header)

        for tmp_page in range(page_number):
            print(f"{tmp_page+1}/{page_number}")
            scroll()
            # Jump to the page by typing its number into the pager input.
            page_inbox = sc_scraper.driver.find_element_by_class_name('page-jump').find_element_by_xpath('./input')
            page_inbox.clear()
            page_inbox.send_keys(str(tmp_page + 1))
            page_inbox.send_keys(Keys.RETURN)
            scroll()
            time.sleep(1)

            comment_block = sc_scraper.driver.find_element_by_class_name('comment-list').find_elements_by_xpath("./*")
            for tmp_comment in comment_block:
                comment_writer.writerow(_comment_row(tmp_comment, episode_idx))
    

________________
4
1/84
2/84
3/84
4/84
5/84
6/84
7/84
8/84
9/84
10/84
11/84
12/84
13/84
14/84
15/84
16/84
17/84
18/84
19/84
20/84
21/84
22/84
23/84
24/84
25/84
26/84
27/84
28/84
29/84
30/84
31/84
32/84
33/84
34/84
35/84
36/84
37/84
38/84
39/84
40/84
41/84
42/84
43/84
44/84
45/84
46/84
47/84
48/84
49/84
50/84
51/84
52/84
53/84
54/84
55/84
56/84
57/84
58/84
59/84
60/84
61/84
62/84
63/84
64/84
65/84
66/84
67/84
68/84
69/84
70/84
71/84
72/84
73/84
74/84
75/84
76/84
77/84
78/84
79/84
80/84
81/84
82/84
83/84
84/84
________________
5
1/96
2/96
3/96
4/96
5/96
6/96
7/96
8/96
9/96
10/96
11/96
12/96
13/96
14/96
15/96
16/96
17/96
18/96
19/96
20/96
21/96
22/96
23/96
24/96
25/96
26/96
27/96
28/96
29/96
30/96
31/96
32/96
33/96
34/96
35/96
36/96
37/96
38/96
39/96
40/96
41/96
42/96
43/96
44/96
45/96
46/96
47/96
48/96
49/96
50/96
51/96
52/96
53/96
54/96
55/96
56/96
57/96
58/96
59/96
60/96
61/96
62/96
63/96
64/96
65/96
66/96
67/96
68/96
69/96
70/96
71/96
72/96
73/96
74/96
75/96
76/96
77/96
78/96
79/96
80