In [1]:
import os, json, requests, pickle
import time
import pandas as pd
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from threading import Thread
from queue import Queue

from urllib.parse import parse_qs, urlparse

os.environ['chromedriver'] = './chromedriver.exe'

options = Options()
options.add_argument("--headless")
options.binary_location = 'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'

In [2]:
class FindCityThread(Thread):
    
    def __init__(self, work_queue, base_url, city_dict, max_time_out=120, verbose=0):
        Thread.__init__(self)
        self.work_queue = work_queue
        self.city_id_not_found = -1
        self.base_url = base_url
        self.city_dict = city_dict
        self.max_time_out = max_time_out
        self.city_id_not_found = -1
        self.verbose = verbose
    
    def custom_log(self, string):
        if self.verbose == 1:
            print(string)
    
    def get_city_id(self, city_name):
        driver = webdriver.Chrome(executable_path=os.path.abspath('chromedriver'), chrome_options=options)

        input_selector = (By.CSS_SELECTOR, 'input[data-selenium="textInput"]')
        btn_selector = (By.CSS_SELECTOR, 'button[data-selenium="searchButton"]')

        self.custom_log('Searching %s\'s ID ...' % city_name)
        driver.get(self.base_url)

        wait = WebDriverWait(driver, self.max_time_out)

        self.custom_log('Wait until input presence ...')
        text_input = wait.until(EC.presence_of_element_located(input_selector))
        self.custom_log('Type %s in input tag ...' % city_name)
        text_input.clear()
        text_input.send_keys(city_name)

        self.custom_log('Wait until button presence ...')
        btn = wait.until(EC.element_to_be_clickable(btn_selector))
        self.custom_log('Button click ...')
        btn.click()

        self.custom_log('Wait until url changes ...')
        wait.until(EC.url_changes(self.base_url))
        parsed_url = urlparse(driver.current_url)
        query = parse_qs(parsed_url.query)
        driver.close()
        try:
            self.custom_log(query['city_id'])
            return query['city_id'][0]
        except KeyError as e:
            print(e)
            return self.city_id_not_found
        
        
    def run(self):
        while not self.work_queue.empty():
            w = self.work_queue.get()
            city_id = self.get_city_id(city_name=w)
            try:
                if city_id != self.city_id_not_found:
                    self.city_dict[w] = city_id
                    print(self.work_queue.qsize())
                else:
                    self.work_queue.put(w)
            except(WebDriverException, TimeoutError) as error:
                print(error)
            
            

In [3]:
class CityRepository:
    
    def __init__(self, country, max_thread=4, max_time_out=120):
        self.city_dict = {}
        self.city_id_not_found = -1
        self.country = country
        # In seconds
        self.max_time_out = max_time_out
        self.max_thread = max_thread
        self.base_url = 'https://www.agoda.com/th-th/'
    
    def run(self):
        self.fill_city_names()
        work_queue = Queue()
        for k in self.city_dict: work_queue.put(k)
        print(work_queue.qsize())
        threads = [ FindCityThread(work_queue, self.base_url, self.city_dict) for _ in range(self.max_thread) ]
        for t in threads: t.start()
        for t in threads: t.join()
        self.save('city_dict')
        return self.city_dict
    
    def save(self, filename='city_dict'):
        output_file = open(filename, 'wb')
        pickle.dump(self.city_dict, output_file)
        output_file.close()
    
    def load(self, filename='city_dict'):
        input_file = open(filename, 'rb')
        city_dict = pickle.load(input_file)
        input_file.close()
        return self.city_dict
    
    def fill_city_names(self):
        city_names = self.get_city_names_in_country(self.country) 
        init_city_ids = [ -1 for _ in range(len(city_names))]
        self.city_dict = dict(zip(city_names, init_city_ids))
    
    def get_city_names_in_country(self, country='thailand'):
        driver = webdriver.Chrome(executable_path=os.path.abspath('chromedriver'), chrome_options=options)
        driver.get(f'{self.base_url}/country/{country}.html')

#         city_link_doms = driver.find_elements(By.CSS_SELECTOR, 'a[data-selenium="city-link"]')
        city_name_doms = driver.find_elements(By.CSS_SELECTOR, 'dt[data-selenium="city-name"]')   

        city_names = [ c.text.replace('จังหวัด', '') for c in city_name_doms ]    
        driver.close()
        return city_names
    
    def fill_city_ids(self):
        while self.filled_count != len(self.city_dict):
            for city_name in self.city_dict:
                if self.city_dict[city_name] == self.city_id_not_found:
                    try:
                        self.city_dict[city_name] = self.get_city_id(city_name=city_name)
                        self.filled_count += 1
                    except WebDriverException as error:
                        print(error)
                        self.city_dict[city_name] = self.get_city_id(city_name=city_name)
                        self.filled_count += 1
                else:
                    continue
                    
    def get_city_id(self, city_name):
        driver = webdriver.Chrome(executable_path=os.path.abspath('chromedriver'), chrome_options=options)
        
        input_selector = (By.CSS_SELECTOR, 'input[data-selenium="textInput"]')
        btn_selector = (By.CSS_SELECTOR, 'button[data-selenium="searchButton"]')

        print('Searching %s\'s ID ...' % city_name)
        driver.get(self.base_url)

        wait = WebDriverWait(driver, self.max_time_out)

        print('Wait until input presence ...')
        text_input = wait.until(EC.presence_of_element_located(input_selector))
        print('Type %s in input tag ...' % city_name)
        text_input.clear()
        text_input.send_keys(city_name)

        print('Wait until button presence ...')
        btn = wait.until(EC.element_to_be_clickable(btn_selector))
        print('Button click ...')
        btn.click()

        print('Wait until url changes ...')
        wait.until(EC.url_changes(self.base_url))
        parsed_url = urlparse(driver.current_url)
        query = parse_qs(parsed_url.query)
        driver.close()
        try:
            print(query['city_id'][0])
            return query['city_id'][0]
        except KeyError as e:
            print(e)
            return self.city_id_not_found
            

In [4]:
city_repo = CityRepository(country='thailand')
try:
    city_repo.load('city_dict')
except Exception as error:
    print(error)
    city_repo.run()

[Errno 2] No such file or directory: 'city_dict'
77
73
72
71
70
69
68
67
'city_id'
'city_id'
66


Exception in thread Thread-8:
Traceback (most recent call last):
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-fb5a6f965b0b>", line 55, in run
    city_id = self.get_city_id(city_name=w)
  File "<ipython-input-2-fb5a6f965b0b>", line 40, in get_city_id
    wait.until(EC.url_changes(self.base_url))
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message: 




65
64
'city_id'
'city_id'
'city_id'
63
'city_id'
62
'city_id'
'city_id'
61
'city_id'
'city_id'
'city_id'
'city_id'
'city_id'
'city_id'
60
59
58
57


Exception in thread Thread-6:
Traceback (most recent call last):
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-fb5a6f965b0b>", line 55, in run
    city_id = self.get_city_id(city_name=w)
  File "<ipython-input-2-fb5a6f965b0b>", line 37, in get_city_id
    btn.click()
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\remote\webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute
    return self._parent.execute(command, params)
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
    self.error_handler.check_response(response)
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-pa

56
'city_id'
55
54
53
52
51
50
49
48
'city_id'
'city_id'
'city_id'
47
'city_id'
46
'city_id'
'city_id'
45
44
43
42
41
40
39
38
37
36
35
34
33
32
31
30
29
28
'city_id'
27
26
'city_id'
25
24
23
22
21


Exception in thread Thread-9:
Traceback (most recent call last):
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-2-fb5a6f965b0b>", line 55, in run
    city_id = self.get_city_id(city_name=w)
  File "<ipython-input-2-fb5a6f965b0b>", line 37, in get_city_id
    btn.click()
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\remote\webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute
    return self._parent.execute(command, params)
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
    self.error_handler.check_response(response)
  File "C:\Users\u6069627\AppData\Local\Continuum\anaconda3\lib\site-pa

In [8]:
city_repo.city_dict

{}

In [None]:
city_dict = city_repo.city_dict
out_file_city_dict = open('city_dict', 'wb')
pickle.dump(city_dict, out_file_city_dict)
out_file_city_dict.close()

In [None]:
in_file_city_dict = open('city_dict', 'rb')
city_dict = pickle.load(in_file_city_dict)
in_file_city_dict.close()
city_dict

In [None]:
def get_hotel_ids(city_id=16056):
    
    headers = {
        'origin': 'https://www.agoda.com',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        'content-type': 'application/json; charset=UTF-8',
        'accept': '*/*',
        'referer': 'https://www.agoda.com/th-th/pages/agoda/default/DestinationSearchResult.aspx?asq=u2qcKLxwzRU5NDuxJ0kOF3T91go8JoYYMxAgy8FkBH1BN0lGAtYH25sdXoy34qb9cZRkSjfcPhFQAJ9M0k06kxNsx%2BZRBEZTqOziqEyzq5Po2NtcoTIf2N5Oe6Npt0h0bd02buIKMOAijAOG9EwcI9X%2BLTLQpFDnBugEEhZpLmJfX3ZNjMlF6HPYMw9SHQ4lBo9GW1pMRHqhXI1ruIdBF%2BL2AUnfOhFRTEDVteJxPyI%3D&city=16056&tick=636636353591&txtuuid=eda1019d-863c-4647-b01a-4914f6acaeb3&languageId=22&userId=70054530-b60b-4336-8ee2-f4023ec04bfd&pageTypeId=1&origin=TH&locale=th-TH&cid=-1&aid=130243&currencyCode=THB&htmlLanguage=th-th&cultureInfoName=th-TH&ckuid=70054530-b60b-4336-8ee2-f4023ec04bfd&prid=0&checkIn=2018-06-12&checkOut=2018-06-13&rooms=1&adults=2&children=0&priceCur=THB&los=1&textToSearch=%E0%B8%A0%E0%B8%B9%E0%B9%80%E0%B8%81%E0%B9%87%E0%B8%95&productType=-1&sort=agodaRecommended',
        'authority': 'www.agoda.com',
        'x-requested-with': 'XMLHttpRequest',
        'dnt': '1',
    }

    data = {
#         "SearchMessageID":"db6d6e37-c257-4283-80ed-36c426a31482",
        "IsPollDmc": 'false',
        "SearchType":1,
        "ObjectID":0,
        "HashId": 'null',
        "Filters":{
            "ProductType":[-1],
            "HotelName":"",
            "PriceRange":{
                "Min":0,
                "Max":0,
                "IsHavePriceFilterQueryParamter": 'false'
            }
        },
        "SelectedColumnTypes":{
            "ProductType":[-1]
        },
        "IsUseSelectedColumnType": 'true',
        "RateplanIDs": 'null',
#         "TotalHotels": 2152,
        "PlatformID": 1001,
        "CurrentDate":"2018-06-03T15:59:04.5270637+07:00",
#         "SearchID":991110603155904500,
        "CityId": city_id,
#         "Latitude":0,
#         "Longitude":0,
        "Radius":0,
        "RectangleSearchParams": 'null',
        "PageNumber": 1,
        "PageSize":45,
        "SortOrder":1,
        "SortField":0,
        "PointsMaxProgramId":0,
        "PollTimes":0,
        "MaxPollTimes":4,
#         "CityName":"\\u0e20\\u0e39\\u0e40\\u0e01\\u0e47\\u0e15",
#         "ObjectName":"\\u0e20\\u0e39\\u0e40\\u0e01\\u0e47\\u0e15",
        "AddressName": 'null',
        "CountryName":"Thailand",
        "CountryId": 106,
        "IsAllowYesterdaySearch": 'false',
        "CultureInfo":"th-TH",
        "UnavailableHotelId":0,
        "IsEnableAPS": 'false',
        "SelectedHotelId": 0,
        "IsComparisonMode": 'false',
        "HasFilter": 'false',
#         "LandingParameters":{
#             "HeaderBannerUrl": 'null',
#             "FooterBannerUrl": 'null',
#             "SelectedHotelId":0,
#             "LandingCityID":16056
#         },
        "NewSSRSearchType":0,
        "IsWysiwyp": 'false',
        "RequestPriceView": 'null',
        "FinalPriceView": 'null',
        "MapType":1,
        "IsShowMobileAppPrice": 'false',
        "IsApsPeek": 'false',
        "IsRetailPeek": 'false',
        "IsRetina": 'false',
        "IsCriteriaDatesChanged": 'false',
#         "TotalHotelsFormatted":"2,152",
        "PreviewRoomFinalPrice": 'null',
        "ReferrerUrl": 'null',
#         "CountryEnglishName":"Thailand",
#         "CityEnglishName":"Phuket",
        "Cid":1,
        "ProductType":-1,
        "FreeStayingChildrenOnly": 'false',
        "ShouldHideSoldOutProperty":'false',
        "isAgMse": 'false',
        "ccallout": 'false',
        "Adults":2,
        "Children":0,
        "Rooms":1,
        "CheckIn":"2018-06-12T00:00:00",
        "LengthOfStay": 1,
        "ChildAges": [],
        "DefaultChildAge":8,
        "ChildAgesStr": 'null',
        "CheckOut":"2018-06-13T00:00:00",
#         "Text":"\\u0e20\\u0e39\\u0e40\\u0e01\\u0e47\\u0e15",
        "IsDateless": 'false',
        "CheckboxType":0,
    }
    
    url = 'https://www.agoda.com/api/th-th/Main/GetSearchResultList'
    
    response = requests.post(url, headers=headers, data=str(data))
    total_page = response.json()['TotalPage']
#     results = [ r for r in json_result['ResultList'] ]
    results = []
    for i in range(1, total_page + 1):
        time.sleep(1)
        data['PageNumber'] = i
        res = requests.post('https://www.agoda.com/api/th-th/Main/GetSearchResultList', headers=headers, data=str(data))
        for i in res.json()['ResultList']: results.append(i)
        
    hotel_ids = [ r['HotelID'] for r in results ]
    eng_hotel_names = [ r['EnglishHotelName'] for r in results ]
    translated_hotel_names = [ r['TranslatedHotelName'] for r in results ]
    hotel_display_names = [ r['HotelDisplayName'] for r in results ]
    
    return (hotel_ids, eng_hotel_names, translated_hotel_names, hotel_display_names)


In [None]:
(hotel_ids, eng_hotel_names, translated_hotel_names, hotel_display_names) = get_hotel_ids()
'Total Hotel ID: %d' % len(hotel_ids)

In [None]:
eng_hotel_names[:5]

In [None]:
def get_reviews_count(hotel_id=1280163):
    url = 'https://www.agoda.com/NewSite/th-th/Review/HotelReviews'
    headers = {
        'origin': 'https://www.agoda.com',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        'content-type': 'application/json; charset=UTF-8',
        'accept': 'application/json',
        'referer': 'https://www.agoda.com/th-th/renaissance-phuket-resort-spa-a-marriott-luxury-lifestyle-hotel/hotel/phuket-th.html?checkin=2018-06-12&los=1&adults=2&rooms=1&cid=-1&searchrequestid=faf88ea5-6a98-462c-a029-1d10c3181815&isMRS=0&tabbed=true',
        'authority': 'www.agoda.com',
        'x-requested-with': 'XMLHttpRequest',
        'dnt': '1',
    }
    data = {
        "hotelId": hotel_id,
        "hotelProviderId": 332,
        "demographicId": 0,
        "pageNo": 1,
        "pageSize": 20,
        "sorting": 1,
        "reviewProviderIds":[332,3038,27901,28999],
        "isReviewPage": 'false',
        "isCrawlablePage": 'true',
        "paginationSize": 5,
    }
    
    response = requests.post(url, headers=headers, data=str(data))
    from_agoda = 0
    from_booking = 1
    review_count = response.json()['reviewTabs']['reviewTabs'][from_agoda]['reviewCount']
    return review_count

In [None]:
get_reviews_count(hotel_ids[2])

In [None]:
def get_review_comments(hotel_id=1280163):
    url = 'https://www.agoda.com/NewSite/th-th/Review/ReviewComments'
    headers = {
        'origin': 'https://www.agoda.com',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
        'content-type': 'application/json; charset=UTF-8',
        'accept': 'application/json',
        'referer': 'www.agoda.com',
        'authority': 'www.agoda.com',
        'x-requested-with': 'XMLHttpRequest',
        'dnt': '1',
    }
    
    data = {
        'hotelId': hotel_id,
        'demographicId': 0,
        'page': 1,
        'pageSize':20,
        'sorting':1,
        'isReviewPage': 'false',
        'isCrawlablePage': 'true',
        'filters':{'language':[],'room':[]},
        'searchKeyword':'',
    }
    
    comments = []
    
    reviews_per_page = 40
    reviews_count = get_reviews_count(hotel_id)
    total_review_page = int(reviews_count / data['pageSize'])
    
    for page_number in range(1, total_review_page + 1):
        time.sleep(1)
        data['page'] = page_number
        str_data = str(data)
        response = requests.post(url, headers=headers, data=str_data)
        for each in response.json()['comments']:
            comments.append(each)
    
    return comments

In [None]:
def flat_reviewer_info(review_comments):
    reviewer_infos = pd.DataFrame([ reviewer for reviewer in review_comments['reviewerInfo'] ])
    results = pd.concat([review_comments.drop(columns=['reviewerInfo', 'encryptedReviewData', 'formattedReviewHelpfulText'], axis=1), reviewer_infos], axis=1)
    return results

In [None]:
reviews_dir = './review_hotels'

try:
    os.mkdir(reviews_dir)
except FileExistsError:
    pass

def get_review_comments_dataframe(hotel_id):
    print('Get review comments from HotelID %d' % i )
    review_comments = pd.DataFrame(get_review_comments(hotel_id=i))
    return flat_reviewer_info(review_comments)

for i in hotel_ids:
    get_review_comments_dataframe(i).replace(to_replace="'", value="").to_csv(f'{reviews_dir}/h{i}.csv', index=False)

In [None]:
d[3:]