In [1]:
# to ignore warning message
import warnings
warnings.filterwarnings("ignore")
# to send HTTP requests to the website
import requests
# to parse the returned HTML into a BeautifulSoup tree
from bs4 import BeautifulSoup
# pandas: open-source data analysis and manipulation library
import pandas as pd
import os

In [50]:
# Root URL of the site being scraped.
base_url = 'https://www.khmer24.com/en'
# Listing category; appended to base_url and used in the output paths below.
category = 'cars'
# Query-string prefix; get_all_url appends the current paging value right after it.
page_number = '?per_page='
# Records per page; get_all_url advances its paging value by this amount.
rpp = 50
# Top-level folder that holds everything written by this notebook.
dataset = 'data_scraped'
# <dataset>/<category>
category_directory = '{}/{}'.format(dataset, category)
# Folder for the cached URL lists.
category_directory_url = '{}/urls'.format(category_directory)
# CSV that caches the per-model listing URLs of the category.
category_directory_url_file = '{}/all_{}_url.csv'.format(category_directory_url, category)
# Folder for the scraped detail records.
category_directory_data = '{}/datasets'.format(category_directory)


In [2]:
# function to request with user-agent and parse to html
def get_request(url, timeout = 30):
    """Download ``url`` with a browser-like User-Agent and parse it as HTML.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the scrape
        forever, so a finite default is used.

    Returns
    -------
    BeautifulSoup
        ``response.text`` parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout elapses.

    Notes
    -----
    The HTTP status code is not inspected: an error page is parsed and
    returned like any other page.
    """
    header = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    response = requests.get(url, headers = header, timeout = timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [52]:
# to get all urls from category and its type
def get_category_urls(url = base_url, category = category):
    """Collect the link and title of every model entry on a category page.

    Parameters
    ----------
    url : str, optional
        Site root. The default is the value ``base_url`` had when this
        function was defined; re-run this cell after changing the config.
    category : str, optional
        Category path segment appended to ``url``. The default is likewise
        captured at definition time.

    Returns
    -------
    list of dict
        One ``{'url': ..., 'model': ...}`` per matching ``<li>``, where
        ``model`` is the anchor's ``title`` attribute lower-cased.
    """
    page_url = str.format('{}/{}', url, category)
    soup = get_request(page_url)
    # attrs is given as a dict mapping the attribute to its accepted values.
    # The original passed a list of set literals ({'class', ''}), which only
    # behaved like a class filter by accident.
    # NOTE(review): with the set form the literal word "class" was also
    # treated as an accepted class value - confirm nothing relied on that.
    items = soup.find_all('li', attrs = {'class': ['', 'd-none']})
    data_return = []
    for item in items:
        anchor = item.a
        # Skip entries that cannot yield both fields instead of crashing
        # on a missing anchor or attribute.
        if anchor is None or not anchor.has_attr('href') or not anchor.has_attr('title'):
            continue
        data_return.append({'url': anchor['href'], 'model': anchor['title'].lower()})
    return data_return

In [53]:
# to get all url from each type
def get_all_url(url, model):
    """Walk the paginated listing at ``url`` and collect every post link.

    Pages are requested as ``<url>/<page_number><offset>``, with the offset
    starting at 0 and growing by ``rpp`` per request. The walk ends at the
    first page that contains no matching anchor.

    Parameters
    ----------
    url : str
        Listing URL of one model.
    model : str
        Label stored next to every collected link.

    Returns
    -------
    list of dict
        One ``{'url': ..., 'model': model}`` per post anchor found.
    """
    data_return = []
    offset = 0
    while True:
        page_url = str.format('{}/{}{}', url, page_number, offset)
        soup = get_request(page_url)
        # attrs is a dict here. The original passed the set literal
        # {'class', 'border post'}, which only worked as a class filter by
        # accident.
        # NOTE(review): with the set form the word "class" itself was also
        # an accepted class value - confirm nothing relied on that.
        items = soup.find_all('a', attrs = {'class': 'border post'})
        if not items:
            break
        # Anchors without an href carry nothing to follow; skip them rather
        # than raise KeyError.
        data_return.extend(
            {'url': item['href'], 'model': model}
            for item in items
            if item.has_attr('href')
        )
        offset += rpp
    return data_return


In [54]:
def remove_special_char(my_str):
    """Return ``my_str`` exactly as it was passed in.

    Despite the name, no character is removed. The function is currently a
    pass-through hook that callers in this notebook apply to dictionary
    keys and to output file names.
    """
    unchanged = my_str
    return unchanged

In [55]:
# to copy "label / value" span pairs of matching <ul> lists into data
def _collect_span_pairs(soup, list_class, data):
    """Store the first two ``<span>`` texts of each ``<li>`` as key / value.

    Only ``<ul>`` elements whose class matches ``list_class`` are scanned.
    List items with fewer than two spans are skipped.
    """
    for listing in soup.find_all('ul', attrs = {'class': list_class}):
        for entry in listing.find_all('li'):
            spans = entry.find_all('span')
            if len(spans) < 2:
                continue
            data[remove_special_char(spans[0].text)] = spans[1].text


# to get the detail of each url record
def get_data_detail(url):
    """Scrape one post page into a flat dictionary.

    Parameters
    ----------
    url : str
        Address of the post detail page.

    Returns
    -------
    dict
        Label/value pairs from the item-info and item-fields lists, a
        ``'price'`` entry when a price tag is present, and one entry per
        phone number keyed by the first CSS class of its link.

    Notes
    -----
    Class filters are passed as dicts. The original used set literals such
    as ``{'class', 'number'}``, which only acted as class filters by
    accident. Malformed list items are skipped instead of raising, so one
    odd page element no longer aborts the scrape.
    """
    data = {}
    soup = get_request(url)
    # to get id, category, location and post date (per the original note)
    _collect_span_pairs(soup, 'list-unstyled item-info m-0', data)
    # to get description model, car makes, year, tax type, ....
    _collect_span_pairs(soup, 'list-unstyled item-fields', data)
    price = soup.find('p', attrs = {'class': 'price price_tag'})
    if price is not None:
        data['price'] = price.text
    for listing in soup.find_all('ul', attrs = {'class': 'list-unstyled m-0'}):
        for entry in listing.find_all('li', attrs = {'class': 'number'}):
            link = entry.find('a')
            if link is None:
                continue
            link_classes = link.get('class')
            number = link.find(attrs = {'class': 'num'})
            # Both the key (first class of the link) and the value (the
            # 'num' element) are required to record a phone number.
            if not link_classes or number is None:
                continue
            data[remove_special_char(link_classes[0])] = number.text
    return data

In [56]:
def create_directory_if_not_exist(file):
    """Make sure the parent directory of ``file`` exists.

    Parameters
    ----------
    file : str
        Path of a file that is about to be written. Only its directory
        part is used; the file itself is not created.

    Notes
    -----
    Nothing happens when ``file`` has no directory component. Missing
    intermediate directories are created as well.
    """
    directory = os.path.dirname(file)
    # Truthiness test instead of `is not ''`: identity comparison against a
    # string literal depends on interpreter interning and triggers a
    # SyntaxWarning on current Python versions.
    if directory:
        # exist_ok avoids the race between checking and creating.
        os.makedirs(directory, exist_ok = True)

In [57]:
def read_file_to_dict(file_name):
    """Load a CSV file as a list of row dictionaries.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    list of dict or None
        The rows as produced by ``DataFrame.to_dict('records')``, or
        ``None`` when the file cannot be read (for example because it does
        not exist yet). Callers use ``None`` as the "not cached" signal.

    Notes
    -----
    The failure reason is printed. Only ``Exception`` subclasses are
    absorbed: the original returned from a ``finally`` clause, which also
    discarded ``KeyboardInterrupt`` and ``SystemExit``.
    """
    try:
        return pd.read_csv(file_name).to_dict('records')
    except Exception as er:
        print(str.format('Read file error accured {}', er))
        return None

In [58]:
#  to write data to csv file
def write_data_to_csv(data, file_name):
    """Dump ``data`` to a CSV file, creating the parent folder if needed.

    Parameters
    ----------
    data : any input accepted by ``pd.DataFrame``
        Records to store; in this notebook a list of dictionaries.
    file_name : str
        Destination path. ``.csv`` is appended when the path does not
        already end with it.

    Notes
    -----
    The frame is written with ``DataFrame.to_csv`` using its default
    options.
    """
    target = remove_special_char(file_name)
    create_directory_if_not_exist(target)
    frame = pd.DataFrame(data)
    if not target.endswith('.csv'):
        target = str.format('{}.{}', target, 'csv')
    frame.to_csv(target)

In [59]:
def get_detail_page(urls):
    """Scrape and store the detail records for one model's post links.

    Parameters
    ----------
    urls : list of dict
        Entries with a ``'url'`` and a ``'model'`` key. The model of the
        first entry names the output file.

    Notes
    -----
    Nothing is done for an empty list; the original raised ``IndexError``
    on ``urls[0]`` in that case. When the output CSV for the model can
    already be read, the scrape is skipped. Otherwise every link is
    fetched and all records are written in one go at the end.
    """
    if not urls:
        return
    model = urls[0]['model']
    file_name = str.format('{}/all_records_{}.csv', category_directory_data, model)
    if read_file_to_dict(file_name) is not None:
        # Already scraped on an earlier run.
        return
    content_data = [get_data_detail(entry['url']) for entry in urls]
    write_data_to_csv(content_data, file_name)

In [60]:
# Reuse the cached category listing when its CSV can be read.
# read_file_to_dict returns None on any read failure; in that case the
# listing is scraped once and written to the same path for later runs.
category_urls = read_file_to_dict(category_directory_url_file)
if category_urls is None:
    category_urls = get_category_urls()
    write_data_to_csv(category_urls, category_directory_url_file)

In [61]:
# For every model: load (or scrape and cache) its list of post links, then
# scrape the detail page of each link.
for i in category_urls:
    model = i['model']
    file_name = str.format('{}/all_detail_url_{}.csv', category_directory_url, model)
    data = read_file_to_dict(file_name)
    if data is None:
        # No readable cache yet: collect the links and store them.
        data = get_all_url(i['url'], model)
        write_data_to_csv(data, file_name)
    else:
        records = len(data)
        print('{:0=4}'.format(records), 'items', 'for', model)
    # Skip models without any link. This check now also covers a fresh
    # scrape that found nothing; previously only the cached branch was
    # guarded and an empty result crashed get_detail_page on urls[0].
    if not data:
        continue
    get_detail_page(data)


2293 items for apple
1018 items for samsung
0281 items for huawei
0044 items for sony
0798 items for oppo
0034 items for lg
0367 items for vivo
0086 items for nokia
0006 items for meizu
0056 items for oneplus
0002 items for blackberry
0002 items for htc
0001 items for acer
0098 items for google
0316 items for xiaomi
0029 items for motorola
0001 items for alcatel
0050 items for vertu
0010 items for zte
0037 items for asus
0346 items for other - ផ្សេងៗ


In [62]:
# byd