<a href="https://colab.research.google.com/github/previoip/web-scrapper-collections/blob/main/Indonesia_Commercial_Vehicle_Truck_Spec_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [122]:
from bs4 import BeautifulSoup
import requests, os, shutil

In [123]:
# Directory where get_html_content stores fetched pages for reuse.
cache_folder = './cache'
# Directory that receives the CSV files written by the scraper cells.
result_folder = './result'
# Path of the alias table (presumably the file handed to parseAlias:
# four tab-separated fields per line).
alias_correction_file = './aliases.txt'

# Separator placed between the fields of every CSV row.
csv_delim = ';'
# Substitute written in place of csv_delim when it occurs inside a value
# (see norm_val), so the value cannot split a row.
csv_delim_alt = ','

# Switches read by the final cell: zip the matching folder and offer it for
# download when True.
download_result = True
download_cache = False

In [124]:
def mkdir(path: str):
    """Create directory *path* unless it already exists.

    Missing parent directories are created too, and an existing directory is
    left untouched. Creation is done in one call so there is no window
    between checking for the directory and creating it.

    Raises:
        FileExistsError: if *path* exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def url2fn(url: str):
    """Flatten *url* into a name usable as a cache file name.

    Trailing slashes and the http(s)/ftp(s) scheme markers are dropped,
    '.', '?' and '%' become '-', and every remaining '/' becomes '_'.
    """
    name = url.rstrip('/')
    for scheme in ('https://', 'http://', 'ftps://', 'ftp://'):
        name = name.replace(scheme, '')
    # Single-character substitutions are independent of each other, so one
    # translation pass covers all of them.
    return name.translate(str.maketrans('.?%/', '---_'))

def norm_val(val: str):
    """Return *val* cleaned up for use as a single CSV field.

    Newlines, carriage returns and tabs are deleted, every csv_delim is
    swapped for csv_delim_alt, and surrounding whitespace is trimmed.
    """
    without_breaks = val.translate({ord(ch): None for ch in '\n\r\t'})
    return without_breaks.replace(csv_delim, csv_delim_alt).strip()

def get_html_content(url, cache_response=True, strict=True) -> bytes:
    """Return the raw response body of *url*, optionally cached on disk.

    Args:
        url: address to fetch.
        cache_response: when true, the body is read from / written to
            ``cache_folder`` under the name ``url2fn(url) + '.html'``.
        strict: when true, any response other than HTTP 200 raises;
            when false, such a response yields ``None`` instead.

    Returns:
        The response body as bytes, or ``None`` for a non-200 response in
        non-strict mode.

    Raises:
        requests.exceptions.HTTPError: non-200 response in strict mode.
    """
    print(f'processing: {url}')

    def request(url, strict=False):
        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the scraped sites may rely on it, but the
        # responses are therefore not authenticated.
        res = requests.get(url, verify=False)
        if res.status_code != 200:
            if not strict:
                return None
            # Raises for 4xx/5xx; other non-200 codes fall through to the
            # explicit raise below instead of terminating the interpreter.
            res.raise_for_status()
            raise requests.exceptions.HTTPError(
                f'unexpected status {res.status_code} for {url}', response=res
            )
        return res.content

    if not cache_response:
        return request(url, strict=strict)

    mkdir(cache_folder)
    fname = url2fn(url) + '.html'
    cache_path = os.path.join(cache_folder, fname)

    if os.path.exists(cache_path):
        print(f'using cached result: {cache_path}')
        with open(cache_path, 'rb') as fp:
            return fp.read()

    print(f'caching requested result: {fname}')
    data = request(url, strict=strict)
    # Only persist real content: writing a failed fetch would leave an empty
    # file behind that later runs would treat as a valid cached page.
    if data is not None:
        with open(cache_path, 'wb') as fp:
            fp.write(data)
    return data

def bs4htmlParser(url):
    """Fetch *url* through get_html_content and parse it with 'html.parser'."""
    markup = get_html_content(url)
    return BeautifulSoup(markup, 'html.parser')

def parseAlias(textfp: str):
    """Load the alias table stored at *textfp*.

    Every usable line holds exactly four tab-separated fields: category,
    parameter, replacement category, replacement parameter. Lines with any
    other number of fields are ignored.

    Returns:
        A dict of the form
        ``{category: {parameter: [replacement_category, replacement_parameter]}}``.
        It is empty when *textfp* does not exist or is not a regular file.
    """
    r = {}
    if not os.path.isfile(textfp):
        # No alias file: return an empty table so lookups simply find nothing.
        return r

    with open(textfp, 'r') as fp:
        for line in fp.read().split('\n'):
            fields = line.split('\t')
            if len(fields) != 4:
                continue
            category, param, new_category, new_param = fields
            r.setdefault(category, {})[param] = [new_category, new_param]
    return r

def correct_alias(alias, category, param):
    """Look up the replacement for (*category*, *param*) in *alias*.

    Returns the stored replacement pair when *alias* has a non-empty entry
    for the category and the parameter; otherwise the inputs are returned
    unchanged as a ``(category, param)`` tuple.
    """
    params_for_category = alias.get(category)
    replacement = params_for_category.get(param) if params_for_category else None
    if replacement:
        return replacement
    return category, param

In [125]:
# Create the output directory up front; the scraper cells below write their
# CSV files into it.
mkdir(result_folder)

In [126]:
# Load the alias table once from the configured path; the scraper cells below
# all pass it to correct_alias.
alias = parseAlias(alias_correction_file)

# Isuzu Astra Lineup

In [127]:
# Entry page of the Isuzu Astra site; the next cell reads the product links
# from its navigation menu.
url = 'https://isuzu-astra.com/'

In [116]:
# Scrape every series page linked from the home page's PRODUCTS menu and
# write one CSV row per (variant, specification parameter).
lines = [csv_delim.join(['Manufacturer', 'Emission Standard', 'Series', 'Variant', 'Specification Category', 'Specification Parameter', 'Unit', 'Value'])]

page = bs4htmlParser(url)

# Locate the mobile navigation menu whose title mentions PRODUCTS. Menus
# without a title element are skipped instead of raising AttributeError.
product_dropdown = None
for menu in page.find_all('div', {'class': 'fNavMobileMenu'}):
    menu_title = menu.find('div', {'class': 'fNavMobileTitle'})
    if menu_title is not None and 'PRODUCTS' in menu_title.text:
        product_dropdown = menu
        break
if product_dropdown is None:
    raise RuntimeError(f'no PRODUCTS menu found on {url}')

for product_link in product_dropdown.find_all('a'):
    vehicle_series = product_link.text.strip()
    vehicle_series_page = bs4htmlParser(product_link['href'])
    vehicle_spec_list = vehicle_series_page.find(class_='vehicleSpecificationList').find(class_='row')

    # One tuple per accordion item: (category, parameter names, units, values).
    sections = []
    for item in vehicle_spec_list.find_all(class_='accordion-item'):
        spec_category = item.find('h5').text
        units = [cell.text.strip() for cell in item.find_all(class_='col-1')]
        subcategories = [cell.text.strip() for cell in item.find_all(class_='col-3')]
        values = [cell.text.strip() for cell in item.find_all(class_='col-2')]
        sections.append((spec_category, subcategories, units, values))

    if not sections:
        print(f'no specification sections found for: {vehicle_series}')
        continue

    # The value cells of the first accordion item are used as the variant
    # names; the remaining items carry the actual specifications.
    variant_section, *spec_sections = sections
    vehicle_variants = variant_section[-1]
    vehicle_variants_count = len(vehicle_variants)

    for n, variant in enumerate(vehicle_variants):
        for spec_category, subcategories, units, values in spec_sections:
            # Value cells are assumed to be interleaved per parameter (one
            # cell per variant), so every count-th cell starting at n
            # belongs to variant n.
            for m, v in enumerate(values[n::vehicle_variants_count]):
                corrected_category, corrected_sub_category = correct_alias(alias, spec_category, subcategories[m])
                lines.append(csv_delim.join(['IAMI', 'euro4', vehicle_series, variant, corrected_category, corrected_sub_category, units[m], norm_val(v)]))

# Join once at the end instead of growing a string row by row.
ret = '\n'.join(lines) + '\n'

with open(os.path.join(result_folder, 'isuzu_astra_lineup_data.csv'), 'w', encoding='utf-8') as fp:
    fp.write(ret)

processing: https://isuzu-astra.com/
using cached result: ./cache/isuzu-astra-com.html
processing: https://isuzu-astra.com/traga-pick-up
using cached result: ./cache/isuzu-astra-com_traga-pick-up.html
processing: https://isuzu-astra.com/all-new-isuzu-d-max/
using cached result: ./cache/isuzu-astra-com_all-new-isuzu-d-max.html
processing: https://isuzu-astra.com/elf-nlr/
using cached result: ./cache/isuzu-astra-com_elf-nlr.html
processing: https://isuzu-astra.com/elf-nmr/
using cached result: ./cache/isuzu-astra-com_elf-nmr.html
processing: https://isuzu-astra.com/elf-nps/
using cached result: ./cache/isuzu-astra-com_elf-nps.html
processing: https://isuzu-astra.com/giga-fvr/
using cached result: ./cache/isuzu-astra-com_giga-fvr.html
processing: https://isuzu-astra.com/giga-fvm/
using cached result: ./cache/isuzu-astra-com_giga-fvm.html
processing: https://isuzu-astra.com/giga-fvz/
using cached result: ./cache/isuzu-astra-com_giga-fvz.html
processing: https://isuzu-astra.com/giga-gxz/
us

# Hino Lineup

In [117]:
# Entry page of the Hino site; the next cell collects the 'product-list'
# links found on it.
url = 'https://hino.co.id/'

In [118]:
# Scrape every Hino product-list page (euro4 and the derived euro2 address),
# follow each variant link and write one CSV row per specification parameter.
page = bs4htmlParser(url)

product_list_links = []
# href=True skips anchors without an href attribute instead of raising KeyError.
for anchor in page.find_all('a', href=True):
    href = anchor['href']
    if 'product-list' not in href or 'www' in href:
        continue
    # Each list link is paired with the same address minus 'euro4/' (euro2).
    # Checking both candidates keeps a link that has no 'euro4/' part from
    # being queued, and therefore scraped, twice.
    for link in (href, href.replace('euro4/', '')):
        if link not in product_list_links:
            product_list_links.append(link)

lines = [csv_delim.join(['Manufacturer', 'Emission Standard', 'Series', 'Variant', 'Specification Category', 'Specification Parameter', 'Unit', 'Value'])]

for product_link in product_list_links:
    list_page = bs4htmlParser(product_link)
    emmision_type = 'euro4' if 'euro4' in product_link else 'euro2'
    vehicle_series = list_page.body.find_all('section', style="padding:50px 0 0;")[0].find('h3').text
    content_body = list_page.body.find_all('section', style="padding-bottom: 100px;")[0]

    for item in content_body.find_all('a'):
        item_link = item.get('href')
        vehicle_variant = item.find('label').text

        detail_page = bs4htmlParser(item_link)
        data_spec = detail_page.find('div', id='accordion')
        for data_item in data_spec.find_all('div', {'class': 'panel panel-default'}):
            spec_category = data_item.find('h4').text.strip()

            for tab_panel in data_item.find_all('div', {'role': 'tabpanel'}):
                # Parse the panel being iterated rather than always the
                # first one in data_item.
                table_body = tab_panel.find('tbody')
                if table_body is None:
                    continue

                for table_row in table_body.find_all('tr'):
                    cells = [cell.text.strip() for cell in table_row.find_all('td')]
                    # Empty cells are dropped; what remains must be exactly
                    # parameter, unit and value, otherwise the unpacking
                    # raises ValueError.
                    param, unit, val = [cell for cell in cells if cell]
                    corrected_category, corrected_param = correct_alias(alias, spec_category, param)
                    lines.append(csv_delim.join(['HINO', emmision_type, vehicle_series, vehicle_variant, corrected_category, corrected_param, unit, norm_val(val)]))

# Join once at the end instead of growing a string row by row.
ret = '\n'.join(lines) + '\n'

with open(os.path.join(result_folder, 'hino_lineup_data.csv'), 'w', encoding='utf-8') as fp:
    fp.write(ret)


processing: https://hino.co.id/
using cached result: ./cache/hino-co-id.html
processing: https://hino.co.id/product-list/euro4/1
using cached result: ./cache/hino-co-id_product-list_euro4_1.html
processing: https://hino.co.id/product-detail/euro4/1/136-hd-xpower
using cached result: ./cache/hino-co-id_product-detail_euro4_1_136-hd-xpower.html
processing: https://hino.co.id/product-detail/euro4/1/136-hdx-mixer
using cached result: ./cache/hino-co-id_product-detail_euro4_1_136-hdx-mixer.html
processing: https://hino.co.id/product-detail/euro4/1/136-hdl
using cached result: ./cache/hino-co-id_product-detail_euro4_1_136-hdl.html
processing: https://hino.co.id/product-detail/euro4/1/115-sd-std
using cached result: ./cache/hino-co-id_product-detail_euro4_1_115-sd-std.html
processing: https://hino.co.id/product-detail/euro4/1/136-hd
using cached result: ./cache/hino-co-id_product-detail_euro4_1_136-hd.html
processing: https://hino.co.id/product-detail/euro4/1/115-sdl-std
using cached result: 

# Mitsubishi FUSO Lineup

In [119]:
# Entry page of the Mitsubishi FUSO site; the next cell reads the product
# category links from its 'nav-tab' element.
url = 'https://ktbfuso.co.id'

In [120]:
# Scrape every product category linked from the home page's 'nav-tab'
# element, follow each product tile and write one CSV row per specification
# parameter.
lines = [csv_delim.join(['Manufacturer', 'Emission Standard', 'Series', 'Variant', 'Specification Category', 'Specification Parameter', 'Unit', 'Value'])]

page = bs4htmlParser(url)
emmision_type = 'euro4'

product_categories_links = [anchor['href'] for anchor in page.find_all(id='nav-tab')[0].find_all('a')]

for product_url in product_categories_links:
    product_category_page = bs4htmlParser(product_url)
    product_cat_elem = product_category_page.find(class_='tabs_content')
    product_list_elems = product_cat_elem.find_all(class_='col-lg-custom-5 col-4 text-center col-product')

    vehicle_series = product_category_page.find('div', class_="title border_bottom_image").find('h3').text.strip()

    for el in product_list_elems:
        # Only the link and the name of a tile are used; its image is not
        # looked up, so a tile without one no longer aborts the run.
        vehicle_link = el.find_all('a')[0]['href']
        vehicle_variant = el.find('div', class_="product_name mt-3").text.strip()

        variant_spec_page = bs4htmlParser(vehicle_link)

        variant_spec_category_elems = variant_spec_page.find('section', class_="spesifikasi_wrapper py-5").find('div', class_="accordion_wrapper").find_all('div', class_="accordion")
        for spec_cat_el in variant_spec_category_elems:
            spec_category = spec_cat_el.find('div', class_='title mr-auto').text

            for spec_row in spec_cat_el.find_all('div', class_="row row_spec no-gutters"):
                # Each specification row must hold exactly three 'col-lg-4'
                # cells (parameter, unit, value), otherwise the unpacking
                # raises ValueError.
                param, unit, val = [cell.text for cell in spec_row.find_all('div', class_='col-lg-4')]
                corrected_category, corrected_param = correct_alias(alias, spec_category, param)
                lines.append(csv_delim.join(['MITSUBISHI', emmision_type, vehicle_series, vehicle_variant, corrected_category, corrected_param, unit, norm_val(val)]))

# Join once at the end instead of growing a string row by row.
ret = '\n'.join(lines) + '\n'

with open(os.path.join(result_folder, 'mitsubishi_lineup_data.csv'), 'w', encoding='utf-8') as fp:
    fp.write(ret)


processing: https://ktbfuso.co.id
using cached result: ./cache/ktbfuso-co-id.html
processing: https://ktbfuso.co.id/product-category/1/light-duty
using cached result: ./cache/ktbfuso-co-id_product-category_1_light-duty.html
processing: https://ktbfuso.co.id/product-detail/1/fe-71
using cached result: ./cache/ktbfuso-co-id_product-detail_1_fe-71.html
processing: https://ktbfuso.co.id/product-detail/2/fe-71-bc
using cached result: ./cache/ktbfuso-co-id_product-detail_2_fe-71-bc.html
processing: https://ktbfuso.co.id/product-detail/3/fe-71l-bcl-non-cabin
using cached result: ./cache/ktbfuso-co-id_product-detail_3_fe-71l-bcl-non-cabin.html
processing: https://ktbfuso.co.id/product-detail/4/fe-71l
using cached result: ./cache/ktbfuso-co-id_product-detail_4_fe-71l.html
processing: https://ktbfuso.co.id/product-detail/5/fe-71l-bc
using cached result: ./cache/ktbfuso-co-id_product-detail_5_fe-71l-bc.html
processing: https://ktbfuso.co.id/product-detail/6/fe-73
using cached result: ./cache/ktbf

# Download result

In [121]:
from google.colab import files

# Zip each folder whose switch is on and hand the archive to the browser;
# the cache is handled before the results.
for wanted, folder in ((download_cache, cache_folder), (download_result, result_folder)):
    if not wanted:
        continue
    shutil.make_archive(folder, 'zip', folder, verbose=True)
    files.download(folder + '.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>