# Summary
This notebook demonstrates a simple end-to-end task using the Elasticsearch client for Python.


It includes the following steps:
- Step 1: Collect data related to tax codes from the page: https://hosocongty.vn/

- Step 2: Store data with Elasticsearch

- Step 3: Query information on Elasticsearch

# Install the necessary packages

In [1]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-7.6.0-py2.py3-none-any.whl (88 kB)
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.6.0


# Collect data

In [4]:
import pandas as pd
import requests
import re
import html
import json

from bs4 import BeautifulSoup

def get_link_content(url):
    '''
    This function downloads one listing page and extracts the companies shown on it
    (10 per page at the time of writing, fewer if some entries cannot be parsed).
    The information collected is:
        - Name (Tên công ty)
        - Tax code (Mã số thuế)
        - Address (Địa chỉ)

    Every <li> inside the <ul class="hsdn"> element is treated as one company:
    the name is the text of its <h3>, while the address and the tax code are
    both taken from the text of its <div>, split on the tax code label.

    ----------
    Input:
        - url: <string>
            url of page that want to get data

    ----------
    Return:
        <list[str]>
            one json.dumps string (ensure_ascii=False) per company, with the keys
            'name', 'tax_code' and 'address'. 'tax_code' is an empty string when
            the tax code label is missing from the entry. The list is empty when
            the page has no <ul class="hsdn"> element.

    ----------
    Raise:
        requests.exceptions.RequestException
            if the page cannot be downloaded (including hitting the timeout)
    '''

    tax_code_label = 'Mã số thuế: '
    address_label = 'Địa chỉ: '

    result = []

    # Without a timeout a single unresponsive page would hang the whole crawl.
    html_page = requests.get(url, timeout=30).content
    soup = BeautifulSoup(html_page, 'html.parser')
    infors = soup.find('ul', {'class': 'hsdn'})

    # Layout change / error page: nothing to collect instead of an AttributeError.
    if infors is None:
        return result

    for c in infors.find_all('li'):
        heading = c.find('h3')
        details = c.find('div')

        # Skip list items that are not company entries rather than hiding
        # every possible error behind a bare except.
        if heading is None or details is None:
            continue

        address_code = details.text.split(tax_code_label)
        address = address_code[0].replace(address_label, '')
        # When the label is absent split() returns a single element; do not
        # mistake the address text for a tax code in that case.
        code = address_code[-1] if len(address_code) > 1 else ''

        temp = {'name': heading.text, 'tax_code': code, 'address': address}
        result.append(json.dumps(temp, ensure_ascii=False))

    return result


def get_data(num_pages=10):
    '''
    Collect company information from the first listing pages of https://hosocongty.vn/

    Pages are visited in ascending order, from page 1 up to and including
    page num_pages, and the per-page results are concatenated in that order.

    ----------
    Input:
        - num_pages: <int> (default: 10)
            how many listing pages to read

    ----------
    Return:
        <list[str]>
            every entry returned by get_link_content for the visited pages
            (each entry is a company record that has been through json.dumps)
    '''

    collected = []

    for page_number in range(1, num_pages + 1):
        page_url = 'https://hosocongty.vn/page-' + str(page_number)
        collected.extend(get_link_content(page_url))

    return collected

In [5]:
# Quick check of the scraper: fetch only the first listing page.
get_data(num_pages=1)

['{"name": "CÔNG TY TNHH THƯƠNG MẠI XÂY DỰNG VÀ ĐẦU TƯ KHÔI NGUYÊN", "tax_code": "0109148251", "address": "25 TT1 Khu đô thị Mỹ Đình Sông Đà, Phường Mỹ Đình 1, Quận Nam Từ Liêm, Hà Nội"}',
 '{"name": "CÔNG TY CỔ PHẦN OFFICE360 - VĂN PHÒNG ĐẠI DIỆN TƯ VẤN THUẾ TẠI QUẢNG NINH", "tax_code": "2901635961-001", "address": "Tầng 4, Số 28, Phố Hải Phượng, Phường Hồng Hải, Thành phố Hạ Long, Quảng Ninh"}',
 '{"name": "CHI NHÁNH CÔNG TY TNHH MỘT THÀNH VIÊN ĐẦU TƯ XÂY DỰNG THƯƠNG MẠI QUỐC THẮNG TẠI QUẢNG NGÃI", "tax_code": "0400544974-002", "address": "Thôn Vạn Lý, Xã Phổ Phong, Thị xã Đức Phổ, Quảng Ngãi"}',
 '{"name": "CÔNG TY TNHH XUẤT NHẬP KHẨU VÀ PHÂN PHỐI ĐÔNG PHƯƠNG VIỆT NAM - CỬA HÀNG AKEMI U(DDKD)", "tax_code": "0305432170-004", "address": "Lô B2-R3-25-26 tầng B2 TTTM Vincom, Tòa nhà Vincom Mega Mall, Phường Thượng Đình, Quận Thanh Xuân, Hà Nội"}',
 '{"name": "CHI NHÁNH MIỀN TRUNG - CÔNG TY TNHH KIỂM TOÁN HDT VIỆT NAM", "tax_code": "0102936447-004", "address": "88 đường Kim Đồng, Phường 

# Store data with Elasticsearch

In [10]:
from elasticsearch import Elasticsearch

## Create index

In [11]:
def create_index(es_object, index_name):
    '''
    This function is used to create an index in Elasticsearch if it does not exist yet

    The index is created with 2 shards, 1 replica and a strict mapping that only
    allows the text fields 'name', 'tax_code' and 'address'.

    ----------
    Input:
        - es_object: <elasticsearch.client.Elasticsearch>
            Elasticsearch object
        - index_name: <str>
            name of index that want to create

    ----------
    Return:
        <bool>
            True: if the index already existed or was created successfully
            False: if the request raised an error or Elasticsearch rejected it
    '''

    # Index settings
    # NOTE(review): the mapping is nested under the type name "company_info".
    # Elasticsearch 7+ presumably rejects typed mappings unless
    # include_type_name is set - confirm against the server version in use.
    settings = {
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        "mappings": {
            "company_info": {
                "dynamic": "strict",
                "properties": {
                    "name": {
                        "type": "text"
                    },
                    "tax_code": {
                        "type": "text"
                    },
                    "address": {
                        "type": "text"
                    }
                }
            }
        }
    }

    try:
        if es_object.indices.exists(index=index_name):
            return True

        # ignore=400 makes the client hand back the error body instead of
        # raising, so the response has to be inspected below: a 400 is not
        # only "index already exists" but also e.g. an invalid mapping.
        response = es_object.indices.create(index=index_name, ignore=400, body=settings)

    except Exception as ex:
        print(str(ex))
        return False

    error = response.get('error') if isinstance(response, dict) else None
    if not error:
        print('Created Index')
        return True

    # Losing a race with another creator between exists() and create() is fine:
    # the index is there, which is all the caller asked for.
    error_type = error.get('type') if isinstance(error, dict) else None
    if error_type in ('resource_already_exists_exception', 'index_already_exists_exception'):
        return True

    print(str(error))
    return False

## Store record

In [None]:
def store_record(es_object, index_name, record):
    '''
    This function is used to store one record (document) in an Elasticsearch index

    The response of the index request is printed on success; on failure an
    error message and the exception text are printed instead.

    ----------
    Input:
        - es_object: <elasticsearch.client.Elasticsearch>
            Elasticsearch object
        - index_name: <str>
            name of index that the record is stored in
        - record: <str> or <dict>
            document to store, passed unchanged as the request body

    ----------
    Return:
        <bool>
            True: if store successfully
            False: whereas
    '''

    try:
        # doc_type has to match the mapping type declared in create_index
        # ("company_info"); the index accepts only that single type.
        outcome = es_object.index(index=index_name, doc_type='company_info', body=record)
        print(outcome)
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        return False

    return True