# Summary
This notebook performs a demo task with the Elasticsearch client in Python.


The notebook consists of three steps:
- Step 1: Collect data related to tax codes from the web: https://hosocongty.vn/

- Step 2: Store data with Elasticsearch

- Step 3: Query information on Elasticsearch

# Install necessary package

In [1]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-7.6.0-py2.py3-none-any.whl (88 kB)
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.6.0


# Collect data

In [32]:
import pandas as pd
import requests
import re
import html
import json

from bs4 import BeautifulSoup

def get_link_content(url):
    '''
    Scrape one listing page and return the companies found on it.

    For every company entry on the page the following fields are collected:
        - name (Tên công ty)
        - tax_code (Mã số thuế)
        - address (Địa chỉ)

    ----------
    Input:
        - url: <string>
            url of the listing page to read

    ----------
    Return:
        <list[str]>
            one JSON string (json.dumps, non-ASCII characters kept as-is) per
            company that could be parsed. Entries that lack a name, a detail
            block or the tax-code marker are skipped. An empty list is
            returned when the page contains no company list at all.
    '''

    tax_code_marker = 'Mã số thuế: '
    address_prefix = 'Địa chỉ: '

    result = []

    # A timeout keeps an unresponsive server from blocking the notebook forever
    html_page = requests.get(url, timeout=30).content
    soup = BeautifulSoup(html_page, 'html.parser')

    infors = soup.find('ul', {'class': 'hsdn'})
    if infors is None:
        # Page layout differs (error page, empty page, ...): nothing to collect
        return result

    for c in infors.find_all('li'):
        name_tag = c.find('h3')
        detail_tag = c.find('div')
        if name_tag is None or detail_tag is None:
            # Not a company entry; skip it instead of hiding errors with a bare except
            continue

        # The detail text holds the address followed by the tax code
        address_code = str(detail_tag.text).split(tax_code_marker)
        if len(address_code) < 2:
            # Without the marker the tax code cannot be told apart from the address
            continue

        temp = {
            'name': name_tag.text.strip(),
            'tax_code': address_code[-1].strip(),
            'address': address_code[0].replace(address_prefix, '').strip(),
        }

        result.append(json.dumps(temp, ensure_ascii=False))

    return result


def get_data(num_pages=5):
    '''
    Collect company information from the first pages of https://hosocongty.vn/

    ----------
    Input:
        - num_pages: <int> (default: 5)
            how many listing pages (page-1 ... page-num_pages) to read

    ----------
    Return:
        <list[str]>
            the JSON strings returned by get_link_content for every page,
            concatenated in page order
    '''

    base_url = 'https://hosocongty.vn/page-'

    # Build every page url up front, then visit them in order
    page_urls = [base_url + str(page_no) for page_no in range(1, num_pages + 1)]

    collected = []
    for page_url in page_urls:
        collected.extend(get_link_content(page_url))

    return collected

Run the `get_data` function to collect the company records

In [2]:
# Collect the companies listed on the first page only (num_pages=1)
records = get_data(1)

# Store data with Elasticsearch

In [3]:
from elasticsearch import Elasticsearch

## Connect Elasticsearch

In [4]:
def connect_elasticsearch():
    '''
    Connect to the Elasticsearch server on localhost:9200 and check that it
    answers a ping.

    ----------
    Input:
        None

    ----------
    Return:
        <elasticsearch.client.Elasticsearch>: if the server answers the ping
        None: otherwise, so callers can test the result with `is not None`
    '''

    _es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    if not _es.ping():
        print('Awww it could not connect!')
        # Do not hand back a client that cannot reach the server
        return None

    print('Yay Connected')
    return _es


## Create index

In [5]:
def create_index(es_object, index_name):
    '''
    Make sure an index with the company_info mapping exists in Elasticsearch.

    ----------
    Input:
        - es_object: <elasticsearch.client.Elasticsearch>
            Elasticsearch object
        - index_name: <str>
            name of the index to create

    ----------
    Return:
        <bool>
            True: if the index already exists or was created successfully
            False: if the server rejected the request or the call failed
    '''

    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        "mappings": {
            "company_info": {
                "dynamic": "strict",
                "properties": {
                    "name": {
                        "type": "text"
                    },
                    "tax_code": {
                        "type": "text"
                    },
                    "address": {
                        "type": "text"
                    }
                }
            }
        }
    }

    try:
        if es_object.indices.exists(index=index_name):
            return True

        # ignore=400 makes the client return the error body instead of raising,
        # so the response is inspected below rather than trusted blindly.
        # include_type_name=True is needed because the mapping above is nested
        # under the "company_info" type name, which 7.x servers otherwise reject.
        response = es_object.indices.create(
            index=index_name,
            body=settings,
            include_type_name=True,
            ignore=400,
        )

    except Exception as ex:
        print(str(ex))
        return False

    error = response.get('error') if isinstance(response, dict) else None
    if error:
        error_type = error.get('type') if isinstance(error, dict) else None
        if error_type == 'resource_already_exists_exception':
            # Created by someone else between the exists() check and create()
            return True

        # Any other 400 (e.g. an invalid mapping) means the index was NOT created
        print('Index creation rejected: ' + str(error))
        return False

    print('Created Index')
    return True

## Store record

In [12]:
def store_record(es_object, index_name, record):
    '''
    Store one record in Elasticsearch under the company_info document type.

    ----------
    Input:
        - es_object: <elasticsearch.client.Elasticsearch>
            Elasticsearch object
        - index_name: <str>
            name of the index where the record is stored
        - record: <str> (dumped by json)
            record to store

    ----------
    Return:
        <bool>
            True: if the record was stored
            False: if indexing raised an error (the error is printed)
    '''

    try:
        outcome = es_object.index(index=index_name, doc_type='company_info', body=record)

    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        return False

    # No `return` inside `finally`: that would swallow KeyboardInterrupt /
    # SystemExit and report an interrupted request as stored.
    print(outcome)
    return True

Connect to Elasticsearch, create the index, and store the collected records

In [13]:
from time import sleep

# Note that the Elasticsearch server must be running on your computer

# Start from a known state: without this, an empty `records` list on a fresh
# kernel leaves es_object undefined and the check below raises NameError.
es_object = None

# Only connect when there is data to store
if len(records) > 0:
    es_object = connect_elasticsearch()

is_created = False
if es_object is not None:
    is_created = create_index(es_object=es_object, index_name='tax_code_database')

# Store the records only when the connection and the index are both available
if is_created:
    for record in records:
        # Pause between two indexing requests
        sleep(2)
        is_stored = store_record(es_object=es_object, index_name='tax_code_database', record=record)

        if is_stored:
            print('Data indexed successfully')

Yay Connected
Created Index
{'_index': 'tax_code_database', '_type': 'company_info', '_id': '_o_MLHEBleNWgl66V8Ft', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Data indexed successfully
{'_index': 'tax_code_database', '_type': 'company_info', '_id': '_4_MLHEBleNWgl66YcHY', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Data indexed successfully
{'_index': 'tax_code_database', '_type': 'company_info', '_id': 'AI_MLHEBleNWgl66acLt', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Data indexed successfully
{'_index': 'tax_code_database', '_type': 'company_info', '_id': 'AY_MLHEBleNWgl66csIE', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}
Data indexed successfully
{'_index': 'tax_

# Query

In [30]:
from pprint import pprint

def search(es_object, index_name, query):
    '''
    Run a query against an Elasticsearch index and pretty-print the response.

    ----------
    Input:
        - es_object: <elasticsearch.client.Elasticsearch>
            Elasticsearch object
        - index_name: <str>
            name of the index to search
        - query: <dict>
            request body passed to the search call

    ----------
    Return:
        None (the response is only printed)
    '''
    pprint(es_object.search(index=index_name, body=query))

In [33]:
# Test search all: a match_all query matches every document in the index
search_all_query = {'query': {'match_all': {}}}

search(es_object=es_object, index_name='tax_code_database', query=search_all_query)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': 'Bo_MLHEBleNWgl66m8J9',
                    '_index': 'tax_code_database',
                    '_score': 1.0,
                    '_source': {'address': 'Lô A102, Đường Số 1, KCN Thái Hòa, '
                                           'Ấp Tân Hòa, Xã Đức Lập Hạ, Huyện '
                                           'Đức Hoà, Long An',
                                'name': 'CHI NHÁNH CÔNG TY CỔ PHẦN HỢP TÁC ĐẦU '
                                        'TƯ VÀ PHÁT TRIỂN NÔNG NGHIỆP VIỆT NAM',
                                'tax_code': '0314259109-002'},
                    '_type': 'company_info'},
                   {'_id': '_4_MLHEBleNWgl66YcHY',
                    '_index': 'tax_code_database',
                    '_score': 1.0,
                    '_source': {'address': 'Tầng 4, Số 28, Phố Hải Phượng, '
                                           'Phường Hồng Hải, Thành phố Hạ '
         

In [44]:
# Test search text
# Full-text search for companies whose name matches 'VĂN PHÒNG ĐẠI DIỆN'

search_vp_query = {'query': {'match': {'name': "VĂN PHÒNG ĐẠI DIỆN"}}}

search(es_object=es_object, index_name='tax_code_database', query=search_vp_query)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': 'A4_MLHEBleNWgl66gsKp',
                    '_index': 'tax_code_database',
                    '_score': 6.3095236,
                    '_source': {'address': 'Xóm Vạn Thọ, Xã Diễn Mỹ, Huyện '
                                           'Diễn Châu, Nghệ An',
                                'name': 'VĂN PHÒNG ĐẠI DIỆN CÔNG TY CỔ PHẦN '
                                        'GOSEE VIỆT NAM TẠI NGHỆ AN',
                                'tax_code': '0108945913-001'},
                    '_type': 'company_info'},
                   {'_id': '_4_MLHEBleNWgl66YcHY',
                    '_index': 'tax_code_database',
                    '_score': 6.136423,
                    '_source': {'address': 'Tầng 4, Số 28, Phố Hải Phượng, '
                                           'Phường Hồng Hải, Thành phố Hạ '
                                           'Long, Quảng Ninh',
                              