In [10]:
import pymysql
from datetime import datetime
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm
import json

In [11]:
# Load the notebook's settings from the local JSON file into `envs`.
with open("./envs.json", mode="r", encoding="utf8") as env_file:
    envs = json.loads(env_file.read())

In [12]:
# Build the Elasticsearch client from the values stored in `envs`.
# The options are collected first so the connection settings can be read at a glance.
es_options = {
    "hosts": envs['es_host'],
    "port": envs['es_port'],
    "http_auth": (envs['es_user'], envs['es_password']),
    "timeout": envs['es_timeout'],
    "max_retries": envs['es_max_retries'],
}
es = Elasticsearch(**es_options)

In [13]:
# DataType values to count, in the order they are queried and reported.
# Rows are grouped by name prefix only for readability; the order is unchanged.
data_types = [
    "naver_news", "naver_blog", "naver_cafe",
    "jobplanet_statistic", "jobplanet_review", "jobplanet_premium",
    "saramin",
    "nice_biz_info",
    "kisti_article", "kisti_patent",
    "kci",
    "ntis_assign", "ntis_accomp", "ntis_rnd_paper", "ntis_org_info",
    "naver_trend",
]

In [14]:
# Spot check for a single DataType: request no hit documents (size=0) and
# ask for the full hit count (track_total_hits=True) instead of a capped one.
es.search(
    index="source_data",
    size=0,
    track_total_hits=True,
    query={"bool": {"must": [{"match": {"DataType": "naver_news"}}]}},
)

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 6, 'successful': 6, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 719207, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [15]:
# Collect (data_type, total hit count) pairs for every entry in `data_types`.
# One search request is issued per DataType; no documents are fetched (size=0).
results = []
for data_type in data_types:
    match_data_type = {"match": {"DataType": data_type}}
    res = es.search(
        index="source_data",
        size=0,
        track_total_hits=True,
        query={"bool": {"must": [match_data_type]}},
    )
    total_hits = res['hits']['total']['value']
    results.append((data_type, total_hits))


In [16]:
# Show the (data_type, total hit count) pairs collected by the preceding loop.
results

[('naver_news', 719207),
 ('naver_blog', 157036382),
 ('naver_cafe', 58943899),
 ('jobplanet_statistic', 238863),
 ('jobplanet_review', 524034),
 ('jobplanet_premium', 498284),
 ('saramin', 423435),
 ('nice_biz_info', 1029115),
 ('kisti_article', 319958),
 ('kisti_patent', 13958826),
 ('kci', 11687771),
 ('ntis_assign', 1512609),
 ('ntis_accomp', 822889),
 ('ntis_rnd_paper', 725983),
 ('ntis_org_info', 565054),
 ('naver_trend', 668835)]

In [17]:
# Inclusive lower bound for the SearchDate filter, written as yyyy-MM-dd.
# Kept as a named constant so the cutoff can be changed in one visible place.
SEARCH_DATE_FROM = "2023-10-12"

# The range clause is the same for every DataType, so build it once outside the loop.
search_date_filter = {
    "range": {
        "SearchDate": {
            "gte": SEARCH_DATE_FROM,
            "format": "yyyy-MM-dd",
        }
    }
}

# Collect (data_type, total hit count) pairs, counting only documents whose
# SearchDate is on or after SEARCH_DATE_FROM.
# NOTE: this rebinds `results`, replacing the unfiltered counts gathered earlier.
results = []
for data_type in data_types:
    res = es.search(
        index="source_data",
        query={
            "bool": {
                "must": [
                    {"match": {"DataType": data_type}}
                ],
                "filter": [search_date_filter],
            }
        },
        track_total_hits=True,  # report the full count rather than a capped one
        size=0,                 # only the total is needed, not the documents
    )
    results.append((data_type, res['hits']['total']['value']))


In [18]:
# Show the (data_type, total hit count) pairs for documents with SearchDate >= 2023-10-12.
results

[('naver_news', 646),
 ('naver_blog', 0),
 ('naver_cafe', 0),
 ('jobplanet_statistic', 0),
 ('jobplanet_review', 0),
 ('jobplanet_premium', 0),
 ('saramin', 0),
 ('nice_biz_info', 0),
 ('kisti_article', 0),
 ('kisti_patent', 0),
 ('kci', 0),
 ('ntis_assign', 0),
 ('ntis_accomp', 0),
 ('ntis_rnd_paper', 0),
 ('ntis_org_info', 0),
 ('naver_trend', 0)]