In [4]:
import json
import os

import pandas as pd
import requests

In [28]:
# Joseon summary data

INPUT_FILE = "../oil_crisis.json"
ROOT_FOLDER = "../joseon-data/"

# Corpus accumulator: two parallel lists, data['id'][i] is the id of data['text'][i].
data = {}
data['text'] = []
data['id'] = []
# Next id to hand out; read_all_files advances it once per loaded file.
id_num = 0


def read_all_files(root_folder):
    """Load every file below ``root_folder`` into the module-level ``data`` dict.

    The folder is walked recursively. Each file's full content is appended to
    ``data['text']`` and a running integer id (the global ``id_num``) is
    appended to ``data['id']``. Files named ``.DS_Store`` (any casing) are
    skipped.

    Args:
        root_folder: Path of the directory to walk. A missing directory yields
            no files and leaves ``data`` unchanged.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    global id_num
    # Recursively visit every folder and file under root_folder.
    for root, dirs, files in os.walk(root_folder):
        # os.walk yields entries in arbitrary filesystem order. Sorting (dirs in
        # place, so the descent order is affected too) makes the id assigned to
        # each document reproducible across runs and machines.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.lower() == '.ds_store':
                continue
            file_path = os.path.join(root, file_name)
            # Ignore entries that are not readable regular files (e.g. broken links).
            if not os.path.isfile(file_path):
                continue
            # Explicit UTF-8: the platform default encoding is not UTF-8
            # everywhere, and the corpus contains non-ASCII characters.
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            data['text'].append(content)
            data['id'].append(id_num)
            id_num += 1


# Populate `data` from the configured corpus folder.
read_all_files(ROOT_FOLDER)


In [34]:
# Data lookup: print the corpus size, then display the first five documents.
print(len(data["text"]))
data["text"][:5]

6133


["Sunjo of Joseon (29 July 1790 – 13 December 1834, reigned 1800–1834) was the 23rd king of the Korean Joseon Dynasty. Sunjo was the 2nd son of King Jeongjo with Royal Noble Consort Su, one of King Jeongjo's concubines.\n\nBiography\nHe was born with the title of His Royal Highness Prince Yi Gong on 29 July 1790 (18 June 1790 in lunar calendar). Upon the death of his father, King Jeongjo, Yi Gong ascended to the throne at age 10 on 4 July 1800, lunar calendar.\nIn 1802, aged 13, King Sunjo married Lady Kim, known posthumously as Queen Sunwon, daughter of Kim Jo-sun, who was a leader of the Andong Kim clan.\nSince he ascended the throne at a young age, Queen Dowager Jeongsun, the second queen of King Yeongjo, ruled as queen regent, which allowed her to wield power over state affairs. Despite King Sunjo's efforts to reform politics, the fundamental principles of government deteriorated. The state examination became disordered and corruption in the government personnel administration prev

In [5]:
# Smoke-test corpus: one hand-picked document with id 0.
# Rebinding `data` here replaces whatever corpus the name held before.
data = {
    'text': ['''Taejo of Joseon (4 November 1335 – 27 June 1408),[ii][iii][iv] personal name Yi Seong-gye (Korean: 이성계; Hanja: 李成桂), later Yi Dan (Korean: 이단; Hanja: 李旦), was the founder and first ruler of the Joseon dynasty of Korea. After overthrowing the Goryeo dynasty, he ascended to the throne in 1392 and abdicated six years later during a strife between his sons. He was honored as Emperor Go (Korean: 고황제; Hanja: 高皇帝) following the establishment of the Korean Empire.

Taejo emphasized continuity over change. No new institutions were created and no massive purges occurred during his reign. His new dynasty was largely dominated by the same ruling families and officials that had served the previous regime.[4] He re-established amicable ties with Japan and improved relations with Ming China.'''],
    'id': [0],
}

# Information extraction pipeline with Diffbot

In [6]:
# Comma-separated list of analyses requested from the API (sent as `fields=`).
FIELDS = "entities, facts"
HOST = "nl.diffbot.com"
# API credential. Set the DIFFBOT_TOKEN environment variable to supply your own.
# SECURITY: the literal fallback is kept only so existing runs keep working;
# it is already exposed in this notebook, so rotate it and delete the fallback.
DIFF_TOKEN = os.environ.get("DIFFBOT_TOKEN", "653acbe41b357b90a43aa2319d156682")


def nlp_request(payload):
    """POST ``payload`` to the NLP endpoint and return the decoded JSON reply.

    The request goes to ``https://{HOST}/v1/`` with ``FIELDS`` and
    ``DIFF_TOKEN`` passed in the query string and ``payload`` sent as the JSON
    body.

    Args:
        payload: JSON-serialisable request body.

    Returns:
        The parsed JSON response, or ``None`` when the request fails (network
        error, timeout, non-2xx status, or a body that is not valid JSON).
        Failures are reported on stdout instead of being raised.
    """
    try:
        res = requests.post(
            f"https://{HOST}/v1/?fields={FIELDS}&token={DIFF_TOKEN}",
            json=payload,
            # Without a timeout a stalled connection blocks the notebook forever.
            timeout=300,
        )
        if not res.ok:
            # Report only status and body: the stock HTTPError message embeds
            # the full URL, which would print the API token into the output.
            print(
                f"Failed NLP request due to HTTP {res.status_code}: {res.text[:200]}"
            )
            return None
        return res.json()
    except Exception as e:
        print(f"Failed NLP request due to {e}")
        return None

In [7]:
# Number of documents sent to the NLP API per request.
batch_size = 50
# One dict per processed document: {"id": ..., "entity": [...], "fact": [...]}.
nlp_results = list()

# Step over the documents themselves. len(data) would be the number of keys in
# the dict (2), which stops the loop after the first batch.
for offset in range(0, len(data["text"]), batch_size):
    # Batch data
    batch = data["text"][offset : offset + batch_size]
    batch_ids = data["id"][offset : offset + batch_size]
    payload = [
        {"content": el, "format": "plain text", "lang": "en"} for el in batch
    ]
    # Make request to NLP API
    nlp_response = nlp_request(payload)
    # nlp_request returns None on failure, and anything other than a list of
    # per-document rows cannot be parsed below. Skip the batch, keep going.
    if not isinstance(nlp_response, list):
        print(
            f"Skipping documents {offset}-{offset + len(batch) - 1}: "
            "no usable NLP response"
        )
        continue

    # Parse information
    entities = []
    facts = []
    for row in nlp_response:
        # Parse entities; untyped entities are dropped because the first type
        # name is recorded as the entity's type.
        row_entity = [
            {
                "name": x["name"],
                "uri": x.get("diffbotUri"),
                "confidence": x["confidence"],
                "type": x["allTypes"][0].get("name"),
            }
            for x in row.get("entities", [])
            if x["allTypes"]
        ]
        entities.append(row_entity)
        # Parse facts; keep only facts whose two ends are typed and distinct.
        # NOTE(review): both "uri" fields below read the diffbotUri of the
        # entity's first *type*, not of the entity itself (the entity parsing
        # above reads x["diffbotUri"]) — confirm this is intended.
        row_facts = [
            {
                "source": {
                    "name": x["entity"]["name"],
                    "uri": x["entity"]["allTypes"][0].get("diffbotUri")
                },
                "relationship": x["property"]["name"],
                "target": {
                    "name": x["value"]["name"],
                    "uri": x["value"]["allTypes"][0].get("diffbotUri")
                },
                "confidence": x["confidence"],
            }
            for x in row.get("facts", [])
            if x["entity"]["allTypes"]
            and x["value"]["allTypes"]
            and x["entity"]["name"] != x["value"]["name"]
        ]
        facts.append(row_facts)

    # Construct results by appending the article id to extracted NLP information.
    # The loop variable is deliberately not called `id_num`, so it cannot
    # overwrite the module-level id counter used when loading the corpus.
    for doc_id, entity, fact in zip(batch_ids, entities, facts):
        nlp_results.append({"id": doc_id, "entity": entity, "fact": fact})

[{'entities': [{'name': 'Korean Empire', 'diffbotUri': 'https://diffbot.com/entity/E9mn6Csf5MYmLwEaU_ECGCQ', 'confidence': 0.9885841, 'salience': 0.9871738, 'isCustom': False, 'allUris': ['http://www.wikidata.org/entity/Q28233'], 'allTypes': [{'name': 'location', 'diffbotUri': 'https://diffbot.com/entity/EiCyWUm4lNziqgLHx47iAIQ', 'dbpediaUri': 'http://dbpedia.org/ontology/Place'}, {'name': 'administrative area', 'diffbotUri': 'https://diffbot.com/entity/EcTIu1tWKPouIa6qZtSpc4A', 'dbpediaUri': 'http://dbpedia.org/ontology/PopulatedPlace'}, {'name': 'country', 'diffbotUri': 'https://diffbot.com/entity/E9bmxGo8aNgCDUxsGSFl_9A', 'dbpediaUri': 'http://dbpedia.org/ontology/Country'}], 'mentions': [{'text': 'Korean Empire', 'beginOffset': 446, 'endOffset': 459, 'confidence': 0.9885841}], 'location': {'latitude': 37.533333, 'longitude': 126.98333, 'precision': 469.8404}}, {'name': 'Taejo of Joseon', 'diffbotUri': 'https://diffbot.com/entity/EJvyPjCUxNG6Lo_dzHSN1aA', 'confidence': 0.9999341, 's

In [8]:
OUTPUT_FILE = "../data/taejo.json"
# Create the destination folder on first use; open() does not create missing
# directories and would fail with FileNotFoundError.
out_dir = os.path.dirname(OUTPUT_FILE)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
# Serialise all extraction results as a single JSON array.
with open(OUTPUT_FILE, "w", encoding="utf-8") as fout:
    json.dump(nlp_results, fout)