# Naver CLOVA OCR
https://www.ncloud.com/

In [1]:
from dotenv import load_dotenv
import os

import requests
import uuid
import time
import json


#### load keys
# load_dotenv() comes from python-dotenv; the settings below are then read
# from the process environment.
load_dotenv()
ROOT_DIR = os.getenv("ROOT_DIR")

secret_key = os.getenv("secret_key")
api_url = os.getenv("invoke_url")


#### paths
# Alternative single-file inputs kept for manual runs:
# pdf_path = os.path.join(ROOT_DIR, "data", "soil_suitability", "sample_image_01.pdf")
# pdf_path = os.path.join(ROOT_DIR, "data", "soil_suitability", "sample_image_05.pdf")
# pdf_path = os.path.join(ROOT_DIR, "data", "soil_suitability", "soil_suitability_160.pdf")

# File names follow the pattern NN0_NN9.pdf for NN = 00 .. 15
# (000_009.pdf, 010_019.pdf, ..., 150_159.pdf).
pdf_path_list = [
    os.path.join(ROOT_DIR, "data", "soil_suitability", f"{num:02d}0_{num:02d}9.pdf")
    for num in range(16)
]

In [146]:
#### input path
pdf_path = pdf_path_list[15]


#### api params
# JSON "message" part of the multipart request: one PDF image entry, a fresh
# request id and the current time in milliseconds.
request_json = {
    'images': [
        {
            'format': 'pdf',
            'name': 'demo'
        }
    ],
    'requestId': str(uuid.uuid4()),
    'version': 'V2',
    'timestamp': int(round(time.time() * 1000))
}

payload = {'message': json.dumps(request_json).encode('UTF-8')}

# Read the PDF inside a context manager so no file handle is left open, and so
# the same `files` value can be posted again (an already-consumed file object
# would upload an empty body on a second request).
with open(pdf_path, 'rb') as pdf_file:
    pdf_bytes = pdf_file.read()

# (filename, content) keeps the multipart filename equal to the PDF's base
# name, which is what requests derives from an open file object.
files = [
    ('file', (os.path.basename(pdf_path), pdf_bytes))
]
headers = {
  'X-OCR-SECRET': secret_key
}

In [147]:
#### api
# Send the multipart POST (headers + JSON message + PDF) and show the response object.
response = requests.post(api_url, headers=headers, data=payload, files=files)
print(response)

<Response [200]>


In [148]:
# Name all outputs after the input PDF's base name without its extension
# (".../150_159.pdf" -> "150_159"). Deriving it from the path instead of a
# fixed [-11:-4] slice also works for file names that are not exactly seven
# characters long.
file_name = os.path.splitext(os.path.basename(pdf_path))[0]
file_name

'150_159'

#### save response (middle)

In [149]:
#### for checking data (to text)
# Write the recognised text to <ROOT_DIR>/data/response/<file_name>.txt so the
# OCR result can be inspected by eye: tokens are joined with single spaces and
# a new output line starts after every field whose 'lineBreak' is truthy.
txt_path = os.path.join(ROOT_DIR, "data", "response", file_name+".txt")

pages = response.json()['images']

# The context manager closes the file even if a page has an unexpected shape.
with open(txt_path, "w") as fw:
    for page in pages:
        texts_list = []

        for obj in page['fields']:
            texts_list.append(obj['inferText'])

            if obj['lineBreak']:
                print(" ".join(texts_list), file=fw)
                texts_list = []

        # Tokens after the page's last line break would otherwise be lost.
        if texts_list:
            print(" ".join(texts_list), file=fw)

In [150]:
#### save response
# Show the type of the decoded body, then store it as pretty-printed JSON
# (non-ASCII characters are written as-is, not escaped).
print(type(response.json()))


json_path = os.path.join(ROOT_DIR, "data", "response", f"{file_name}.json")
with open(json_path, "w") as f:
    json.dump(response.json(), f, ensure_ascii=False, indent=4)

<class 'dict'>


#### json to json (for test)

In [None]:
# pdf_path = pdf_path_list[15]
# file_name = pdf_path[-11:-4]
# json_path = os.path.join(ROOT_DIR, "data", "response", file_name+".json")
# file_name

'150_159'

## parsing ocr_text

In [477]:
#### load raw response
# pages = response.json()['images'] # from memory
with open(json_path) as f: # from .json
    response_json = json.load(f)
pages = response_json['images']


#### page to dict
# One entry per page: crop name -> page text, with a "\n" after every field
# whose 'lineBreak' is truthy.
crop_dict = {}

for page in pages:
    crop_name = ""
    find_crop = False
    chunks = []

    for position, field in enumerate(page['fields']):
        text = field['inferText']

        #### extract crop name
        if position == 0:
            # "number + crop name" in one field: keep what follows the first
            # space. A field without a space is only the number and is skipped.
            if " " in text:
                crop_name = text[text.find(" ")+1:]
                find_crop = True
            continue

        if position == 1 and not find_crop: # crop name in its own field
            crop_name = text
            continue

        #### object to full text
        # Spaces inside a field are dropped; fields are separated by one space.
        chunks.append(text.replace(" ", "") + " ")

        #### break line
        if field['lineBreak']:
            chunks.append("\n")

    crop_dict[crop_name] = "".join(chunks)

list(crop_dict)
# crop_dict[list(crop_dict.keys())[-2]]

['파프리카', '팥', '포도', '플럼코트', '피(사료용)', '피망', '호박', '황금', '황기', '황련']

In [478]:
temp_dict = {}
#### col variation of ocr text
# Header prefixes as they appear in the OCR text, with and without inner spaces.
raw_header_variants = """
    식물명
    
    학명(정명)
    학명 (정명)
    
    분류
    
    지역별다른이름
    지역별 다른이름
    
    작물의생태
    작물의 생태
    
    작물생육에따른
    작물생육에 따른
    작물 생육에따른
    작물 생육에 따른
    
    작물재배에알맞은
    작물 재배에알맞은
    작물재배에 알맞은
    작물 재배에 알맞은
"""

# Removing double spaces strips the indentation; blank entries are dropped.
filter_list = [
    entry
    for entry in raw_header_variants.replace("  ", "").split("\n")
    if entry
]

# Unit/symbol fragments removed from a section header before it is used as a key.
filt_remove_list = ["(°C)", "°C)", "(°C", "°C", "C", "°", "·"]

# Crops for which the "exactly 8 header hits" check below is not enforced.
except_crop_list = ["줄맨드라미(아마란스)", "천마"]


#### parsing table
for crop_name, crop_text in crop_dict.items():
    #### line
    line_list = [row for row in crop_text.split("\n") if row]


    #### extract col idx
    # One entry per (line, matching header variant) pair, in line order.
    idx_list = [
        idx_l
        for idx_l, line in enumerate(line_list)
        for header in filter_list
        if header in line
    ]


    #### error
    if len(idx_list) != 8 and crop_name not in except_crop_list:
        for idx_l in idx_list: # check idx & col name
            print(line_list[idx_l])
        raise ValueError(f"error : {file_name} : crop {crop_name} : divide per table : \n" + 
                            f"{len(idx_list)} \n {idx_list} \n " + f"{line_list}")


    #### divide to table
    # Everything before the fifth header hit is collected under "이름"; each
    # later header line opens a new section keyed by its cleaned-up text.
    col_name = "이름"
    inner_dict = {col_name: ""}
    section_header_rows = idx_list[4:]

    for idx_l, line in enumerate(line_list):
        if idx_l not in section_header_rows:
            inner_dict[col_name] += line + "\n"
            continue

        col_name = line.replace(" ", "")
        for str_remove in filt_remove_list:
            col_name = col_name.replace(str_remove, "")
        inner_dict[col_name] = ""


    temp_dict[crop_name] = inner_dict
    print(inner_dict.keys(), crop_name)

dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 파프리카
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 팥
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 포도
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 플럼코트
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 피(사료용)
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 피망
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 호박
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 황금
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 황기
dict_keys(['이름', '작물의생태생리특성', '작물생육에따른기상특성', '작물재배에알맞은토양형태적특성및물리성', '작물재배에알맞은토양화학성']) 황련


In [479]:
# Rebind crop_dict to the dict built in temp_dict; both names refer to the
# same object until temp_dict is reassigned.
crop_dict = temp_dict

### 이름

In [480]:
# Parse each crop's "이름" section (a newline-separated string) into a
# header -> value dict and stage it in temp_dict[crop_name]["이름"].
temp_dict = {}

#### parsing inner-inner (이름)
for crop_name in crop_dict.keys():
    temp_dict[crop_name] = {}
    inner_inner_dict = {}
    temp_line_list = []  # buffer of non-header lines, reset for every crop
    
    line_list = crop_dict[crop_name]['이름'].split("\n")
    line_list = list(filter(None, line_list))  # drop empty lines
    idx_list = []
    
    
    #### extract col idx
    # Record the index of every line that contains a known header variant;
    # a line matching several variants is recorded once per match.
    for idx_l, line in enumerate(line_list):
        for col_name in filter_list:
            if col_name in line:
                idx_list.append(idx_l)
                
    
    #### parsing table
    # print("\n crop_name :", crop_name, "len_line_list :", len(line_list),  "idx_list :", idx_list)
    for idx_l, line in enumerate(line_list):
        if idx_l in idx_list: # line contains a header
            if idx_l == 0:
                # First line: text before the first space is the key, the rest the value.
                # NOTE(review): find() returns -1 when the line has no space, which
                # makes the key line[:-1] and the value the last character.
                idx_cut = line.find(" ")
                inner_inner_dict[line[:idx_cut]] = line[idx_cut:].strip()
            elif idx_l-1 in idx_list or idx_l+1 in idx_list:
                # Header line directly next to another header line: same
                # "key value" split on the first space.
                idx_cut = line.find(" ")
                inner_inner_dict[line[:idx_cut]] = line[idx_cut:].strip()
            else:
                # Header line with no neighbouring header line: the whole line
                # (spaces removed) becomes the key and starts with an empty value.
                col_name = line.replace(" ", "")
                inner_inner_dict[col_name] = ""
        else:
            # Non-header lines are buffered and appended two at a time to the
            # value stored under col_name.
            # NOTE(review): col_name is whatever was bound last - either by the
            # branch above or by the filter_list loop in "extract col idx".
            # NOTE(review): a single line still buffered when the crop ends is
            # never appended.
            temp_line_list += [line] # 2-line spec (except > 3)
            if len(temp_line_list) >= 2:
                for line in temp_line_list:
                    inner_inner_dict[col_name] += line
                inner_inner_dict[col_name] = inner_inner_dict[col_name].strip()
                temp_line_list = []
        
            
    temp_dict[crop_name]["이름"] = inner_inner_dict
    # print(inner_inner_dict)
    

In [481]:
# Copy the parsed "이름" entry from temp_dict into crop_dict for every crop.
for crop_name, crop_sections in crop_dict.items():
    crop_sections["이름"] = temp_dict[crop_name]["이름"]

### 작물의 생태 생리특성

In [None]:
# Marks accepted as "yes" / "no" / "not filled in" for the O/X columns.
O_list = ["O", "o" ,"○" ,"0"]
X_list = ["X", "x"]
blank_list = ["", " "]
temp_dict = {}


def _parse_ox_mark(mark, label):
    """Map an O/X cell to True / False / None (blank).

    Raises:
        ValueError: if ``mark`` is none of the known O, X or blank spellings.
    """
    if mark in O_list:
        return True
    if mark in X_list:
        return False
    if mark in blank_list:
        return None
    raise ValueError(f"error : parssing : {label} : val : {mark, len(mark)}")


#### parsing inner-inner (작물의생태생리특성)
# Each line of the section is recognised by the label it contains; the value
# is what follows the label's fixed-width prefix. Results are staged in
# temp_dict[crop_name]["작물의생태생리특성"].
for crop_name in crop_dict.keys():
    temp_dict[crop_name] = {}
    inner_inner_dict = {}


    #### parsing table
    # [:-1] drops the trailing newline of the section text.
    for val in crop_dict[crop_name]["작물의생태생리특성"][:-1].split("\n"):

        # Blank lines carry no column; skip them instead of treating them as
        # an unknown column.
        if not val.strip():
            continue

        if "생태형" in val:
            inner_inner_dict["생태형"] = val[3:].strip()

        elif "월동여부" in val:
            inner_inner_dict["월동여부"] = _parse_ox_mark(val[4:].strip(), "월동여부")

        elif "생리특성" in val:
            inner_inner_dict["생리특성"] = val[4:].strip()

        elif "질소고정균" in val:
            inner_inner_dict["질소고정균공생여부"] = _parse_ox_mark(val[10:].strip(), "질소고정균공생여부")

        elif "초본/목본" in val:
            inner_inner_dict["초본/목본"] = val[8:].strip()

        else:
            # The original built this exception without raising it, so
            # unrecognised lines were silently ignored.
            raise ValueError(f"error : parssing : 작물의생태생리특성 : column : {val}")


    temp_dict[crop_name]["작물의생태생리특성"] = inner_inner_dict

In [483]:
# Copy the parsed "작물의생태생리특성" entry from temp_dict into crop_dict for every crop.
for crop_name, crop_sections in crop_dict.items():
    crop_sections["작물의생태생리특성"] = temp_dict[crop_name]["작물의생태생리특성"]

### 작물 생육에 따른 기상특성

In [None]:
#### parsing inner-inner (작물생육에따른기상특성)
# Split every line of the section into label/value pairs. A line is broken
# into space-separated tokens; the positions of the numeric-looking tokens
# (isdigit_list) decide which tokens are labels and which are values.
# Results are staged in temp_dict[crop_name]["작물생육에따른기상특성"].
temp_dict = {}

# Substrings removed from a token before the isdigit() test, so that ranges,
# separators and units do not stop a token from counting as numeric.
flt_isdigit_rlist = [" ", "-", "~", ".", ",",
                        ">", "<", "(", ")",
                        "mm", "ml", "mL", "L",
                    ]


for crop_name in crop_dict.keys():
    temp_dict[crop_name] = {}
    inner_inner_dict = {}
    # Label taken from the most recent single-token line. It must start as a
    # string: the branches below call temp_str.strip(), which failed with
    # AttributeError on the list it used to be initialised with.
    temp_str = ""


    #### parsing table
    # [:-1] drops the trailing newline of the section text.
    for line in crop_dict[crop_name]["작물생육에따른기상특성"][:-1].split("\n"):
        line = line.strip()
        vals_list = line.split(" ")
        isdigit_list = []  # indices of tokens that look numeric

        for i_flt, val in enumerate(vals_list):
            for flt_str in flt_isdigit_rlist:
                val = val.replace(flt_str, "")

            if val.isdigit():
                isdigit_list.append(i_flt)


        #### case - len vals
        if len(vals_list) == 2: # pair 1
            inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()

        elif len(vals_list) == 4:
            if isdigit_list == [1, 3]: # pair 2
                inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()
                inner_inner_dict[vals_list[2].strip()] = vals_list[3].strip()

            elif "(" in vals_list[1] or ")" in vals_list[1]: # pair 1 (with space in first col)
                inner_inner_dict[" ".join(vals_list[0:3]).strip()] = vals_list[3].strip()

            elif isdigit_list == [1, 2, 3]:
                # label + three numeric tokens: two of them form one
                # comma-separated value, the third belongs to temp_str.
                # NOTE(review): when neither token contains a comma the line
                # is skipped without storing anything.
                if "," in vals_list[1]:
                    inner_inner_dict[vals_list[0].strip()] = ", ".join(vals_list[1:3]).strip()
                    inner_inner_dict[temp_str.strip()] = vals_list[3].strip()

                elif "," in vals_list[2]:
                    inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()
                    inner_inner_dict[temp_str.strip()] = ", ".join(vals_list[2:4]).strip()


            elif isdigit_list == [0, 1, 3]:
                inner_inner_dict[temp_str.strip()] = ", ".join(vals_list[0:2]).strip()
                inner_inner_dict[vals_list[2].strip()] = vals_list[3].strip()

            elif isdigit_list == [0, 2, 3]:
                # value, label, value, value: the last two tokens are the
                # value of the label at index 1. The original joined tokens
                # 0 and 1 here, i.e. the other value and the label itself.
                inner_inner_dict[temp_str.strip()] = vals_list[0].strip()
                inner_inner_dict[vals_list[1].strip()] = ", ".join(vals_list[2:4]).strip()

            else: 
                inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()
                inner_inner_dict[vals_list[2].strip()] = vals_list[3].strip()

            # else: raise ValueError(f"error : {file_name} : crop {crop_name} : 기상특성 : len(vals_list) == 4 : \n {line} \n {isdigit_list}")


        elif len(vals_list) > 4: # etc
            if isdigit_list == [1, 4]: # 2-5
                inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()
                inner_inner_dict[" ".join(vals_list[2:4]).strip()] = vals_list[4].strip()

            elif isdigit_list == [2, 4]: # 2-5
                inner_inner_dict[" ".join(vals_list[0:2]).strip()] = vals_list[2].strip()
                inner_inner_dict[vals_list[3].strip()] = vals_list[4].strip()

            elif isdigit_list == [1, 3, 5]: # 3-6
                inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()
                inner_inner_dict[vals_list[2].strip()] = vals_list[3].strip()
                inner_inner_dict[vals_list[4].strip()] = vals_list[5].strip()

            else: # 2-n
                # Fall back to cutting the line right after the last
                # occurrence of a temperature word; the text before the cut
                # is the label, the rest the value.
                idx_tem = 0
                if not line.rfind("기온") == -1: idx_tem = line.rfind("기온") + 2
                elif not line.rfind("적온") == -1: idx_tem = line.rfind("적온") + 2
                elif not line.rfind("온도") == -1: idx_tem = line.rfind("온도") + 2
                else: raise ValueError(f"error : {file_name} : crop {crop_name} : 기상특성 : len(vals_list) > 4 : \n {line}")
                inner_inner_dict[line[:idx_tem].strip()] = line[idx_tem:].strip()

        elif len(vals_list) == 1: # over line
            # A lone token is kept as the label for a value on a later line.
            temp_str = line

        elif len(vals_list) == 3: # over line
            # NOTE(review): any other numeric pattern on a three-token line is
            # skipped without storing anything.
            if isdigit_list == [0, 2]:
                inner_inner_dict[temp_str.strip()] = vals_list[0].strip()
                inner_inner_dict[vals_list[1].strip()] = vals_list[2].strip()
            elif isdigit_list == [1, 2]:
                inner_inner_dict[vals_list[0].strip()] = vals_list[1].strip()
                inner_inner_dict[temp_str.strip()] = vals_list[2].strip()


        # str.split(" ") never returns an empty list, so this is a safety net only.
        else: raise ValueError(f"error : {file_name} : crop {crop_name} : 기상특성 : len 1 3 ? : \n {line}")


    temp_dict[crop_name]["작물생육에따른기상특성"] = inner_inner_dict
        

In [485]:
# Copy the parsed "작물생육에따른기상특성" entry from temp_dict into crop_dict for every crop.
for crop_name, crop_sections in crop_dict.items():
    crop_sections["작물생육에따른기상특성"] = temp_dict[crop_name]["작물생육에따른기상특성"]

### 작물 재배에 알맞은 토양 형태적 특성 및 물리성
124 인삼, 126 작두콩

In [486]:
#### parsing inner-inner (작물재배에알맞은토양형태적특성및물리성)
# Map the value row of the section onto a fixed column list chosen by the
# number of values (5 or 6). Results are staged in
# temp_dict[crop_name]["작물재배에알맞은토양형태적특성및물리성"].
temp_dict = {}


#### variation of column (작물재배에알맞은토양형태적특성및물리성)
col_list_05 = ['지형', '경사(%)', '토성', '유효토심(cm)', '배수등급']
# NOTE(review): '경사(%)' appears twice here, so the sixth value overwrites
# the second one in the resulting dict.
col_list_06_01 = ['지형', '경사(%)', '토성', '유효토심(cm)', '배수등급', '경사(%)']
col_list_06_02 = ['지형', '경사(%)', '토성', '유효토심(cm)', '배수등급', '해발(cm)']

# Unit fragments removed from the header line before looking for column words.
flt_tem_rlist = ["(cm)", "cm)", "(cm",
                    "(m)", "m)", "(m",
                    "(%)", "%)", "(%"]


except_crop_list_01 = ["줄맨드라미(아마란스)", "천마"] # none
except_crop_list_02 = ["작두콩", "쥐눈이콩"] # multi_row
except_crop_list_03 = ["인삼"] # impossible


for crop_name in crop_dict.keys():
    temp_dict[crop_name] = {}
    inner_inner_dict = {}

    #### parsing table
    # Crops listed as having no such table get an empty dict.
    if crop_name in except_crop_list_01:
        temp_dict[crop_name]["작물재배에알맞은토양형태적특성및물리성"] = inner_inner_dict
        continue

    # [:-1] drops the trailing newline of the section text.
    line_list = crop_dict[crop_name]["작물재배에알맞은토양형태적특성및물리성"][:-1].split("\n")


    #### case - len rows
    if len(line_list) == 2: # row 2 : header line + value line
        col_str = line_list[0]
        for str_remove in flt_tem_rlist:
            col_str = col_str.replace(str_remove, "")

        val_list = list(filter(None, line_list[1].split(" ")))

        #### case - len vals
        # NOTE(review): the "해발" list is only chosen when "경사" is absent
        # from the header text - confirm that this order is intended.
        if len(val_list) == 5: col_list = col_list_05
        elif len(val_list) == 6 and "경사" in col_str: col_list = col_list_06_01
        elif len(val_list) == 6 and "해발" in col_str: col_list = col_list_06_02
        # Report the number of values: col_list may be unbound (or left over
        # from a previous crop) at this point.
        else: raise ValueError(f"error : {file_name} : crop {crop_name} : 토양형태적특성및물리성 : len vals : {len(val_list)} : \n {line_list}")

        for idx_col, col_name in enumerate(col_list):
            inner_inner_dict[col_name] = val_list[idx_col].replace("-", "~").strip()

    elif len(line_list) == 4:  # row 2-4
        val_list = list(filter(None, line_list[3].split(" ")))
        if crop_name in except_crop_list_02:
            val_list = list(filter(None, line_list[2].split(" ")))
            val_list.insert(2, line_list[0]+","+line_list[2])
        len_col = len(val_list)

        # This branch used to read col_str without assigning it, which raised
        # NameError or reused the header of a previously processed crop. The
        # column words are looked up in this crop's own section text instead.
        col_str = " ".join(line_list)


        #### case - len vals
        if len_col == 5: col_list = col_list_05
        elif len_col == 6 and "경사" in col_str: col_list = col_list_06_01
        elif len_col == 6 and "해발" in col_str: col_list = col_list_06_02
        elif crop_name in except_crop_list_03:
            temp_dict[crop_name]["작물재배에알맞은토양형태적특성및물리성"] = inner_inner_dict
            continue
        else: raise ValueError(f"error : {file_name} : crop {crop_name} : 토양형태적특성및물리성 : len vals : {len_col} : \n {line_list}")

        for idx_col, col_name in enumerate(col_list):
            inner_inner_dict[col_name] = val_list[idx_col].replace("-", "~").strip()

    elif len(line_list) > 4:
        # Sections longer than four lines are left unparsed (empty dict).
        pass

    else: raise ValueError(f"error : {file_name} : crop {crop_name} : 토양형태적특성및물리성 : \n {line_list}")


    temp_dict[crop_name]["작물재배에알맞은토양형태적특성및물리성"] = inner_inner_dict

In [487]:
# Print each crop name, then copy its parsed
# "작물재배에알맞은토양형태적특성및물리성" entry from temp_dict into crop_dict.
for crop_name, crop_sections in crop_dict.items():
    print(crop_name)
    crop_sections["작물재배에알맞은토양형태적특성및물리성"] = temp_dict[crop_name]["작물재배에알맞은토양형태적특성및물리성"]

파프리카
팥
포도
플럼코트
피(사료용)
피망
호박
황금
황기
황련


### 작물 재배에 알맞은 토양 화학성

In [488]:
#### parsing inner-inner (작물재배에알맞은토양화학성)
# Find the value line(s) of the section - lines containing more than three
# "~" range marks - and map their values onto the column list that matches
# the number of values. Results are staged in
# temp_dict[crop_name]["작물재배에알맞은토양화학성"].
temp_dict = {}

# Column names keyed by the number of values found on a value line.
col_list_dict = {
                    "6" : "산도 유효인산 칼륨 칼슘 마그네슘 양이온교환용량".split(" "),
                    "7" : "산도 유기물 유효인산 칼륨 칼슘 마그네슘 양이온교환용량".split(" "),
                    "8" : "산도 전기전도도 유기물 유효인산 칼륨 칼슘 마그네슘 양이온교환용량".split(" "),
                    "9" : "산도 전기전도도 질산태질소 유기물 유효인산 칼륨 칼슘 마그네슘 양이온교환용량".split(" "),
                    "10" : "산도 전기전도도 질산태질소 유기물 유효인산 칼륨 칼슘 마그네슘 양이온교환용량 붕소".split(" "),
            }

except_crop_list_04 = ["천마"] # none


for crop_name in crop_dict.keys():
    temp_dict[crop_name] = {}
    inner_inner_dict = {}

    #### find values
    # Crops listed as having no such table get an empty dict.
    if crop_name in except_crop_list_04:
        temp_dict[crop_name]["작물재배에알맞은토양화학성"] = inner_inner_dict
        continue

    # [:-1] drops the trailing newline of the section text.
    line_list = crop_dict[crop_name]["작물재배에알맞은토양화학성"][:-1].split("\n")

    # Scan from the last line upwards; if several lines qualify, lines nearer
    # the top are processed later and overwrite earlier results.
    for raw_line in reversed(line_list):
        line = raw_line.replace("-", "~")
        val_list = list(filter(None, line.split(" ")))


        #### values
        if line.count("~") > 3:

            if 6 <= len(val_list) <= 10:
                col_list = col_list_dict[f"{len(val_list)}"]

                # A value without a range mark gets a leading "~".
                val_list = [val if "~" in val else "~" + val for val in val_list]

                # col_list has exactly len(val_list) names, so zip pairs them
                # all. This assignment used to be nested inside the
                # normalisation loop and was repeated once per value.
                for col_name, val in zip(col_list, val_list):
                    inner_inner_dict[col_name] = val

            else: raise ValueError(f"error : {file_name} : crop {crop_name} : 작물재배에알맞은토양화학성 : \n {val_list}")


    temp_dict[crop_name]["작물재배에알맞은토양화학성"] = inner_inner_dict

In [489]:
# Copy the parsed "작물재배에알맞은토양화학성" entry from temp_dict into crop_dict for every crop.
for crop_name, crop_sections in crop_dict.items():
    crop_sections["작물재배에알맞은토양화학성"] = temp_dict[crop_name]["작물재배에알맞은토양화학성"]

In [490]:
#### save
# Write crop_dict as pretty-printed JSON to
# <ROOT_DIR>/data/ocr_json/<file_name>.json (non-ASCII characters are written as-is).
output_path = os.path.join(ROOT_DIR, "data", "ocr_json", f"{file_name}.json")


with open(output_path, "w") as f:
    json.dump(crop_dict, f, ensure_ascii=False, indent=4)