# 논문 - 쿼리에 따른 검색

## 논문 호출용 클래스 및 함수 정의

In [2]:
import base64
from Crypto.Cipher import AES

class AESTestClass:
    """Encrypt text with AES-CBC and return it as URL-safe Base64.

    The plaintext is PKCS#7-padded to the block size, encrypted with AES in
    CBC mode using a fixed IV, and the ciphertext is Base64 (URL-safe)
    encoded.

    Args:
        plain_txt: Text to encrypt.
        key: AES key as text; it is UTF-8 encoded before use. Its length is
            not validated here.
    """

    def __init__(self, plain_txt, key):
        # The IV and block size are fixed.
        # NOTE(review): a constant IV is presumably required by the remote
        # service that decrypts this payload -- confirm before changing it.
        self.iv = 'jvHJ1EFA0IXBrxxz'
        self.block_size = 16
        self.plain_txt = plain_txt
        self.key = key

    def pad(self):
        """Return the plaintext with PKCS#7 padding appended.

        The padding length is derived from the UTF-8 *byte* length of the
        plaintext, because that is what gets encrypted. Counting characters
        instead would leave non-ASCII text misaligned with the block size.

        Returns:
            The plaintext followed by N copies of ``chr(N)``, where N is in
            ``1..block_size``.
        """
        byte_length = len(self.plain_txt.encode('utf-8'))
        number_of_bytes_to_pad = self.block_size - byte_length % self.block_size
        # chr(1)..chr(16) each encode to exactly one UTF-8 byte, so appending
        # N such characters adds exactly N bytes after encoding.
        padding_str = chr(number_of_bytes_to_pad) * number_of_bytes_to_pad
        return self.plain_txt + padding_str

    def encrypt(self):
        """Encrypt the padded plaintext and return URL-safe Base64 text."""
        cipher = AES.new(self.key.encode('utf-8'), AES.MODE_CBC, self.iv.encode('utf-8'))
        padded_txt = self.pad()
        encrypted_bytes = cipher.encrypt(padded_txt.encode('utf-8'))
        # Base64-encode (URL-safe alphabet) so the ciphertext can travel in a URL.
        encrypted_str = base64.urlsafe_b64encode(encrypted_bytes).decode('utf-8')
        return encrypted_str

In [3]:
import xml.etree.ElementTree as ET
import pandas as pd

# Convert an XML-formatted result into a DataFrame.
def xml_to_df(xml):
    """Parse an XML response string into a pandas DataFrame.

    Looks for a ``recordList`` element directly under the root. Each
    ``record`` inside it becomes one row; each ``item`` of a record becomes a
    column named by its ``metaCode`` attribute, holding the item's stripped
    text (or ``''`` when the item has no text).

    Args:
        xml: XML document as a string.

    Returns:
        A DataFrame with one row per record; empty when there is no
        ``recordList`` element or it holds no records.
    """
    record_list = ET.fromstring(xml).find('recordList')
    if record_list is None:
        return pd.DataFrame([])

    rows = [
        {
            item.get('metaCode'): (item.text.strip() if item.text else '')
            for item in record.findall('item')
        }
        for record in record_list.findall('record')
    ]
    return pd.DataFrame(rows)

In [4]:
import json

# Rewrite a query into the form used by ScienceON.
def transform_query(input_query):
    """Wrap a search string in a compact JSON query object.

    Only the "BI" (all fields) key is populated. Other field codes listed in
    the original notes, currently unused:
        TI title, AU author, AB abstract, KW keyword,
        PB publisher (issuing organization), SN ISSN, BN ISBN,
        PY publication year, CN document number, DI DOI.

    Args:
        input_query: The search text.

    Returns:
        A JSON string with no whitespace between separators.
    """
    return json.dumps({"BI": input_query}, separators=(',', ':'))

In [5]:
import json, requests
from datetime import datetime

# Fetch an access token.
def call_access_token(MAC_ADDRESS, API_KEY, CLIENT_ID):
    """Request an access token from the KISTI API gateway.

    Builds a JSON payload with the normalized MAC address and the current
    local timestamp, encrypts it with ``AESTestClass`` using ``API_KEY``, and
    sends it with ``CLIENT_ID`` to the token endpoint.

    Args:
        MAC_ADDRESS: MAC address text; surrounding whitespace and quotes are
            removed, it is upper-cased, and ':' is replaced with '-'.
        API_KEY: Key passed to ``AESTestClass`` for encryption.
        CLIENT_ID: Client identifier sent with the request.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        SystemExit: If the normalized MAC address is empty.
        requests.HTTPError: If the response has an error status.
        KeyError: If the response JSON has no ``access_token`` key.
    """
    # Normalize the MAC address.
    normalized_mac = (MAC_ADDRESS or "").strip().strip('"').strip("'")
    normalized_mac = normalized_mac.upper().replace(":", "-")
    if normalized_mac == "":
        raise SystemExit("MAC_ADDRESS가 비어있음")

    # Timestamp in YYYYMMDDHHMMSS form (local time).
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    # Compact JSON payload, then AES encryption -> Base64.
    account_json = json.dumps(
        {"mac_address": normalized_mac, "datetime": timestamp},
        separators=(',', ':'),
    )
    encrypted_accounts = AESTestClass(plain_txt=account_json, key=API_KEY).encrypt()

    # Token request; requests handles URL-encoding of the parameters.
    response = requests.get(
        "https://apigateway.kisti.re.kr/tokenrequest.do",
        params={"accounts": encrypted_accounts, "client_id": CLIENT_ID},
        timeout=10,
    )
    response.raise_for_status()

    return response.json()['access_token']

## 예시 데이터

In [None]:
# Load the search queries: one per line, surrounding whitespace removed,
# blank lines skipped.
with open('../data/queries.txt', 'r', encoding='utf-8') as f:
    queries = [text for text in map(str.strip, f) if text]

## 작동 방식

In [None]:
import os, requests
import pandas as pd
from dotenv import load_dotenv

# Load credentials from the environment; values from .env override existing ones.
load_dotenv(override=True)

CLIENT_ID = os.getenv("SCIENCEON_CLIENT_ID")
ARTI_KEY = os.getenv("SCIENCEON_API_KEY")
MAC_ADDRESS = os.getenv("MAC_ADDRESS")

access_token = call_access_token(MAC_ADDRESS, ARTI_KEY, CLIENT_ID)

url = "https://apigateway.kisti.re.kr/openapicall.do"

# Collect one frame per query and concatenate once after the loop; calling
# pd.concat inside the loop re-copies the accumulated rows on every iteration.
frames = []

for query in queries:
    params = {
        "client_id": CLIENT_ID,
        "token": access_token,
        "version": 1.0,
        "action": "search",
        "target": "ARTI",
        "searchQuery": transform_query(query),
        'curPage': 1,  # current page number
        'rowCount': 20,  # rows per page (default 10, max 100)
    }

    res = requests.get(url, params=params, timeout=20)
    # Fail loudly on an HTTP error, as the token request does, instead of
    # trying to parse an error page as a search result.
    res.raise_for_status()
    xml = res.text
    tmp = xml_to_df(xml)
    tmp["query"] = query  # remember which query produced each row
    frames.append(tmp)

# pd.concat rejects an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# The same article can match several queries; keep its first occurrence.
df = df.drop_duplicates(subset='CN')
# df.to_csv('../data/search_results_article.csv', index=False, encoding='utf-8')
print('\n[total article length]\n', len(df))

# Keep the columns of interest and rename them.
cleaned_df = (
    df[
        ['CN', 'Title', 'Abstract', 'Pubyear', 'Keyword', 'Author', 'ContentURL', 'query']
    ]
    .rename(
        columns={
            'CN': 'ID',
            'Title': 'title',
            'Abstract': 'description',
            'Pubyear': 'pubyear',
            'Keyword': 'keyword',
            'Author': 'author',
            'ContentURL': 'URL'
        }
    )
)

cleaned_df['category'] = 'article'

display(cleaned_df.head())


[total article length]
 100


Unnamed: 0,ID,title,description,pubyear,keyword,author,URL,query,category
0,NART70198266,Twentieth century sea&#x2010;ice trends in the...,<▼1><P><B>Abstract</B><P>We present the first ...,2014,,"Sinclair, Kate E.;Bertler, Nancy A. N.;Bowen, ...",http://click.ndsl.kr/servlet/OpenAPIDetailView...,Ross Sea core,article
1,NART109782005,An 83 000-year-old ice core from Roosevelt Isl...,<P>Abstract. In 2013 an ice core was recovered...,2020,,"Lee, James E.;Brook, Edward J.;Bertler, Nancy ...",http://click.ndsl.kr/servlet/OpenAPIDetailView...,Ross Sea core,article
2,NART102750892,Geochemical characterization of a marine sedim...,<P><B>Abstract</B></P> <P>The chemical fracti...,2020,Antarctica . Ross sea . Marine sediment . Sequ...,"Burgay, F.;Abollino, O.;Buoso, S.;Costa, E.;Gi...",http://click.ndsl.kr/servlet/OpenAPIDetailView...,Ross Sea core,article
3,NPAP04903951,Palaeoenvironmental Inferences from Core ANTA9...,,2003,,"Orsini, G.;Giglio, F.;Langone, L.;Ravaioli, M.;",http://click.ndsl.kr/servlet/OpenAPIDetailView...,Ross Sea core,article
4,NART64771304,Little Ice Age climate and oceanic conditions ...,<P>Abstract. Increasing paleoclimatic evidence...,2012,,"Rhodes, R. H.;Bertler, N. A. N.;Baker, J. A.;S...",http://click.ndsl.kr/servlet/OpenAPIDetailView...,Ross Sea core,article
