In [None]:
#improve dataset
#Sentences with NO entities
#Multiple entities in one sentence
#Real user chat logs

#rule-based filtering 
#prompt tighting
#post validation

#false positive

In [None]:
!pip install yfinance


In [8]:
# stock_dict.py dic 만들기 
STOCK_TICKERS = [
    "삼성전자",
    "LG화학",
    "현대차",
    "네이버",
    "카카오",
    "SK하이닉스",
    "TIGER ETF 200",
    "애플",
    "마이크로소프트",
    "엔비디아"
]


In [22]:
#random data maker
# generate_no_entity.py
import json
import random

sentences = [
    "오늘 날씨 어때?",
    "점심 뭐 먹을까",
    "요즘 너무 바쁘다",
    "주식 말고 여행 이야기하자",
    "취업 준비 어떻게 하고 있어?",
    "이번 주말에 뭐해?",
    "운동 좀 해야겠다",
    "피곤해서 아무것도 하기 싫어",
]

with open("dataset/raw/no_entity.jsonl", "w", encoding="utf-8") as f:
    for _ in range(1666):
        text = random.choice(sentences)
        f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")


In [23]:
# generate_single_entity.py
import json
import random
from stock_dict import STOCK_TICKERS #yfinance lib

templates = [
    "{ticker} 전망 어때?",
    "{ticker} 주가 알려줘",
    "{ticker} 지금 사도 될까?",
    "{ticker} 하향가야?",
    "{ticker} 장기 투자 괜찮아?",
    "{ticker} 실적 발표 언제야?"
]

with open("dataset/raw/single_entity.jsonl", "w", encoding="utf-8") as f:
    for _ in range(1666):
        ticker = random.choice(STOCK_TICKERS)
        text = random.choice(templates).format(ticker=ticker)
        f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")


In [24]:
# generate_multi_entity.py
import json
import random
from stock_dict import STOCK_TICKERS

templates = [
    "{a}랑 {b} 비교해줘",
    "{a}랑 {b} 중에 뭐가 나아?",
    "{a}, {b} 둘 다 전망 어때?",
]

with open("dataset/raw/multi_entity.jsonl", "w", encoding="utf-8") as f:
    for _ in range(1666):
        a, b = random.sample(STOCK_TICKERS, 2)
        text = random.choice(templates).format(a=a, b=b)
        f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")


In [25]:
# generate_hard_negative.py
import json

sentences = [
    "싸우지들 좀 마소",
    "ISA 계좌 알려줘",
    "AI주 요즘 핫하더라",
    "배당주 추천해줘",
    "테마주 뭐가 좋아?",
    "TIGER는 동물 이름 아니야?",
    "삼성 비교해줘",   # incomplete entity
]

with open("dataset/raw/hard_negative.jsonl", "w", encoding="utf-8") as f:
    for s in sentences:
        f.write(json.dumps({"text": s}, ensure_ascii=False) + "\n")


In [None]:
#labelling pipeline, DocBin

In [26]:
#load dataset for labelling 
# load_raw_data.py
import json
from pathlib import Path

def load_jsonl(path):
    data = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line)["text"])
    return data

RAW_DIR = Path("dataset/raw")

texts = []
for file in RAW_DIR.glob("*.jsonl"):
    texts.extend(load_jsonl(file))

print("Total texts:", len(texts))


Total texts: 5005


In [31]:
#auto label remove false true
# auto_label.py
import re
from stock_dict import STOCK_TICKERS

def find_entities(text):
    entities = []
    for ticker in STOCK_TICKERS:
        for match in re.finditer(re.escape(ticker), text):
            start, end = match.start(), match.end()

            # boundary check
            before = text[start-1] if start > 0 else " "
            after = text[end] if end < len(text) else " "

            if before.isalnum() or after.isalnum():
                continue

            entities.append((start, end, "TICKER"))

    return entities


In [32]:
entities

[]

In [33]:
#build docbin 
# build_docbin.py
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from auto_label import find_entities
from load_raw_data import texts

nlp = spacy.blank("xx")   # PURE spaCy, no MeCab, label
nlp.tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)

db = DocBin()

error_log = open("dataset/processed/err_spans.txt", "w", encoding="utf-8")

for text in tqdm(texts):
    doc = nlp.make_doc(text)
    ents = []
    used_tokens = set()

    for start, end, label in find_entities(text):
        span = doc.char_span(start, end, label=label, alignment_mode="strict")
        if span is None:
            error_log.write(f"{text} | {start}:{end}\n")
            continue

        # prevent overlapping entities
        token_ids = set(range(span.start, span.end))
        if used_tokens & token_ids:
            continue

        used_tokens |= token_ids
        ents.append(span)

    doc.ents = ents
    db.add(doc)

error_log.close()

db.to_disk("dataset/spacy/all.spacy")
print("Saved DocBin")


Total texts: 5005


100%|██████████| 5005/5005 [00:00<00:00, 12183.09it/s]


Saved DocBin


In [34]:
# split_docbin.py
from spacy.tokens import DocBin
import random

db = DocBin().from_disk("dataset/spacy/all.spacy")
docs = list(db.get_docs(spacy.blank("xx").vocab))

random.shuffle(docs)

split = int(len(docs) * 0.8)
train_docs = docs[:split]
dev_docs = docs[split:]

DocBin(docs=train_docs).to_disk("dataset/spacy/train.spacy")
DocBin(docs=dev_docs).to_disk("dataset/spacy/dev.spacy")

print("Train:", len(train_docs))
print("Dev:", len(dev_docs))


Train: 4004
Dev: 1001


In [None]:
#train data with new split data 
#python -m spacy train config.cfg --output ./output --paths.train dataset/spacy/train.spacy --paths.dev dataset/spacy/dev.spacy
#
#python -m spacy init fill-config base_config.cfg config.cfg


In [35]:
import spacy

nlp = spacy.load("output/model-best")

text ="마소킴이 오늘 회의에 늦었다"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


마소킴이 TICKER


In [36]:
import spacy

nlp = spacy.load("output/model-last")

text = "요즘 삼성전자랑 LG화학 중에 뭐가 더 나을까?"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


LG화학 TICKER


In [37]:
import spacy

nlp = spacy.load("output/model-best")

text = "ISA 계좌랑 삼성전자 주식 뭐가 좋아?"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


삼성전자 TICKER
