In [1]:
!pip install transformers faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.9.0 sentence-transformers-3.2.0


In [None]:
# Diagnostic only: print the metadata of the installed transformers package.
!pip show transformers

Name: transformers
Version: 4.44.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers


In [2]:
# 데이터 처리 및 분석 관련 라이브러리
import faiss
import numpy as np
import pandas as pd
import networkx as nx

# 웹 관련 라이브러리
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

# 자연어 처리 및 텍스트 분석 라이브러리
import nltk
from nltk.tokenize import sent_tokenize
from nltk.cluster.util import cosine_distance
from sentence_transformers import SentenceTransformer

nltk.download('punkt')

# Transformer 기반 언어 모델 관련 라이브러리
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# 시간 및 정규 표현식 관련 라이브러리
from datetime import datetime
import re
import time
import json

# 경고 및 로그 관리
import warnings
import logging
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()
logging.getLogger("faiss").setLevel(logging.ERROR)
warnings.filterwarnings('ignore', category=FutureWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)

# Naver Open API client ID and secret.
# Read from the environment so the secret never lives in the notebook. The
# values that were previously hard-coded here are exposed and should be rotated.
import os

client_id     = os.environ.get("NAVER_CLIENT_ID", "")
client_secret = os.environ.get("NAVER_CLIENT_SECRET", "")

if not (client_id and client_secret):
  warnings.warn(
      "NAVER_CLIENT_ID / NAVER_CLIENT_SECRET are not set; "
      "Naver API requests will be sent without valid credentials."
  )

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
def getRequestUrl(url, timeout=10):
  """Send an authenticated GET request and return the decoded response body.

  The module-level `client_id` and `client_secret` are attached as the
  `X-Naver-Client-Id` / `X-Naver-Client-Secret` headers.

  Args:
    url: Fully built request URL.
    timeout: Seconds to wait on the connection before giving up.

  Returns:
    The response body decoded as UTF-8 when the status code is 200,
    otherwise None. Any failure is logged and reported as None, never raised.
  """
  req = urllib.request.Request(url)
  req.add_header("X-Naver-Client-Id", client_id)
  req.add_header("X-Naver-Client-Secret", client_secret)
  try:
    # The context manager closes the connection even if read()/decode() raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
      status = response.getcode()
      if status != 200:
        logging.warning("Request to %s returned HTTP status %s", url, status)
        return None
      return response.read().decode('utf-8')
  except Exception as e:
    # Best-effort by design: callers treat None as "no data", but the cause
    # is logged so that failures are no longer invisible.
    logging.warning("Request to %s failed: %s", url, e)
    return None

def searchNaverNews(query, display=10, start=1):
  """Query the Naver news search endpoint and return the parsed JSON reply.

  The query text is percent-encoded and results are requested sorted by date.

  Args:
    query: Search text.
    display: Value sent as the `display` query parameter.
    start: Value sent as the `start` query parameter.

  Returns:
    The decoded JSON object, or None when the request helper returned no body.
  """
  endpoint = 'https://openapi.naver.com/v1/search/news.json'
  encoded_query = urllib.parse.quote(query)
  request_url = f"{endpoint}?query={encoded_query}&display={display}&start={start}&sort=date"

  body = getRequestUrl(request_url)
  return None if body is None else json.loads(body)

def sentence_similarity(sent1, sent2, stopwords=None):
  """Return the cosine similarity of the bag-of-words vectors of two sequences.

  Args:
    sent1: Iterable of string tokens. A plain string is iterated character
      by character.
    sent2: Iterable of string tokens, handled the same way as `sent1`.
    stopwords: Optional collection of tokens to ignore. A token is dropped
      when either its original or its lower-cased form is listed.

  Returns:
    The cosine similarity as a float (approximately within [0, 1], since all
    counts are non-negative). Returns 0.0 when either side has no tokens left
    after filtering, instead of dividing by zero and producing NaN.
  """
  ignored = set(stopwords) if stopwords else set()

  def _tokens(sentence):
    return [
        word.lower()
        for word in sentence
        if word not in ignored and word.lower() not in ignored
    ]

  tokens1 = _tokens(sent1)
  tokens2 = _tokens(sent2)

  # Assign each distinct token a column once, instead of a list.index()
  # lookup for every token occurrence.
  columns = {}
  for token in tokens1 + tokens2:
    columns.setdefault(token, len(columns))

  vector1 = np.zeros(len(columns))
  vector2 = np.zeros(len(columns))
  for token in tokens1:
    vector1[columns[token]] += 1
  for token in tokens2:
    vector2[columns[token]] += 1

  norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
  if norm_product == 0:
    # One side is empty: there is nothing to compare.
    return 0.0
  return float(np.dot(vector1, vector2) / norm_product)

def build_similarity_matrix(sentences, stop_words):
  """Build the pairwise similarity matrix for a list of sentences.

  Entry [i][j] holds sentence_similarity(sentences[i], sentences[j], stop_words)
  for every i != j; the diagonal is left at zero.

  Returns:
    A square numpy array with one row and one column per sentence.
  """
  count = len(sentences)
  matrix = np.zeros((count, count))

  for row, left in enumerate(sentences):
    for col, right in enumerate(sentences):
      if row != col:
        matrix[row][col] = sentence_similarity(left, right, stop_words)

  return matrix

def textrank_summary(text, num_sentences=3):
  """Summarise `text` by keeping its highest-ranked sentences.

  The text is split into sentences, a pairwise similarity graph is built, and
  PageRank scores decide which sentences are kept.

  Args:
    text: Input text to summarise.
    num_sentences: Number of top-scoring sentences to keep.

  Returns:
    The selected sentences joined by single spaces, ordered by descending
    score (not by position in the text). Texts with fewer than two sentences
    are returned as they were tokenised.
  """
  stop_words = ['을', '를', '이', '가', '은', '는', '에', '의', '과', '와', '한', '들', '의']
  # Drops any sentence whose whole text equals one of the stop-word strings.
  candidates = [sent for sent in sent_tokenize(text) if sent not in stop_words]

  if len(candidates) < 2:
    return ' '.join(candidates)

  graph = nx.from_numpy_array(build_similarity_matrix(candidates, stop_words))
  scores = nx.pagerank(graph)

  # Highest score first; equal scores fall back to comparing the sentence text.
  ranked = sorted(
      ((scores[idx], sent) for idx, sent in enumerate(candidates)),
      reverse=True,
  )
  return ' '.join(sent for _, sent in ranked[:num_sentences])

def economic_news_search(num_days=30, display=100):
  """Collect today's stock-market news from Naver and return it as a DataFrame.

  Four date-prefixed queries are sent to searchNaverNews. An article is kept
  only if all of the following hold:
    * neither its title nor its description contains an exclude keyword,
    * its title and description have not been collected already,
    * its description contains '증시',
    * at least three of the market keywords occur in its title or description.

  Args:
    num_days: Currently unused; kept so existing callers keep working.
    display: Number of results requested per query.

  Returns:
    A DataFrame with the columns title, description, link, Date, content and
    summary, or None when no article passed the filters.

  Raises:
    ValueError: If an item's `pubDate` does not match the expected format.
  """
  keywords = ['증시', '코스피', '코스닥', '주식', '상장', '상장폐지', '배당', '배당금', '주가', '시가총액']
  exclude_keywords = ['기자', '?', "앵커", '투자', '운용사', '괜찮아요', 'http', '신진대사', '체질', '날씨', '기온']
  queries = ['증시', '미국증시', '한국증시', '나스닥']

  today = datetime.today()
  formatted_date = f"{today.month}월 {today.day}일"

  all_filtered_news = []
  collected_data    = set()

  for query in queries:
    filtered_news = searchNaverNews(f"{formatted_date} {query}", display=display)
    if not filtered_news or 'items' not in filtered_news:
      continue

    for item in filtered_news['items']:
      # The API returns HTML fragments; keep only the visible text.
      title = BeautifulSoup(item['title'], 'html.parser').get_text()
      description = BeautifulSoup(item['description'], 'html.parser').get_text()

      if any(exclude in title or exclude in description for exclude in exclude_keywords):
        continue

      if title in collected_data or description in collected_data:
        continue

      if '증시' not in description:
        continue

      matched_keywords = [keyword for keyword in keywords if keyword in title or keyword in description]
      if len(matched_keywords) < 3:
        continue

      link = item['link']
      # %z accepts any UTC offset, not only the literal "+0900".
      published = datetime.strptime(item['pubDate'], '%a, %d %b %Y %H:%M:%S %z')
      news_data = {
          'title': title,
          'description': description,
          'link': link,
          'Date': published.strftime('%Y-%m-%d'),
          'content': description,
      }

      news_data['summary'] = textrank_summary(news_data['description'])
      all_filtered_news.append(news_data)
      collected_data.update([title, description])

  return None if not all_filtered_news else pd.DataFrame(all_filtered_news)

In [4]:
def _remove_urls(text):
    """Strip http(s):// and www.-style URLs from `text`."""
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)


def _process_response(answer):
    """Trim generated text to its first three "다."-delimited pieces.

    The closing "다." is re-attached only when the text actually had one after
    the last kept piece, so empty or unterminated text does not gain a spurious
    "다.". Anything before the first Hangul syllable is dropped.
    """
    answer = answer.replace("괜찮아요.\n", "")
    pieces = answer.split("다.")
    kept = pieces[:3]
    final_response = "다.".join(kept)
    if len(pieces) > len(kept):
        # A terminator followed the last kept piece in the original text.
        final_response += "다."

    korean_text_match = re.search(r'[가-힣]', final_response)
    if korean_text_match:
        return final_response[korean_text_match.start():]
    return final_response


def get_response(user_query_pre, top_k=3, max_new_tokens=150, try_count=False, retry_delay=300):
    """Answer a market question from today's news using retrieval plus GPT-2.

    Steps: fetch news summaries, embed them with a Sentence-BERT model, index
    them in FAISS, retrieve the `top_k` closest to the query, then generate an
    answer with a dynamically quantized 'gpt2' model.

    Args:
        user_query_pre: Query text; today's date ("M월 D일") is prepended to it.
        top_k: Number of summaries to retrieve as context.
        max_new_tokens: Generation length limit passed to `generate`.
        try_count: False on the first attempt, True on the single retry.
        retry_delay: Seconds to sleep before the retry.

    Returns:
        The processed answer. If the answer is empty or does not contain
        `user_query_pre`, the function retries once; if the retry also fails,
        the processed retrieval context is returned instead.

    Raises:
        RuntimeError: If no news summaries are available to build a context.
    """
    # Local import: the notebook's import cell does not bring `prune` into scope.
    from torch.nn.utils import prune

    today = datetime.today()
    formatted_date = f"{today.month}월 {today.day}일"
    user_query = formatted_date + user_query_pre

    # 1. Load and preprocess the news data
    df = economic_news_search()
    if df is None:
        raise RuntimeError("economic_news_search() returned no articles; cannot build an answer context.")
    df = df.dropna(subset=['summary'])  # drop rows without a summary
    texts = [_remove_urls(text) for text in df['summary'].tolist()]
    if not texts:
        raise RuntimeError("No news summaries are available; cannot build an answer context.")

    # Load the GPT-2 model and configure the tokenizer
    generator_model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer = PreTrainedTokenizerFast.from_pretrained('gpt2')

    tokenizer.pad_token = tokenizer.eos_token
    generator_model.config.pad_token_id = tokenizer.eos_token_id

    # NOTE(review): this fallback only runs if the tokenizer has no EOS token;
    # whether the generic tokenizer class defines one for 'gpt2' is not visible here.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        generator_model.resize_token_embeddings(len(tokenizer))

    # Build Sentence-BERT embeddings and an exact L2 index over them
    embedding_model = SentenceTransformer('jhgan/ko-sroberta-multitask')
    document_embeddings = embedding_model.encode(texts, convert_to_tensor=False, show_progress_bar=False)
    document_embeddings = np.array(document_embeddings).astype('float32')
    dimension = document_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(document_embeddings)

    def get_relevant_summaries(query, top_k=3):
        query_embedding = embedding_model.encode([query], convert_to_tensor=False)
        query_embedding = np.array(query_embedding).astype('float32')
        # Never ask for more neighbours than there are documents: FAISS fills
        # the missing slots with -1, which would silently index texts[-1].
        k = min(top_k, len(texts))
        distances, indices = index.search(query_embedding, k)
        return [texts[idx] for idx in indices[0] if idx >= 0]

    device = torch.device('cpu')
    generator_model.to(device)

    # Apply dynamic int8 quantization to the nn.Linear modules
    quantized_model = torch.quantization.quantize_dynamic(
        generator_model, {torch.nn.Linear}, dtype=torch.qint8
    )

    # Apply pruning.
    # NOTE(review): quantize_dynamic replaces nn.Linear modules, so this loop
    # may match nothing; confirm whether pruning was meant to run before quantization.
    for name, module in quantized_model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=0.3)  # prune 30% of the weights
            prune.remove(module, 'weight')  # make the pruning permanent

    relevant_summaries = get_relevant_summaries(user_query, top_k)
    context = " ".join(relevant_summaries)

    prompt = f"뉴스 요약: {context}\n질문: {user_query}\n답변: "
    inputs = tokenizer(prompt, return_tensors='pt', padding=True).to(device)
    attention_mask = inputs['attention_mask']
    outputs = quantized_model.generate(
        inputs['input_ids'],
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only what follows the last "답변:" marker of the prompt.
    answer = generated_text.split('답변:')[-1].strip()

    final_response = _process_response(answer)

    print("상세 응답:", final_response)

    # Retry when the answer is empty or does not mention the query
    needs_retry = final_response in ("", None) or user_query_pre not in final_response
    if not needs_retry:
        return final_response

    if not try_count:
        time.sleep(retry_delay)
        # Pass the original query: passing `user_query` would prefix the date twice.
        return get_response(user_query_pre, top_k, max_new_tokens, try_count=True, retry_delay=retry_delay)

    # Still no usable answer: fall back to the retrieved summaries
    return _process_response(context)

In [5]:
# Usage example
# The leading space separates the query from the date string that
# get_response prepends to it.
user_query_pre = " 증시"
response = get_response(user_query_pre)
print(f'Processed response: {response}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

상세 응답: 다.
상세 응답: 다.
Processed response: 지난 9월 14일 기준 코스닥의... 외국인이 국내 주식 시장에서 ‘매도세’로 돌아선 것은 지난해 10월 이후 10개월 만이다. [리서치센터장 인터뷰] 최근 세계 주요 증시 중 한국 시장의 성적은 참담하다. 9월에 들어서도... 10 *국제금융시장(현지시간 14일 → 17일) -美 국채금리 10년물 4.22% → 4.28% -달러지수 105.55 → 105.34 -WTI... *한국증시 마감(18일) -코스피 19.82p(0.72%) 상승 2763.92 -코스닥 0.01p(0.00%) 하락 858.95 -원·달러 환율 0.1원 내린... 11월... 실제 코스피는 지난달 14일 미 연준의 금리 인하 기대감이 퍼지면서 이후 최장기간인 5거래일(12월 14일~12월 20일)... 10월까지 순매도세를 보였던 외국인이 공매도 금지에도 불구하고 한국 증시로 몰려온 것이다.
