In [5]:
import pandas as pd
from transformers import pipeline
import re
import traceback

# 1. 데이터 로드 및 전처리
df = pd.read_csv("../../store data/NLP/M7_news_2021_2022.csv", on_bad_lines='skip')
df = df.dropna(subset=["text"])
df["text"] = df["text"].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# 2. 랜덤 샘플링
sample_df = df.sample(n=20, random_state=42).copy()

# 3. T5 summarizer 로드
summarizer = pipeline(
    "summarization",
    model="t5-small",
    tokenizer="t5-small",
    framework="pt",
    device=-1
)
print("✅ T5 summarizer 로드 완료")

# 4. FinBERT 로딩
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", tokenizer="ProsusAI/finbert")
print("✅ FinBERT 모델 로드 완료")

# 5. 요약 및 감성 분석 함수
def summarize_and_analyze(text):
    try:
        input_text = text[:512] if isinstance(text, str) else ""
        if not input_text.strip():
            return "", "empty", 0.0
        summary = summarizer(input_text, max_length=80, min_length=20, do_sample=False)[0]["summary_text"]
        sentiment = finbert(summary)[0]
        return summary, sentiment["label"], sentiment["score"]
    except Exception as e:
        print(f"⚠️ 오류 발생: {e}")
        traceback.print_exc()
        return "", "error", 0.0

# 6. 처리 루프
summaries, sentiments, scores = [], [], []

for i, row in sample_df.iterrows():
    summary, sent, score = summarize_and_analyze(row["text"])
    summaries.append(summary)
    sentiments.append(sent)
    scores.append(score)

# 7. 결과 정리
sample_df["summary"] = summaries
sample_df["sentiment"] = sentiments
sample_df["confidence"] = scores

# 8. 출력
pd.set_option('display.max_colwidth', None)
print(sample_df[["summary", "sentiment", "confidence"]])


Device set to use cpu


✅ T5 summarizer 로드 완료


Device set to use mps:0


✅ FinBERT 모델 로드 완료


Your max_length is set to 80, but your input_length is only 64. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


                                                                                                                                                                                                                                                                                 summary  \
940                                                                                                                                          a new york Times report cited factset data stating that in the 12 months ending in June, people spent in excess of $610 billion on Amazon .   
986                      85.24% of retail investor accounts lose money when trading CFDs with this provider . you should consider whether you understand how they work and whether you can afford to take the high risk of losing your money AL English Sign up Welcome to Capital.com .   
405   business Insider Subscribe DOW JONES -0.24% NASDAQ -0.67% S&P 500-0.64% AAPL -0.09% NVDA Personal Finance Video Subscribe Log in Newsletters U

In [None]:
import pandas as pd
from transformers import pipeline
import re
import traceback

# 1. 데이터 로드 및 전처리
df = pd.read_csv("../../store data/NLP/M7_news_2021_2022.csv", on_bad_lines='skip')
df = df.dropna(subset=["text"])
df["text"] = df["text"].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# 2. 랜덤 샘플링
sample_df = df.sample(n=20, random_state=42).copy()

# 3. T5 summarizer 로드
summarizer = pipeline(
    "summarization",
    model="t5-small",
    tokenizer="t5-small",
    framework="pt",
    device=-1
)
print("✅ T5 summarizer 로드 완료")

# 4. FinBERT 로딩
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", tokenizer="ProsusAI/finbert")
print("✅ FinBERT 모델 로드 완료")

# 5. 요약 및 감성 분석 함수
def summarize_and_analyze(text):
    try:
        input_text = text[:512] if isinstance(text, str) else ""
        if not input_text.strip():
            return "", "empty", 0.0
        summary = summarizer(input_text, max_length=80, min_length=20, do_sample=False)[0]["summary_text"]
        sentiment = finbert(summary)[0]
        return summary, sentiment["label"], sentiment["score"]
    except Exception as e:
        print(f"⚠️ 오류 발생: {e}")
        traceback.print_exc()
        return "", "error", 0.0

# 6. 처리 루프
summaries, sentiments, scores = [], [], []

for i, row in sample_df.iterrows():
    summary, sent, score = summarize_and_analyze(row["text"])
    summaries.append(summary)
    sentiments.append(sent)
    scores.append(score)

# 7. 결과 정리
sample_df["summary"] = summaries
sample_df["sentiment"] = sentiments
sample_df["confidence"] = scores

# 8. 출력
pd.set_option('display.max_colwidth', None)
print(sample_df[["summary", "sentiment", "confidence"]])


  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


✅ T5 summarizer 로드 완료


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
import pandas as pd

# 문자열 출력 길이 무제한으로 설정
pd.set_option('display.max_colwidth', None)

# 다시 출력
print(sample_df[["summary", "sentiment", "confidence"]])


                                                                                                                                                                                                                                                                                                                                                       summary  \
940                                                                                                                                the report cited Factset data stating that in the 12 months ending in June, people spent in excess of $610 billion on Amazon . in contrast, the report indicated that Amazon, the e-commerce giant, had dethroned Walmart .   
986   85.24% of retail investor accounts lose money when trading CFDs with this provider . the products and services listed on this website are not available to US residents . amazon stock price prediction in 5 years by Rob Griffin Edited by Alexandra Pankratyeva 16:45 (UTC), 29 November 202

In [1]:
import pandas as pd
from transformers import pipeline
import re
import traceback

# 1. 데이터 로드 및 전처리
df = pd.read_csv("../../store data/NLP/M7_news_2021_2022.csv", on_bad_lines='skip')
df = df.dropna(subset=["text"])
df["text"] = df["text"].astype(str).apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# 2. T5 summarizer 로드
summarizer = pipeline(
    "summarization",
    model="t5-small",
    tokenizer="t5-small",
    framework="pt",
    device=-1
)
print("✅ T5 summarizer 로드 완료")

# 3. FinBERT 로드
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert", tokenizer="ProsusAI/finbert")
print("✅ FinBERT 모델 로드 완료")

# 4. 요약 및 감성 분석 함수
def summarize_and_analyze(text):
    try:
        input_text = text[:512] if isinstance(text, str) else ""
        if not input_text.strip():
            return "", "empty", 0.0
        summary = summarizer(input_text, max_length=80, min_length=20, do_sample=False)[0]["summary_text"]
        sentiment = finbert(summary)[0]
        return summary, sentiment["label"], sentiment["score"]
    except Exception as e:
        print(f"⚠️ 오류 발생: {e}")
        traceback.print_exc()
        return "", "error", 0.0

# 5. 전체 데이터 처리
summaries, sentiments, scores = [], [], []

for i, row in df.iterrows():
    summary, sentiment, score = summarize_and_analyze(row["text"])
    summaries.append(summary)
    sentiments.append(sentiment)
    scores.append(score)

# 6. 결과 추가
df["summary"] = summaries
df["sentiment"] = sentiments
df["confidence"] = scores

# 7. CSV 파일 저장
output_path = "../../store data/NLP/M7_news_with_sentiment_2021_2022.csv"
df.to_csv(output_path, index=False)
print(f"✅ 결과 저장 완료: {output_path}")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


✅ T5 summarizer 로드 완료


Device set to use mps:0


✅ FinBERT 모델 로드 완료


Your max_length is set to 80, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 80, but your input_length is only 70. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 80, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 80, but your input_length is only 72. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max

✅ 결과 저장 완료: ../../store data/NLP/M7_news_with_sentiment.csv
