In [4]:
!pip3 install pandas newspaper3k nltk google-generativeai scikit-learn

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting PyYAML>=3.11 (from newspaper3k)
  Downloading PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import datetime
import re
import time
from pathlib import Path
import nltk
import pandas as pd
from nltk.corpus import stopwords
import google.generativeai as genai
from google.colab import userdata # Colab API 세팅 필요

# --- Basic configuration ---
# Read the Gemini API key from Colab user secrets. Any failure (missing or
# empty secret, or an error raised by the accessor) leaves GEMINI_API_KEY as
# None instead of aborting.
try:
    GEMINI_API_KEY = userdata.get("GOOGLE_API_KEY")
    if not GEMINI_API_KEY:
        print("WARN: 'GOOGLE_API_KEY' not found in Colab userdata or is empty.")
        GEMINI_API_KEY = None
except Exception as e:
    print(f"WARN: Error accessing Colab userdata for 'GOOGLE_API_KEY': {e}")
    GEMINI_API_KEY = None

# Fetch the NLTK stopword corpus (download errors are not raised here) and
# build the English stopword set.
nltk.download("stopwords", quiet=True, raise_on_error=False)
STOP_WORDS = set(stopwords.words("english"))

# News file loading and date-range settings
NEWS_START_DATE_STR = "2025-01-01" # first day of the LLM labeling window
NEWS_END_DATE_STR = "2025-03-30"   # last day of the LLM labeling window
NEWS_START_DATE = datetime.datetime.strptime(NEWS_START_DATE_STR, "%Y-%m-%d").date()
NEWS_END_DATE = datetime.datetime.strptime(NEWS_END_DATE_STR, "%Y-%m-%d").date()

# Gemini model name, the pause passed to time.sleep() after each request
# (seconds), and the closed set of labels accepted from the model.
LLM_MODEL_NAME = "gemini-2.0-flash"
LLM_REQUEST_DELAY = 2
LLM_PRICE_LABELS = ["price-down", "neutral", "price-up"]

# Price CSV location and the absolute percentage move (row-to-next-row) at or
# beyond which a price change is classified as up/down rather than neutral.
PRICE_FILE_PATH = "./data/coffee_c_price.csv"
PRICE_CHANGE_THRESHOLD_PERCENT = 4.0

# Crawled news CSVs, one per source site.
DATA_DIR = "./data/"
DAILYCOFFEENEWS_CSV = Path(DATA_DIR) / "dailycoffeenews_250503.csv"
WORLDCOFFEEPORTAL_CSV = Path(DATA_DIR) / "worldcoffeeportal_250503.csv"


# --- Function definitions ---
def fix_encoding(text):
    """Repair UTF-8 text that was mistakenly decoded as Latin-1 or Windows-1252.

    Such "mojibake" (for example ``"cafÃ©"`` instead of ``"café"``) is undone
    by re-encoding the string with the wrong codec and decoding the resulting
    bytes as UTF-8. Both steps are strict, so text that is already correct
    (pure ASCII, or genuine non-ASCII characters) fails the round-trip and is
    returned unchanged rather than being truncated.

    Args:
        text: Value to repair. Non-string values are returned as-is.

    Returns:
        The repaired string, or the original value when no repair applies.
    """
    if not isinstance(text, str):
        return text
    # Latin-1 first (covers every byte value); cp1252 second, for mojibake
    # that contains characters such as the euro sign or curly quotes.
    for wrong_codec in ("latin1", "cp1252"):
        try:
            return text.encode(wrong_codec).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue
    return text

def preprocess_text_for_llm(title, body=None):
    """Build the text block sent to the LLM for one article.

    The result is ``"title: <lower-cased title>"``, optionally followed by a
    blank line and ``"Body: <body>"`` (the body keeps its original casing).
    Anything matching ``http\\S+`` is replaced with ``[URL REMOVED]``.

    Args:
        title: Article headline; ``None`` or an empty value is treated as "".
        body: Optional article body.

    Returns:
        The prepared text, or ``""`` when there is neither a title nor a
        body, so callers can filter out empty articles with ``!= ""``.
    """
    # Without this guard an empty article would come back as the bare prefix
    # "title:", which is never equal to "" and so slips past empty-text filters.
    if not body and not str(title or "").strip():
        return ""
    full_text = f"Title: {title or ''}".strip().lower()
    if body:
        full_text += f"\n\nBody: {body}"
    return re.sub(r"http\S+", "[URL REMOVED]", full_text).strip()


def _read_news_csv(csv_path: Path, site_label: str, source_tag: str) -> pd.DataFrame:
    """Read one crawled-news CSV and tag its rows with ``source_tag``; return an empty frame on any failure."""
    if not csv_path.exists():
        print(f"  경고: {site_label} 파일 없음 - {csv_path}")
        return pd.DataFrame()
    print(f"  {site_label} 파일 로드: {csv_path}")
    try:
        frame = pd.read_csv(csv_path)
    except Exception as e:
        # Best-effort load: report the problem and continue with the other file.
        print(f"    오류: {csv_path} 로드 실패 - {e}")
        return pd.DataFrame()
    frame['source'] = source_tag
    return frame


def load_and_combine_crawled_news_from_specific_files(
    daily_news_file: Path,
    portal_news_file: Path,
    start_date_limit: datetime.date,
    end_date_limit: datetime.date
) -> pd.DataFrame:
    """Load both crawled-news CSVs and return the articles to be labeled.

    Rows are merged, restricted to ``start_date_limit``..``end_date_limit``
    (inclusive), de-duplicated, sorted by date, and given a ``text_for_llm``
    column built from the title by ``preprocess_text_for_llm``.

    Args:
        daily_news_file: CSV crawled from Daily Coffee News.
        portal_news_file: CSV crawled from World Coffee Portal.
        start_date_limit: First date (inclusive) to keep.
        end_date_limit: Last date (inclusive) to keep.

    Returns:
        A frame with columns ``date``, ``title``, ``url``, ``text_for_llm``
        and ``source``. ``url`` is filled with missing values when the input
        files have no such column. A column-less empty frame is returned when
        nothing could be loaded or the ``date``/``title`` columns are absent.
    """
    print("크롤링된 뉴스 파일 로드 중...")

    df_daily = _read_news_csv(daily_news_file, "Daily Coffee News", "dailycoffeenews")
    df_portal = _read_news_csv(portal_news_file, "World Coffee Portal", "worldcoffeeportal")

    loaded_frames = [frame for frame in (df_daily, df_portal) if not frame.empty]
    if not loaded_frames:
        print("로드할 뉴스 파일이 없습니다.")
        return pd.DataFrame()

    combined_df = pd.concat(loaded_frames, ignore_index=True)
    if 'date' not in combined_df.columns or 'title' not in combined_df.columns:
        print("오류: CSV 파일에 'date' 또는 'title' 컬럼이 없습니다.")
        return pd.DataFrame()

    # Unparseable dates become NaT and are dropped together with missing titles.
    combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce').dt.date
    combined_df = combined_df.dropna(subset=['date', 'title'])

    in_range = (combined_df['date'] >= start_date_limit) & (combined_df['date'] <= end_date_limit)
    combined_df = combined_df[in_range]

    # De-duplicate on URL where one exists. pandas treats missing values as
    # equal to each other, so rows without a URL must not be compared on it
    # (that would keep only one of them); they fall back to (date, title).
    title_repeat = combined_df.duplicated(subset=['date', 'title'], keep='first')
    if 'url' in combined_df.columns:
        has_url = combined_df['url'].notna()
        url_repeat = combined_df.duplicated(subset=['url'], keep='first')
        is_repeat = (has_url & url_repeat) | (~has_url & title_repeat)
    else:
        is_repeat = title_repeat
    combined_df = combined_df[~is_repeat]

    # Stable sort keeps the file order among articles published the same day.
    combined_df = combined_df.sort_values(by="date", kind="stable")

    combined_df = combined_df.assign(
        text_for_llm=[preprocess_text_for_llm(title) for title in combined_df["title"]]
    )
    combined_df = combined_df[combined_df["text_for_llm"] != ""].reset_index(drop=True)

    print(f"필터링 및 병합 후 LLM 라벨링 대상 기사 수: {len(combined_df)}")
    # reindex (rather than plain selection) so a missing 'url' column yields
    # an all-missing column instead of a KeyError.
    return combined_df.reindex(columns=['date', 'title', 'url', 'text_for_llm', 'source'])


def get_llm_pseudo_label_gemini(article_text, llm_model_client):
    """Ask Gemini for a price-direction label for one article.

    Args:
        article_text: Prepared article text inserted into the few-shot prompt.
        llm_model_client: Object exposing ``generate_content(prompt,
            safety_settings=...)``. When falsy, no request is made and a
            random label is returned as a placeholder.

    Returns:
        One of ``LLM_PRICE_LABELS``. ``"neutral"`` is the fallback when the
        request fails, the response is empty or blocked, or the reply does
        not name exactly one known label.
    """
    if not llm_model_client:
        import random
        return random.choice(LLM_PRICE_LABELS)

    prompt = f"""Analyze the coffee market sentiment of the following news article.
Label it as 'price-up', 'price-down', or 'neutral'. Return ONLY the label.

Examples:
Article: "Title: Brazil Frosts Decimate Coffee Crop Body: Severe frost damages arabica output."
Label: price-up

Article: "Title: ICO Reports Record Global Coffee Exports Body: Strong harvest in Vietnam boosts exports."
Label: price-down

Article: "Title: Nestlé Launches New Coffee Line Body: Nestlé expands premium coffee offerings."
Label: neutral
---
Article: "{article_text}"
Label:"""
    try:
        safety_settings = [
            {"category": category, "threshold": "BLOCK_NONE"}
            for category in (
                "HARM_CATEGORY_HARASSMENT",
                "HARM_CATEGORY_HATE_SPEECH",
                "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "HARM_CATEGORY_DANGEROUS_CONTENT",
            )
        ]
        response = llm_model_client.generate_content(prompt, safety_settings=safety_settings)
        llm_label = ""
        if response.parts:
            llm_label = "".join(part.text for part in response.parts if hasattr(part, 'text')).strip().lower()
        elif hasattr(response, 'text') and response.text:
            llm_label = response.text.strip().lower()
        else:
            if response.prompt_feedback and response.prompt_feedback.block_reason:
                print(f"    경고: Gemini가 프롬프트를 차단했습니다. 이유: {response.prompt_feedback.block_reason}. neutral로 기본 설정.")
            else:
                print("    경고: Gemini가 비어 있거나 예기치 않은 응답 구조를 반환했습니다. neutral로 기본 설정.")
            return "neutral"
    except Exception as e:
        # Best-effort labeling: keep going, but report the failure so a run
        # of API errors is not mistaken for genuinely neutral articles.
        print(f"    경고: Gemini 호출 실패 ({type(e).__name__}: {e}). neutral로 기본 설정.")
        return "neutral"

    # Accept replies that wrap the label in extra text ("Label: price-up.",
    # quotes, trailing punctuation). A reply naming no label, or more than
    # one, is ambiguous and falls back to neutral.
    named_labels = [label for label in LLM_PRICE_LABELS if label in llm_label]
    return named_labels[0] if len(named_labels) == 1 else "neutral"

def generate_pseudo_labels_with_gemini(df):
    """Label every article in ``df`` and return the rows with a valid label.

    A Gemini client is created when ``GEMINI_API_KEY`` is set; if the key is
    missing or the client cannot be created, ``get_llm_pseudo_label_gemini``
    is called without a client. Each article's ``text_for_llm`` is cut to
    25,000 characters before being sent, and the loop pauses for
    ``LLM_REQUEST_DELAY`` seconds after every request made with a client.

    Args:
        df: Frame with a ``text_for_llm`` column. An ``llm_pseudo_label``
            column is added to it in place.

    Returns:
        The rows of ``df`` whose label is one of ``LLM_PRICE_LABELS``.
    """
    print(f"{len(df)}개 기사 LLM 라벨링 시작...")

    gemini_client = None
    if not GEMINI_API_KEY:
        print("Gemini API 키 없음. 임의 라벨링 사용.")
    else:
        try:
            genai.configure(api_key=GEMINI_API_KEY)
            gemini_client = genai.GenerativeModel(LLM_MODEL_NAME)
        except Exception as e:
            print(f"Gemini 모델 초기화 오류: {e}. 임의 라벨링 사용.")

    collected_labels = []
    for article_text in df['text_for_llm']:
        # Cap the text length sent to the model.
        truncated_text = article_text[:25000]
        collected_labels.append(get_llm_pseudo_label_gemini(truncated_text, gemini_client))
        if gemini_client:
            time.sleep(LLM_REQUEST_DELAY)

    df["llm_pseudo_label"] = collected_labels
    has_valid_label = df["llm_pseudo_label"].isin(LLM_PRICE_LABELS)
    df = df[has_valid_label]
    print(f"LLM 라벨링 완료. 유효 라벨 {len(df)}개.")
    return df

def load_and_prepare_price_data(price_file_path, threshold):
    """Load the price CSV and label each row's move to the following row.

    The file's ``Date`` and ``Coffee_Price`` columns are renamed to ``date``
    and ``close_price``. Rows with an unparseable date or price are dropped,
    the rest are sorted by date, and each row is compared with the next row
    in the file (the next available price, not necessarily the next calendar
    day).

    Args:
        price_file_path: Path to the price CSV.
        threshold: Percentage move at or beyond which a change counts as
            "상승" (up) or "하락" (down); smaller moves are "중립" (neutral).

    Returns:
        A frame with columns ``date``, ``close_price``,
        ``next_day_close_price``, ``actual_price_return_pct`` and
        ``actual_price_label`` ("정보없음" where no following row exists).
        An empty frame is returned when the file is missing, cannot be read,
        or lacks the required columns.
    """
    if not Path(price_file_path).exists():
        print(f"가격 파일 없음: {price_file_path}")
        return pd.DataFrame()
    try:
        price_df = pd.read_csv(price_file_path)
    except (OSError, ValueError) as e:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses; callers treat an empty frame as "load failed".
        print(f"가격 파일 로드 실패: {price_file_path} - {e}")
        return pd.DataFrame()

    price_df = price_df.rename(columns={"Date": "date", "Coffee_Price": "close_price"})
    missing_columns = [name for name in ("date", "close_price") if name not in price_df.columns]
    if missing_columns:
        print(f"가격 파일에 필요한 컬럼이 없습니다 {missing_columns}: {price_file_path}")
        return pd.DataFrame()

    price_df['date'] = pd.to_datetime(price_df['date'], errors='coerce').dt.date
    price_df['close_price'] = pd.to_numeric(price_df['close_price'], errors='coerce')
    price_df = price_df.dropna(subset=['date', 'close_price']).sort_values(by="date", kind="stable")

    # shift(-1) pairs each row with the one after it; the last row gets NaN.
    price_df["next_day_close_price"] = price_df["close_price"].shift(-1)
    price_df["actual_price_return_pct"] = (
        (price_df["next_day_close_price"] - price_df["close_price"]) / price_df["close_price"]
    ) * 100

    def classify_actual_movement(pct_change):
        if pd.isna(pct_change):
            return "정보없음"
        if pct_change >= threshold:
            return "상승"
        if pct_change <= -threshold:
            return "하락"
        return "중립"

    price_df["actual_price_label"] = price_df["actual_price_return_pct"].apply(classify_actual_movement)
    return price_df[["date", "close_price", "next_day_close_price", "actual_price_return_pct", "actual_price_label"]]

# --- Main execution ---
# Pipeline: load crawled news -> label with the LLM -> join with price data ->
# save a CSV -> print a label-vs-actual cross-tabulation.
# Everything stays at module level (not wrapped in a function) so names such
# as final_df remain available after the run.
if __name__ == "__main__":
    Path(DATA_DIR).mkdir(parents=True, exist_ok=True)
    print(f"'{DATA_DIR}' 폴더에 '{DAILYCOFFEENEWS_CSV.name}', '{WORLDCOFFEEPORTAL_CSV.name}', '{Path(PRICE_FILE_PATH).name}' 파일이 있는지 확인해주세요.")

    print("--- 1단계: 뉴스 수집 및 LLM 라벨링 ---")
    news_to_label_df = load_and_combine_crawled_news_from_specific_files(
        daily_news_file=DAILYCOFFEENEWS_CSV,
        portal_news_file=WORLDCOFFEEPORTAL_CSV,
        start_date_limit=NEWS_START_DATE,
        end_date_limit=NEWS_END_DATE
    )

    if news_to_label_df.empty:
        print("지정된 날짜 범위의 수집된 뉴스가 없습니다. 종료.")
    else:
        # Label a copy: the labeling function adds a column to the frame it is given.
        pseudo_labeled_news_df = generate_pseudo_labels_with_gemini(news_to_label_df.copy())
        if pseudo_labeled_news_df.empty:
            print("LLM 라벨링된 뉴스 없음. 종료.")
        else:
            print("\n--- 2단계: 가격 데이터 통합 ---")
            price_data_df = load_and_prepare_price_data(PRICE_FILE_PATH, PRICE_CHANGE_THRESHOLD_PERCENT)
            if price_data_df.empty:
                # No price data: still save the labeled news so the LLM work is not lost.
                print("가격 데이터 로드 실패. LLM 라벨링된 뉴스만 저장합니다.")
                output_filename = f"crawled_llm_labeled_news_{NEWS_START_DATE_STR}_to_{NEWS_END_DATE_STR}.csv"
                pseudo_labeled_news_df.to_csv(output_filename, index=False, encoding="utf-8-sig")
                print(f"저장 완료: {output_filename}")
            else:
                # Left join keeps every article; dates without a price row get missing price fields.
                final_df = pd.merge(pseudo_labeled_news_df, price_data_df, on="date", how="left")
                output_filename = f"crawled_llm_labeled_news_with_prices_{NEWS_START_DATE_STR}_to_{NEWS_END_DATE_STR}.csv"
                output_columns = ["date", "title", "url", "source", "llm_pseudo_label",
                                  "close_price", "next_day_close_price", "actual_price_return_pct", "actual_price_label",
                                  "text_for_llm"]
                final_df_output = final_df[[col for col in output_columns if col in final_df.columns]]
                final_df_output.to_csv(output_filename, index=False, encoding="utf-8-sig")
                print(f"최종 데이터 저장 완료: {output_filename}")
                print("\n결과 샘플:")
                print(final_df_output.head())

                # Only rows that have both labels can be compared.
                comparable_df = final_df_output.dropna(subset=['actual_price_label', 'llm_pseudo_label'])
                if not comparable_df.empty:
                    print("\nLLM 라벨 vs 실제 가격 변동 (비교 가능 행):")
                    try:
                        print(pd.crosstab(comparable_df['llm_pseudo_label'], comparable_df['actual_price_label']))
                    except Exception as e:
                        # The summary is optional, so report the cause and carry on;
                        # KeyboardInterrupt/SystemExit are deliberately not caught.
                        print(f"크로스탭 생성 중 문제 발생: {e}")
    print("\n스크립트 실행 완료.")
    if not GEMINI_API_KEY:
        print("주의: Gemini API 키가 설정되지 않아 임의 라벨이 사용되었을 수 있습니다.")

'./data/' 폴더에 'dailycoffeenews_250503.csv', 'worldcoffeeportal_250503.csv', 'coffee_c_price.csv' 파일이 있는지 확인해주세요.
--- 1단계: 뉴스 수집 및 LLM 라벨링 ---
크롤링된 뉴스 파일 로드 중...
  Daily Coffee News 파일 로드: data/dailycoffeenews_250503.csv
  World Coffee Portal 파일 로드: data/worldcoffeeportal_250503.csv
필터링 및 병합 후 LLM 라벨링 대상 기사 수: 324
324개 기사 LLM 라벨링 시작...
LLM 라벨링 완료. 유효 라벨 324개.

--- 2단계: 가격 데이터 통합 ---
최종 데이터 저장 완료: crawled_llm_labeled_news_with_prices_2025-01-01_to_2025-03-30.csv

결과 샘플:
         date                                              title  \
0  2025-01-01  Study Shows Links Between Coffee Drinking and ...   
1  2025-01-01  Design Details: Atomic Coffee Roasters Spreadi...   
2  2025-01-02  Three Questions with Filmmaker and Third Space...   
3  2025-01-02  Iowa’s Euphoria Coffee Finds a Happy Home in C...   
4  2025-01-02  Luckin Coffee launches in Hong Kong with five ...   

                                                 url             source  \
0  https://dailycoffeenews.com/2025/01/01/s