In [None]:
# parameters
# Placeholder defaults for the notebook's inputs. Presumably these are
# overridden by the notebook runner before execution (e.g. papermill-style
# parameter injection) — TODO confirm.
# AWS credentials that are later passed to boto3.client() when the S3 client
# is created.
AWS_ACCESS_KEY_ID = None
AWS_SECRET_ACCESS_KEY = None
# Reference date as a 'YYYY-MM-DD' string; load_recent_2weeks_data() parses it
# with datetime.strptime(..., '%Y-%m-%d').
exec_date_str = None

In [None]:
import pandas as pd
import numpy as np
import boto3
from datetime import datetime, timedelta
import io

import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
from concurrent.futures import ThreadPoolExecutor

import json

In [None]:
def try_convert(s, fmt="%Y-%m-%d %H:%M:%S"):
    """Parse ``s`` as a datetime and re-format it with ``fmt``.

    Args:
        s: Value to parse; passed to ``pd.to_datetime(..., errors='raise')``.
        fmt: ``strftime`` pattern used for the returned string.

    Returns:
        The formatted datetime string, or ``None`` when ``s`` cannot be
        parsed or the parsed value cannot be formatted.
    """
    try:
        dt = pd.to_datetime(s, errors='raise')  # raises on unparseable input
        return dt.strftime(fmt)
    except Exception:
        # Deliberately broad (parsing can fail with several exception types),
        # but not a bare ``except``: KeyboardInterrupt / SystemExit must still
        # propagate so a long conversion loop can be interrupted.
        return None

In [None]:
def load_recent_2weeks_data(exec_date_str: str) -> pd.DataFrame:
    """
    Load the most recent two weeks of derived stock data from S3.

    Scans the 14 calendar days ending at ``exec_date_str`` (inclusive). A day
    with no objects, no parquet files, or an error while reading is reported
    via ``print`` and skipped, so the result holds whatever could be read.

    Uses the module-level ``s3_client`` and ``bucket_name`` and the
    ``try_convert`` helper.

    Args:
        exec_date_str: Reference date formatted as 'YYYY-MM-DD'.

    Returns:
        All rows that were read, concatenated with a fresh integer index.
        ``trade_time_min`` is rewritten to '<date> <time>' normalised by
        ``try_convert`` (``None`` where the value could not be parsed).
        An empty DataFrame is returned when nothing could be loaded.

    Raises:
        ValueError, TypeError: from ``datetime.strptime`` when
            ``exec_date_str`` is not a 'YYYY-MM-DD' string.
    """
    n_days = 14
    # stock_name is intentionally left out; only these columns are kept.
    wanted_columns = [
        'trade_time_min', 'stock_code', 'open_price', 'high_price', 'low_price',
        'close_price', 'cum_volume', 'cum_amount', 'vwap_price', 'change_rate', 'spread',
    ]

    # String date -> datetime; the window includes exec_date itself.
    exec_date = datetime.strptime(exec_date_str, '%Y-%m-%d')
    start_date = exec_date - timedelta(days=n_days - 1)

    print(f"기준일: {exec_date.strftime('%Y-%m-%d')}")
    print(f"검색 기간: {start_date.strftime('%Y-%m-%d')} ~ {exec_date.strftime('%Y-%m-%d')}")

    frames = []

    for day_offset in range(n_days):
        target_date = (start_date + timedelta(days=day_offset)).strftime('%Y-%m-%d')
        prefix = f"derived_stock/stock_dt={target_date}/"

        try:
            # A single list_objects_v2 call returns at most 1000 keys, so walk
            # every page to avoid silently dropping files of a large partition.
            paginator = s3_client.get_paginator('list_objects_v2')
            object_keys = [
                obj['Key']
                for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
                for obj in page.get('Contents', [])
            ]
            if not object_keys:
                print(f"누락된 날짜: {target_date} - 데이터 없음")
                continue

            file_keys = [key for key in object_keys if key.endswith('.parquet')]
            if not file_keys:
                print(f"Parquet 파일 없음: {target_date}")
                continue

            for key in file_keys:
                s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
                buffer = io.BytesIO(s3_object['Body'].read())

                # Explicit copy: the column is reassigned below, and writing
                # into a slice of the freshly read frame can trigger
                # SettingWithCopyWarning.
                df_temp = pd.read_parquet(buffer)[wanted_columns].copy()

                # Prefix the partition date and normalise element-wise.
                # Series.map replaces the row-wise DataFrame.apply(axis=1),
                # which is slow and does not return a Series for an empty frame.
                stamped = target_date + " " + df_temp['trade_time_min'].astype(str)
                df_temp['trade_time_min'] = stamped.map(try_convert)
                frames.append(df_temp)

            print(f"{target_date} → {len(file_keys)}개 파일 불러옴")

        except Exception as e:
            # Best effort by design: report the failing day and keep going.
            print(f"{target_date} 처리 중 오류: {e}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [None]:
# Full ML pipeline: label, train, evaluate, and upload both models.
def train_and_save(df, price_change_threshold=0.002):
    """Train a direction classifier and a price regressor, then upload them.

    Steps:
      1. Sort rows by ``trade_time_min`` / ``stock_code`` and, per stock,
         compute ``future_avg_price``: the mean ``close_price`` over the
         current row and the next 59 rows of that stock.
      2. Derive ``future_return`` and a 3-class ``target_direction``
         (2 = return above the threshold, 0 = below the negative threshold,
         1 = otherwise). Rows without a full forward window are dropped.
      3. Split chronologically (first 90% train, last 10% hold-out), fit an
         ``LGBMClassifier`` and an ``LGBMRegressor`` with early stopping,
         print accuracy / MAE and display the confusion matrix.
      4. Persist both models through ``save_model``.

    The caller's DataFrame is left untouched; all work happens on a copy.

    NOTE(review): the hold-out set is used both for early stopping and for
    the reported metrics, so the printed scores are likely optimistic.

    Args:
        df: Rows containing at least ``trade_time_min``, ``stock_code`` and
            ``close_price``; every other column (except ``stock_name``) is
            used as a feature.
        price_change_threshold: Absolute return beyond which a row is
            labelled up (2) or down (0).

    Returns:
        None. Returns early (after printing a message) when no labelled rows
        remain or the training split contains fewer than two classes.
    """
    horizon = 60           # rows (per stock) in the forward-looking mean
    train_fraction = 0.9   # leading share of time-ordered rows used for fitting

    print("\n--- ML 모델 학습 및 예측 시작 ---")

    # sort_values without inplace returns a new frame, so the columns added
    # below never leak into the caller's DataFrame.
    data = df.sort_values(by=['trade_time_min', 'stock_code'])
    data['stock_code'] = data['stock_code'].astype('category')

    # observed=True is passed explicitly: grouping on a categorical column
    # without it raises a pandas FutureWarning about the changing default.
    data['future_avg_price'] = (
        data.groupby('stock_code', observed=True)['close_price']
        .transform(lambda x: x.rolling(window=horizon, min_periods=horizon).mean().shift(-(horizon - 1)))
    )
    data['future_return'] = (data['future_avg_price'] - data['close_price']) / data['close_price']
    data['target_direction'] = np.select(
        [data['future_return'] > price_change_threshold,
         data['future_return'] < -price_change_threshold],
        [2, 0], default=1
    )
    # Rows near the end of each stock's series have no complete forward window.
    data = data.dropna(subset=['future_avg_price', 'future_return'])
    if data.empty:
        print("Empty dataframe")
        return

    # stock_name is excluded again in case it slipped into the input.
    excluded = ['trade_time_min', 'stock_name', 'future_return', 'future_avg_price', 'target_direction']
    feature_cols = [col for col in data.columns if col not in excluded]

    # --- train & save ---
    # Rows are time-ordered, so a positional cut gives a chronological split.
    split_idx = int(len(data) * train_fraction)
    features = data[feature_cols]
    X_train, X_test = features[:split_idx], features[split_idx:]

    clf_y = data['target_direction']
    clf_y_train, clf_y_test = clf_y[:split_idx], clf_y[split_idx:]

    if len(np.unique(clf_y_train)) < 2:
        print("Dataset is TOO small")
        return

    clf_model = lgb.LGBMClassifier(objective='multiclass', num_class=3, random_state=42)
    clf_model.fit(
        X_train, clf_y_train,
        eval_set=[(X_test, clf_y_test)],
        categorical_feature=['stock_code'],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    reg_y = data['future_avg_price']
    reg_y_train, reg_y_test = reg_y[:split_idx], reg_y[split_idx:]

    reg_model = lgb.LGBMRegressor(objective='regression_l1', random_state=42)
    reg_model.fit(
        X_train, reg_y_train,
        eval_set=[(X_test, reg_y_test)],
        categorical_feature=['stock_code'],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    reg_predictions = reg_model.predict(X_test)
    clf_predictions = clf_model.predict(X_test)

    mae = mean_absolute_error(reg_y_test, reg_predictions)
    accuracy = accuracy_score(clf_y_test, clf_predictions)
    display(confusion_matrix(clf_y_test, clf_predictions))
    print(f"정확도: {accuracy:.4f}")
    print(f"MAE: {mae:.4f}")
    print("학습 및 예측 완료")

    # Upload both models; save_model picks the stock_clf_* / stock_reg_* names.
    save_model(clf_model)
    save_model(reg_model, model_type='reg')
    return

In [None]:
def save_model(model, model_type='clf'):
    """Write a trained model to /tmp and upload the files to S3.

    Two artifacts are produced: the booster (``model.booster_.save_model``)
    and the estimator parameters (``model.get_params()`` dumped as JSON).
    Both are uploaded to ``models/stock/`` in the module-level ``bucket_name``
    using the module-level ``s3_client``.

    Args:
        model: Estimator exposing ``booster_`` and ``get_params()``.
        model_type: ``'reg'`` selects the ``stock_reg_*`` file names; any
            other value selects the ``stock_clf_*`` names.

    Raises:
        Any exception from serialisation or upload propagates unchanged.
    """
    artifact_names = {
        'reg': ("stock_reg_booster.txt", "stock_reg_params.json"),
        'clf': ("stock_clf_booster.txt", "stock_clf_params.json"),
    }
    booster_name, params_name = artifact_names['reg' if model_type == 'reg' else 'clf']

    model_path = "/tmp/"
    local_booster = model_path + booster_name
    local_params = model_path + params_name

    # Serialise locally first: booster text file, then the params JSON.
    booster = model.booster_
    params = model.get_params()
    booster.save_model(local_booster)
    with open(local_params, 'w') as handle:
        json.dump(params, handle)

    # Upload in the same order: booster, then params.
    uploads = (
        (local_booster, f'models/stock/{booster_name}'),
        (local_params, f'models/stock/{params_name}'),
    )
    for local_file, s3_key in uploads:
        s3_client.upload_file(local_file, bucket_name, s3_key)
    print("업로드 완료")

In [None]:
# Credentials come from the parameters cell at the top of the notebook.
aws_access_key_id, aws_secret_access_key = AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY

# Bucket holding both the derived input data and the uploaded models.
bucket_name = "de6-team7-bucket"
prefix = "derived_stock/"

# Shared S3 client used by load_recent_2weeks_data() and save_model().
s3_client = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Pipeline: load the last two weeks, then train / evaluate / upload.
# Any failure propagates unchanged so the notebook run is marked as failed.
der_df = load_recent_2weeks_data(exec_date_str)
train_and_save(der_df)
print("---------------------------------------------")

기준일: 2025-07-22
검색 기간: 2025-07-09 ~ 2025-07-22
누락된 날짜: 2025-07-09 - 데이터 없음
2025-07-10 → 1개 파일 불러옴
2025-07-11 → 1개 파일 불러옴
2025-07-12 → 1개 파일 불러옴
2025-07-13 → 1개 파일 불러옴
2025-07-14 → 1개 파일 불러옴
2025-07-15 → 1개 파일 불러옴
2025-07-16 → 1개 파일 불러옴
2025-07-17 → 1개 파일 불러옴
2025-07-18 → 1개 파일 불러옴
2025-07-19 → 1개 파일 불러옴
2025-07-20 → 1개 파일 불러옴
2025-07-21 → 1개 파일 불러옴
2025-07-22 → 1개 파일 불러옴

--- ML 모델 학습 및 예측 시작 ---


  df.groupby('stock_code')['close_price'].transform(lambda x: x.rolling(window=60, min_periods=60).mean().shift(-59))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5025
[LightGBM] [Info] Number of data points in the train set: 94262, number of used features: 10
[LightGBM] [Info] Start training from score -0.716220
[LightGBM] [Info] Start training from score -1.887358
[LightGBM] [Info] Start training from score -1.021838
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5025
[LightGBM] [Info] Number of data points in the train set: 94262, number of used features: 10
[LightGBM] [Info] Start training from score 5594.750000


array([[4524,  164,  605],
       [ 462,  954,  330],
       [ 739,  143, 2553]])

정확도: 0.7668
MAE: 2643.3477
학습 및 예측 완료
업로드 완료
업로드 완료
---------------------------------------------
