In [None]:
# parameters
# Run-time inputs of this notebook. All three are None here and are presumably
# overridden by an injected parameters cell (e.g. papermill) — TODO confirm.
AWS_ACCESS_KEY_ID = None  # access key id handed to boto3.client in the S3 setup cell
AWS_SECRET_ACCESS_KEY = None  # secret access key handed to boto3.client in the S3 setup cell
target_date = None  # partition date interpolated into the S3 prefix "derived_stock/stock_dt=<target_date>/"

In [None]:
import boto3
import os
import pandas as pd
import numpy as np
import io
import json
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import botocore.exceptions

import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Where the input partitions and the model artifacts live. The prefix selects
# the single date partition named by the `target_date` notebook parameter.
bucket_name = "de6-team7-bucket"
prefix = f"derived_stock/stock_dt={target_date}/"

# Credentials are taken verbatim from the notebook parameters.
aws_access_key_id, aws_secret_access_key = AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY

# One shared S3 client, pinned to us-east-1, used by every helper below.
s3_client = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [None]:
# Parse the value as a datetime and re-format it when possible; return None when it is not.
def try_convert(s, fmt="%Y-%m-%d %H:%M:%S"):
    """Normalize a date/time value to a formatted string.

    Args:
        s: Value handed to ``pd.to_datetime`` (e.g. ``"2025-07-22 09:05"``).
        fmt: ``strftime`` pattern used to render the parsed timestamp.

    Returns:
        The formatted string, or ``None`` when ``s`` cannot be parsed or the
        parsed value cannot be formatted.
    """
    try:
        dt = pd.to_datetime(s, errors='raise')  # raises on unparseable input
        return dt.strftime(fmt)
    except Exception:
        # Best-effort conversion: any parsing/formatting failure maps to None.
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently turned into None.
        return None

def load_data(exec_date_str: str) -> pd.DataFrame:
    """Load every parquet file of one date partition from S3 into a DataFrame.

    Objects are listed under ``derived_stock/stock_dt=<exec_date_str>/`` in
    ``bucket_name``; each ``.parquet`` object is downloaded, reduced to the
    price/volume columns used downstream, and its ``trade_time_min`` value is
    prefixed with the date and normalized through ``try_convert``.

    Args:
        exec_date_str: Partition date. It is used for the S3 prefix, for the
            date part of ``trade_time_min`` and in the progress messages.

    Returns:
        The concatenation of all parquet files of the partition, or an empty
        DataFrame when the partition holds no objects or no parquet files.

    Raises:
        KeyError: If a parquet file lacks one of the expected columns.
    """
    date_prefix = f"derived_stock/stock_dt={exec_date_str}/"

    # A single list_objects_v2 response is capped (1000 keys), so walk all
    # pages instead of trusting the first response only.
    paginator = s3_client.get_paginator('list_objects_v2')
    object_keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=bucket_name, Prefix=date_prefix)
        for obj in page.get('Contents', [])
    ]
    if not object_keys:
        print(f"누락된 날짜: {exec_date_str} - 데이터 없음")
        return pd.DataFrame()

    file_keys = [key for key in object_keys if key.endswith('.parquet')]
    if not file_keys:
        print(f"Parquet 파일 없음: {exec_date_str}")
        return pd.DataFrame()

    wanted_columns = [
        'trade_time_min', 'stock_code', 'open_price', 'high_price', 'low_price',
        'close_price', 'cum_volume', 'cum_amount', 'vwap_price', 'change_rate', 'spread',
    ]

    df_list = []
    for key in file_keys:
        s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
        buffer = io.BytesIO(s3_object['Body'].read())
        # .copy() detaches the column subset from the frame it was sliced from,
        # so the assignment below never writes through a view.
        df_temp = pd.read_parquet(buffer)[wanted_columns].copy()

        # Element-wise map over the one column that is needed; a row-wise
        # DataFrame.apply(axis=1) would build a Series per row for no benefit.
        stamped = f"{exec_date_str} " + df_temp['trade_time_min'].astype(str)
        df_temp['trade_time_min'] = stamped.map(try_convert)
        df_list.append(df_temp)

    print(f"{exec_date_str} → {len(file_keys)}개 파일 불러옴")

    return pd.concat(df_list, ignore_index=True)

In [None]:
def generate_features(df, price_change_threshold=0.002):
    """Derive the training targets from per-minute price rows.

    Rows are ordered by ``trade_time_min`` then ``stock_code`` and three
    columns are added:

    * ``future_avg_price``: within each ``stock_code``, the window-1 rolling
      mean of ``close_price`` shifted back by one row, i.e. the next row's
      close for the same stock.
    * ``future_return``: ``(future_avg_price - close_price) / close_price``.
    * ``target_direction``: ``2`` when ``future_return`` is above
      ``price_change_threshold``, ``0`` when it is below the negated
      threshold, ``1`` otherwise.

    Rows without a following row for their stock (NaN targets) are dropped.
    The input frame is left untouched; a new DataFrame is returned.

    Args:
        df: Frame with at least ``trade_time_min``, ``stock_code`` and
            ``close_price`` columns.
        price_change_threshold: Absolute return that separates the up/down
            classes from the neutral class.

    Returns:
        The labelled frame (``stock_code`` as ``category``), or an empty
        DataFrame when the input is empty, nothing survives the NaN drop, or
        any step fails (the error is printed, not raised).
    """
    try:
        if df.empty:
            print("Empty dataframe")
            return pd.DataFrame()

        # --- extra preprocessing ---
        # sort_values (without inplace) returns a new frame, so every step
        # below works on our own copy and the caller's DataFrame is not mutated.
        features = df.sort_values(by=['trade_time_min', 'stock_code'])
        features['stock_code'] = features['stock_code'].astype('category')
        # observed=True: only codes present in the data form groups; being
        # explicit also avoids the pandas FutureWarning about the changing
        # default for categorical group keys.
        features['future_avg_price'] = (
            features.groupby('stock_code', observed=True)['close_price']
            .transform(lambda x: x.rolling(window=1, min_periods=1).mean().shift(-1))
        )
        features['future_return'] = (
            (features['future_avg_price'] - features['close_price']) / features['close_price']
        )
        features['target_direction'] = np.select(
            [features['future_return'] > price_change_threshold,
             features['future_return'] < -price_change_threshold],
            [2, 0], default=1
        )
        print('전:', features.shape)
        features = features.dropna(subset=['future_avg_price', 'future_return'])
        print('후:', features.shape)
        if features.empty:
            print("Empty dataframe")
            return pd.DataFrame()
        return features
    except Exception as e:
        # Best-effort by design: report the problem and hand back an empty
        # frame so the caller can skip training.
        print(e)
        return pd.DataFrame()

In [None]:
def check_booster_exists(bucket_name: str, s3_key: str) -> bool:
    """Report whether an object is present in S3.

    Issues a ``head_object`` request through the shared ``s3_client``.

    Args:
        bucket_name: Bucket to look in.
        s3_key: Key of the object to probe.

    Returns:
        ``True`` when the request succeeds, ``False`` when it fails with
        error code ``'404'``.

    Raises:
        botocore.exceptions.ClientError: For any other error code.
    """
    try:
        s3_client.head_object(Bucket=bucket_name, Key=s3_key)
    except botocore.exceptions.ClientError as err:
        if err.response['Error']['Code'] != '404':
            raise  # anything but "not found" is a real failure
        return False  # the object does not exist
    else:
        return True  # the object exists

In [None]:
def load_booster(model_type='clf'):
    """Download a stored LightGBM booster and its parameter file from S3.

    The artifacts are read from ``models/stock/`` in ``bucket_name``, written
    to ``/tmp/`` and loaded from there.

    Args:
        model_type: ``'reg'`` selects the regressor artifacts; any other
            value selects the classifier artifacts.

    Returns:
        Tuple ``(booster, params)``: the ``lgb.Booster`` loaded from the
        downloaded model file and the dict parsed from the params JSON.

    Raises:
        FileNotFoundError: If the booster object does not exist in S3.
        Exception: Errors from the S3 download or from parsing the files
            propagate unchanged.
    """
    model_path = "/tmp/"
    booster_name, params_name = (
        ("stock_reg_booster.txt", "stock_reg_params.json")
        if model_type == 'reg'
        else ("stock_clf_booster.txt", "stock_clf_params.json")
    )

    s3_booster_key = f'models/stock/{booster_name}'
    s3_params_key = f'models/stock/{params_name}'

    if not check_booster_exists(bucket_name, s3_booster_key):
        # A bare `raise` here has no active exception to re-raise and would
        # surface as an opaque RuntimeError; name the missing object instead.
        raise FileNotFoundError(
            f"No saved booster found at s3://{bucket_name}/{s3_booster_key}"
        )

    s3_client.download_file(bucket_name, s3_booster_key, model_path + booster_name)  # booster download
    s3_client.download_file(bucket_name, s3_params_key, model_path + params_name)  # params download
    booster = lgb.Booster(model_file=model_path + booster_name)
    with open(model_path + params_name, 'r') as f:
        params = json.load(f)
    return booster, params

In [None]:
def save_booster(booster, params, booster_type='clf'):
    """Persist a booster and its parameters locally, then upload both to S3.

    The model is written with ``booster.save_model`` and the parameters with
    ``json.dump`` into ``/tmp/``; both files are then uploaded under
    ``models/stock/`` in ``bucket_name`` (booster first, params second), with
    a confirmation line printed after each upload.

    Args:
        booster: Object exposing ``save_model(path)``.
        params: JSON-serializable parameters stored next to the model.
        booster_type: ``'reg'`` selects the regressor file names; any other
            value selects the classifier file names.

    Raises:
        Exception: Any error from saving or uploading propagates unchanged.
    """
    local_dir = "/tmp/"
    if booster_type == 'reg':
        booster_file, params_file = "stock_reg_booster.txt", "stock_reg_params.json"
    else:
        booster_file, params_file = "stock_clf_booster.txt", "stock_clf_params.json"

    local_booster = local_dir + booster_file
    local_params = local_dir + params_file

    # Write both artifacts to disk before touching S3.
    booster.save_model(local_booster)
    with open(local_params, 'w') as handle:
        json.dump(params, handle)

    # Upload in a fixed order: model file first, parameter file second.
    uploads = (
        (local_booster, f'models/stock/{booster_file}'),
        (local_params, f'models/stock/{params_file}'),
    )
    for local_file, remote_key in uploads:
        s3_client.upload_file(local_file, bucket_name, remote_key)
        print(f"업로드 완료: s3://{bucket_name}/{remote_key}")

In [None]:
def train_and_save_models(df: pd.DataFrame):
    """Continue training the stored classifier and regressor on ``df`` and upload them.

    Every column except ``trade_time_min``, ``future_return``,
    ``future_avg_price`` and ``target_direction`` is used as a feature. The
    first 90% of the rows (by position) form the training set and the rest
    the validation set. The classifier predicts ``target_direction`` and the
    regressor predicts ``future_avg_price``; each one is resumed from the
    booster returned by ``load_booster`` for up to 100 rounds with early
    stopping after 10 rounds without improvement. Accuracy, MAE and the
    confusion matrix on the validation rows are reported, then both boosters
    are stored with ``save_booster``.

    Args:
        df: Labelled feature frame.

    Returns:
        None. Returns early without training when ``df`` is empty or when the
        training rows contain fewer than two distinct ``target_direction``
        values.

    Raises:
        Exception: Any error from loading, training or saving propagates.
    """
    print("\n--- ML 모델 학습 및 예측 시작 ---")
    if df.empty:
        print("DataFrame is empty. Skipping training.")
        return

    excluded = ['trade_time_min', 'future_return', 'future_avg_price', 'target_direction']
    features = df[df.columns.difference(excluded)]

    # --- train & save ---
    # Positional 90/10 split; the same cut is applied to features and targets.
    cut = int(len(df) * 0.9)
    X_fit, X_holdout = features[:cut], features[cut:]

    def _resume(model_type, default_metric, y_fit, y_holdout):
        """Load the stored booster of `model_type` and keep training it."""
        fit_set = lgb.Dataset(X_fit, label=y_fit)
        holdout_set = lgb.Dataset(X_holdout, label=y_holdout)
        booster, params = load_booster(model_type)
        print(f"{model_type} params: ", params)
        if 'metric' not in params:
            params['metric'] = default_metric
        booster = lgb.train(
            params,
            train_set=fit_set,
            valid_sets=[holdout_set],
            init_model=booster,
            num_boost_round=100,
            callbacks=[lgb.early_stopping(stopping_rounds=10)],
        )
        return booster, params

    direction = df['target_direction']
    direction_fit, direction_holdout = direction[:cut], direction[cut:]
    if len(np.unique(direction_fit)) < 2:
        print("Dataset is TOO small")
        return
    clf_booster, clf_params = _resume('clf', 'multi_logloss', direction_fit, direction_holdout)

    price = df['future_avg_price']
    price_fit, price_holdout = price[:cut], price[cut:]
    reg_booster, reg_params = _resume('reg', 'mae', price_fit, price_holdout)

    # Score both models on the validation rows (regressor first, as before).
    price_pred = reg_booster.predict(X_holdout)
    # The classifier yields one score per class; take the most likely class.
    direction_pred = np.argmax(clf_booster.predict(X_holdout), axis=1)
    mae = mean_absolute_error(price_holdout, price_pred)
    accuracy = accuracy_score(direction_holdout, direction_pred)
    display(confusion_matrix(direction_holdout, direction_pred))
    print(f"정확도: {accuracy:.4f}")
    print(f"MAE: {mae:.4f}")
    print("학습 및 예측 완료")

    save_booster(clf_booster, clf_params)
    save_booster(reg_booster, reg_params, 'reg')
    return

In [None]:
if __name__ == "__main__":
    # End-to-end run for the `target_date` partition: load -> label -> train/upload.
    raw = load_data(target_date)  # empty DataFrame when the partition has no objects / parquet files
    data = generate_features(raw)  # empty DataFrame when labelling fails or no row survives the NaN drop
    train_and_save_models(data)  # returns early, without uploading, when `data` is empty

2025-07-22 → 1개 파일 불러옴


  df.groupby('stock_code')['close_price'].transform(lambda x: x.rolling(window=1, min_periods=1).mean().shift(-1))


전: (33330, 14)
후: (30570, 14)

--- ML 모델 학습 및 예측 시작 ---
clf params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': 'multiclass', 'random_state': 42, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'num_class': 3}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002989 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5020
[LightGBM] [Info] Number of data points in the train set: 27513, number of used features: 10
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[146]	valid_0's multi_logloss: 0.876018
reg params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsam

array([[   0,    0,    0],
       [  61, 2982,   14],
       [   0,    0,    0]])

정확도: 0.9755
MAE: 2589.5819
학습 및 예측 완료
업로드 완료: s3://de6-team7-bucket/models/stock/stock_clf_booster.txt
업로드 완료: s3://de6-team7-bucket/models/stock/stock_clf_params.json
업로드 완료: s3://de6-team7-bucket/models/stock/stock_reg_booster.txt
업로드 완료: s3://de6-team7-bucket/models/stock/stock_reg_params.json
