In [34]:
# Optional one-time setup: uncomment to install the third-party packages imported below.
# !pip install yfinance tqdm pytz --quiet

In [35]:
import yfinance as yf
import pandas as pd
from tqdm import tqdm
import pytz

X 데이터 수집

In [36]:
# ============================================
# Ticker symbols whose daily bars are downloaded as raw inputs for the X features
# ============================================
symbols = ['^VIX','GC=F','SHY']

In [37]:
# =============================
# 2. Download window for the time series (labels are later expressed in New York time).
#    With these bounds the index printed further down runs from 2025-11-10 to
#    2025-11-18, i.e. the END date itself is not part of the downloaded rows.
# =============================
START = "2025-11-10"
END   = "2025-11-19"

In [38]:
# Fetch daily ("1d") bars for every symbol in a single call; auto_adjust=False is passed explicitly.
print("[1/2] 시계열 데이터 수집 중 ...")
data = yf.download(symbols, start=START, end=END, interval="1d", auto_adjust=False)

[**********************67%*******                ]  2 of 3 completed

[*********************100%***********************]  3 of 3 completed

[1/2] 시계열 데이터 수집 중 ...





In [39]:
# Re-label every daily bar with the 09:30 New York open of its trading day.
#
# NOTE(review): this assumes the downloaded labels are tz-naive midnight timestamps,
# which is consistent with the printed range matching START/END — confirm if the
# yfinance version changes.
NY_TZ = "America/New_York"

# Reading the labels as UTC and viewing them in New York time places each bar on the
# previous evening (New York is behind UTC).
ny_index = pd.to_datetime(data.index, utc=True).tz_convert(NY_TZ)

# Guard against re-running this cell: once the labels already sit at 09:30 New York
# time, applying the +1 day shift again would silently move every row to the next day.
already_at_open = (
    getattr(data.index, "tz", None) is not None
    and len(ny_index) > 0
    and bool(((ny_index.hour == 9) & (ny_index.minute == 30)).all())
)

if already_at_open:
    session_times = ny_index
else:
    # Step one day forward and truncate to midnight to get back to the bar's own
    # calendar date, then move to the 09:30 open.
    session_times = (ny_index + pd.Timedelta(days=1)).normalize() + pd.Timedelta(hours=9, minutes=30)

# Midnight (New York) of each session, kept under the name the original cell defined.
session_dates = session_times.normalize()

# Use the session-open timestamps as the index.
data.index = session_times

print("인덱스(라벨)를 실제 뉴욕 개장 시각(09:30)으로 조정 완료")
print(data.index.min(), "→", data.index.max())

인덱스(라벨)를 실제 뉴욕 개장 시각(09:30)으로 조정 완료
2025-11-10 09:30:00-05:00 → 2025-11-18 09:30:00-05:00


In [40]:
print("데이터 레벨명:", data.columns.names)   # names of the two column levels; this run printed ['Price', 'Ticker']
print("기간:", data.index.min().date(), "~", data.index.max().date())  # first and last label dates

데이터 레벨명: ['Price', 'Ticker']
기간: 2025-11-10 ~ 2025-11-18


In [41]:
# Keep only the High / Low / Volume fields and flatten the two-level
# (field, ticker) column labels into single "<ticker>_<field>" strings.
wanted_fields = ["High", "Low", "Volume"]
ohlcv = data[wanted_fields].copy()
ohlcv.columns = [f"{ticker}_{field}" for field, ticker in ohlcv.columns]

In [42]:
# Daily range (High - Low) per symbol, used here as the volatility measure.
# Symbols whose High or Low column is absent are skipped.
daily_ranges = {
    f"{sym}_Volatility": ohlcv[f"{sym}_High"] - ohlcv[f"{sym}_Low"]
    for sym in symbols
    if f"{sym}_High" in ohlcv.columns and f"{sym}_Low" in ohlcv.columns
}
volatility = pd.DataFrame(daily_ranges, index=ohlcv.index)

# Raw fields and derived ranges side by side, ordered by timestamp.
price_features = pd.concat([ohlcv, volatility], axis=1).sort_index()

In [43]:
# Check dtypes and non-null counts of the combined feature table.
price_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2025-11-10 09:30:00-05:00 to 2025-11-18 09:30:00-05:00
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GC=F_High        7 non-null      float64
 1   SHY_High         7 non-null      float64
 2   ^VIX_High        7 non-null      float64
 3   GC=F_Low         7 non-null      float64
 4   SHY_Low          7 non-null      float64
 5   ^VIX_Low         7 non-null      float64
 6   GC=F_Volume      7 non-null      int64  
 7   SHY_Volume       7 non-null      int64  
 8   ^VIX_Volume      7 non-null      int64  
 9   ^VIX_Volatility  7 non-null      float64
 10  GC=F_Volatility  7 non-null      float64
 11  SHY_Volatility   7 non-null      float64
dtypes: float64(9), int64(3)
memory usage: 728.0 bytes


In [44]:
# List the column labels before deciding which ones to drop.
price_features.columns

Index(['GC=F_High', 'SHY_High', '^VIX_High', 'GC=F_Low', 'SHY_Low', '^VIX_Low',
       'GC=F_Volume', 'SHY_Volume', '^VIX_Volume', '^VIX_Volatility',
       'GC=F_Volatility', 'SHY_Volatility'],
      dtype='object')

In [45]:
# Drop the raw High/Low inputs and the SHY/VIX volumes, leaving GC=F volume plus the
# three High-Low range columns. errors="ignore" keeps the cell re-runnable: without it
# a second execution raises KeyError because the columns are already gone.
raw_columns_to_drop = [
    'GC=F_High', 'SHY_High', '^VIX_High',
    'GC=F_Low', 'SHY_Low', '^VIX_Low',
    'SHY_Volume', '^VIX_Volume',
]
price_features = price_features.drop(columns=raw_columns_to_drop, errors="ignore")

In [46]:
# Confirm which columns remain after the drop.
price_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2025-11-10 09:30:00-05:00 to 2025-11-18 09:30:00-05:00
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GC=F_Volume      7 non-null      int64  
 1   ^VIX_Volatility  7 non-null      float64
 2   GC=F_Volatility  7 non-null      float64
 3   SHY_Volatility   7 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 280.0 bytes


In [47]:
# Show the reduced feature table.
price_features

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-11-10 09:30:00-05:00,189,1.219999,51.799805,0.029999
2025-11-11 09:30:00-05:00,569,0.76,33.300293,0.040001
2025-11-12 09:30:00-05:00,396,0.959999,101.5,0.030006
2025-11-13 09:30:00-05:00,167,3.799999,65.0,0.029999
2025-11-14 09:30:00-05:00,483,3.470001,150.899902,0.089996
2025-11-17 09:30:00-05:00,1552,3.9,80.100098,0.020004
2025-11-18 09:30:00-05:00,1375,2.93,27.799805,0.07


In [48]:
import pandas as pd

# 1) Read the investment-indicator CSV.
path = "../버전1(top60)/data/output_data/투자지표_555_평균.csv"
df_ind = pd.read_csv(path)

# 2) Take the last non-missing PER value in file order.
#    NOTE(review): presumably the file is sorted oldest -> newest so this is the
#    most recent PER — confirm against the file.
per_column = df_ind["PER(배)"]
last_per = per_column[per_column.notna()].iloc[-1]

# 3) Broadcast that single value to every row of price_features as a new column.
price_features["PER(배)"] = last_per

print(price_features.tail())

                           GC=F_Volume  ^VIX_Volatility  GC=F_Volatility  \
Date                                                                       
2025-11-12 09:30:00-05:00          396         0.959999       101.500000   
2025-11-13 09:30:00-05:00          167         3.799999        65.000000   
2025-11-14 09:30:00-05:00          483         3.470001       150.899902   
2025-11-17 09:30:00-05:00         1552         3.900000        80.100098   
2025-11-18 09:30:00-05:00         1375         2.930000        27.799805   

                           SHY_Volatility     PER(배)  
Date                                                  
2025-11-12 09:30:00-05:00        0.030006  57.209831  
2025-11-13 09:30:00-05:00        0.029999  57.209831  
2025-11-14 09:30:00-05:00        0.089996  57.209831  
2025-11-17 09:30:00-05:00        0.020004  57.209831  
2025-11-18 09:30:00-05:00        0.070000  57.209831  


In [49]:
# Show the feature table again, now including the PER(배) column.
price_features

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,PER(배)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-11-10 09:30:00-05:00,189,1.219999,51.799805,0.029999,57.209831
2025-11-11 09:30:00-05:00,569,0.76,33.300293,0.040001,57.209831
2025-11-12 09:30:00-05:00,396,0.959999,101.5,0.030006,57.209831
2025-11-13 09:30:00-05:00,167,3.799999,65.0,0.029999,57.209831
2025-11-14 09:30:00-05:00,483,3.470001,150.899902,0.089996,57.209831
2025-11-17 09:30:00-05:00,1552,3.9,80.100098,0.020004,57.209831
2025-11-18 09:30:00-05:00,1375,2.93,27.799805,0.07,57.209831


Y 데이터 수집

In [50]:
import yfinance as yf
import pandas as pd
import numpy as np

# ============================================
# 1) Ticker for the target (Y) series
# ============================================
symbols = ["SPY"]

# ============================================
# 2) Download the daily time series
# ============================================
START = "2025-11-10"
END   = "2025-11-19"

print("[1/2] 시계열 데이터 수집 중 ...")
data = yf.download(symbols, start=START, end=END, interval="1d", auto_adjust=False)

# Read the labels as UTC, view them in New York time, step one day forward,
# truncate to midnight and add 09:30. In this run the net effect is that each
# row keeps its own calendar date (2025-11-10 ... 2025-11-18 in the printed
# output) and is stamped with the 09:30 New York open.
ny_index = pd.to_datetime(data.index, utc=True).tz_convert("America/New_York")
session_dates = (ny_index + pd.Timedelta(days=1)).normalize()
session_times = session_dates + pd.Timedelta(hours=9, minutes=30)
data.index = session_times

print("NY 개장 시간으로 라벨 조정 완료")
first_label, last_label = data.index.min(), data.index.max()
print("기간:", first_label.date(), "~", last_label.date())

# ============================================
# 3) Flatten the two-level columns into "<field>_<ticker>" strings
# ============================================
data.columns = [f"{field}_{ticker}" for field, ticker in data.columns]

# ============================================
# 4) Keep the close, copy it as y_target, and
# 5) add a plain-date "Date" column while discarding the timestamp index.
#    y_target is an unshifted copy of Close_SPY, so no row is empty and
#    nothing is dropped here.
# ============================================
ohlcv = (
    data[["Close_SPY"]]
    .assign(y_target=lambda frame: frame["Close_SPY"])
    .assign(Date=lambda frame: frame.index.tz_localize(None).normalize().date)
    .reset_index(drop=True)
)

# Put the Date column first, keeping the other columns in their existing order.
ohlcv = ohlcv[["Date"] + [name for name in ohlcv.columns if name != "Date"]]

# ============================================
# 6) Output
# ============================================
print(ohlcv)

[*********************100%***********************]  1 of 1 completed

[1/2] 시계열 데이터 수집 중 ...
NY 개장 시간으로 라벨 조정 완료
기간: 2025-11-10 ~ 2025-11-18
         Date   Close_SPY    y_target
0  2025-11-10  681.440002  681.440002
1  2025-11-11  683.000000  683.000000
2  2025-11-12  683.380005  683.380005
3  2025-11-13  672.039978  672.039978
4  2025-11-14  671.929993  671.929993
5  2025-11-17  665.669983  665.669983
6  2025-11-18  660.080017  660.080017





In [51]:
# Close_SPY holds the same values as y_target, so only Date and y_target are kept.
# errors="ignore" lets the cell be re-run; a plain drop raises KeyError the second time
# because the column is already gone.
ohlcv = ohlcv.drop(columns=['Close_SPY'], errors="ignore")

In [52]:
# Show the target table (Date, y_target).
ohlcv

Unnamed: 0,Date,y_target
0,2025-11-10,681.440002
1,2025-11-11,683.0
2,2025-11-12,683.380005
3,2025-11-13,672.039978
4,2025-11-14,671.929993
5,2025-11-17,665.669983
6,2025-11-18,660.080017


X, Y 데이터 병합

In [67]:
# 1) Features: replace the tz-aware 09:30 timestamps with plain datetime.date
#    labels on an index named "Date".
pf = price_features.copy()
pf.index = pd.Index(pd.to_datetime(pf.index).tz_localize(None).date, name="Date")

# 2) Target: normalise the Date column to datetime.date and use it as the index.
ohlcv2 = (
    ohlcv
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]).dt.date)
    .set_index("Date")
)

# 3) Join on the Date index. how="inner" keeps only dates present on both sides;
#    switch to left/outer if unmatched dates should be retained.
merged_df = pf.join(ohlcv2, how="inner")

print(merged_df.shape)
print(merged_df.head())


(7, 6)
            GC=F_Volume  ^VIX_Volatility  GC=F_Volatility  SHY_Volatility  \
Date                                                                        
2025-11-10          189         1.219999        51.799805        0.029999   
2025-11-11          569         0.760000        33.300293        0.040001   
2025-11-12          396         0.959999       101.500000        0.030006   
2025-11-13          167         3.799999        65.000000        0.029999   
2025-11-14          483         3.470001       150.899902        0.089996   

               PER(배)    y_target  
Date                               
2025-11-10  57.209831  681.440002  
2025-11-11  57.209831  683.000000  
2025-11-12  57.209831  683.380005  
2025-11-13  57.209831  672.039978  
2025-11-14  57.209831  671.929993  


In [68]:
# Show the merged feature + target table.
merged_df

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,PER(배),y_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-11-10,189,1.219999,51.799805,0.029999,57.209831,681.440002
2025-11-11,569,0.76,33.300293,0.040001,57.209831,683.0
2025-11-12,396,0.959999,101.5,0.030006,57.209831,683.380005
2025-11-13,167,3.799999,65.0,0.029999,57.209831,672.039978
2025-11-14,483,3.470001,150.899902,0.089996,57.209831,671.929993
2025-11-17,1552,3.9,80.100098,0.020004,57.209831,665.669983
2025-11-18,1375,2.93,27.799805,0.07,57.209831,660.080017


In [69]:
# Remove every row that has a missing value in any column
# (for this sample nothing is removed: 7 rows before and after).
merged_df_clean = merged_df.dropna()

In [70]:
# Show the table after dropna().
merged_df_clean

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,PER(배),y_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-11-10,189,1.219999,51.799805,0.029999,57.209831,681.440002
2025-11-11,569,0.76,33.300293,0.040001,57.209831,683.0
2025-11-12,396,0.959999,101.5,0.030006,57.209831,683.380005
2025-11-13,167,3.799999,65.0,0.029999,57.209831,672.039978
2025-11-14,483,3.470001,150.899902,0.089996,57.209831,671.929993
2025-11-17,1552,3.9,80.100098,0.020004,57.209831,665.669983
2025-11-18,1375,2.93,27.799805,0.07,57.209831,660.080017


In [71]:
# 1. Add the natural log of 'y_target' as 'y_target_log'.
#    assign() builds a new frame instead of writing a column into the result of
#    dropna(), which can trigger pandas' SettingWithCopyWarning.
merged_df_clean = merged_df_clean.assign(y_target_log=np.log(merged_df_clean['y_target']))

# 2. Target (y): the log-transformed price.
y = merged_df_clean['y_target_log']

# 3. Features (X): every column except the two target columns and 'PER(배)'.
#    'PER(배)' is left out here and re-attached, untransformed, to X_transformed
#    in a later cell.
X = merged_df_clean.drop(columns=['y_target', 'y_target_log','PER(배)'])

In [72]:
# -------------------------------
# 1) Automatic log-transform helper
# -------------------------------
def auto_log_transform_by_value(merged_df_clean):
    """Log-transform each numeric column, choosing the function by the column minimum.

    For every numeric column of a copy of the input frame:
      * minimum < 0      -> column is left untouched and a skip message is printed
      * 0 <= minimum < 1 -> np.log1p is applied
      * otherwise        -> np.log is applied

    Parameters
    ----------
    merged_df_clean : pandas.DataFrame
        Input frame; it is copied and never modified.

    Returns
    -------
    tuple
        (transformed copy of the frame, list of the column names that were transformed)
    """
    print(f"--- 자동 로그 변환 시작 (값 범위 기반) ---")

    result = merged_df_clean.copy()
    changed = []

    for name in result.select_dtypes(include=[np.number]).columns:
        series = result[name]
        lowest = series.min()

        # Negative values have no real logarithm: report and move on.
        if lowest < 0:
            print(f"[{name}] 건너뜀: 음수 값이 포함되어 있습니다. (최소값: {lowest:.2f})")
            continue

        if lowest < 1:
            # Values below 1 (including 0): log1p keeps the result finite and non-negative.
            result[name] = np.log1p(series)
            message = f"[{name}] 변환 적용 (log1p): 최소값={lowest:.2f} (< 1)"
        else:
            result[name] = np.log(series)
            message = f"[{name}] 변환 적용 (log): 최소값={lowest:.2f} (>= 1)"

        changed.append(name)
        print(message)

    print(f"--- 총 {len(changed)}개 열 변환 완료 ---")

    return result, changed

In [73]:
# Log-transform the feature columns, then print the skewness of each transformed column.
X_transformed, transformed_list_X = auto_log_transform_by_value(X)

print("\n[X 변환된 컬럼 왜도]")
print(X_transformed[transformed_list_X].skew())

--- 자동 로그 변환 시작 (값 범위 기반) ---
[GC=F_Volume] 변환 적용 (log): 최소값=167.00 (>= 1)
[^VIX_Volatility] 변환 적용 (log1p): 최소값=0.76 (< 1)
[GC=F_Volatility] 변환 적용 (log): 최소값=27.80 (>= 1)
[SHY_Volatility] 변환 적용 (log1p): 최소값=0.02 (< 1)
--- 총 4개 열 변환 완료 ---

[X 변환된 컬럼 왜도]
GC=F_Volume        0.128861
^VIX_Volatility   -0.369041
GC=F_Volatility    0.002195
SHY_Volatility     1.205010
dtype: float64


In [74]:
# Show merged_df_clean, which now also carries the y_target_log column.
merged_df_clean

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,PER(배),y_target,y_target_log
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-11-10,189,1.219999,51.799805,0.029999,57.209831,681.440002,6.524208
2025-11-11,569,0.76,33.300293,0.040001,57.209831,683.0,6.526495
2025-11-12,396,0.959999,101.5,0.030006,57.209831,683.380005,6.527051
2025-11-13,167,3.799999,65.0,0.029999,57.209831,672.039978,6.510318
2025-11-14,483,3.470001,150.899902,0.089996,57.209831,671.929993,6.510154
2025-11-17,1552,3.9,80.100098,0.020004,57.209831,665.669983,6.500794
2025-11-18,1375,2.93,27.799805,0.07,57.209831,660.080017,6.492361


In [75]:
# Re-attach PER(배) unchanged; it was left out of X, so it is not log-transformed.
X_transformed['PER(배)'] = merged_df_clean['PER(배)']

In [76]:
# Append the log target so that features and target end up in one table.
X_transformed['y_target_log'] = merged_df_clean['y_target_log']

In [77]:
# Show the final table: transformed features, PER(배) and y_target_log.
X_transformed

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,PER(배),y_target_log
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-11-10,5.241747,0.797507,3.947386,0.029558,57.209831,6.524208
2025-11-11,6.34388,0.565314,3.505566,0.039222,57.209831,6.526495
2025-11-12,5.981414,0.672944,4.620059,0.029565,57.209831,6.527051
2025-11-13,5.117994,1.568616,4.174387,0.029558,57.209831,6.510318
2025-11-14,6.180017,1.497389,5.016617,0.086174,57.209831,6.510154
2025-11-17,7.3473,1.589235,4.383277,0.019807,57.209831,6.500794
2025-11-18,7.226209,1.36864,3.325029,0.067658,57.209831,6.492361


In [78]:
# Check dtypes and non-null counts before exporting.
X_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, 2025-11-10 to 2025-11-18
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GC=F_Volume      7 non-null      float64
 1   ^VIX_Volatility  7 non-null      float64
 2   GC=F_Volatility  7 non-null      float64
 3   SHY_Volatility   7 non-null      float64
 4   PER(배)           7 non-null      float64
 5   y_target_log     7 non-null      float64
dtypes: float64(6)
memory usage: 392.0+ bytes


In [80]:
# Write the final table to CSV; the Date index is written as the first column (pandas default index=True).
# NOTE(review): to_csv does not create missing directories — the ../예측 folder must already exist.
X_transformed.to_csv("../예측/log샘플데이터.csv")