In [1]:
# !pip install yfinance tqdm pytz --quiet

In [1]:
import yfinance as yf
import pandas as pd
from tqdm import tqdm
import pytz

### X 및 Y 데이터 수집 (nav)

In [2]:
# SPY_NAV(자산가치) 데이터 가져오기
import pandas as pd

# Location of the SPY NAV history workbook.
path = "../버전2(SPY)/data/raw_data/navhist-us-en-spy (1).xlsx"

# Read the sheet, parse Date (cells that are not dates become NaT), discard the
# NaT rows, and keep the surviving dates as 'YYYY-MM-DD' strings.
df = (
    pd.read_excel(path)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], errors="coerce"))
    .dropna(subset=["Date"])
    .assign(Date=lambda frame: frame["Date"].dt.strftime("%Y-%m-%d"))
)

# Only Date and NAV are needed downstream; NAV is exposed as SPY_NAV.
nav_df = df.loc[:, ["Date", "NAV"]].rename(columns={"NAV": "SPY_NAV"})

print(nav_df.head())

         Date     SPY_NAV
0  2025-11-18  660.127046
1  2025-11-17  665.567433
2  2025-11-14  671.649115
3  2025-11-13  671.850237
4  2025-11-12  683.129392


In [3]:
# Column dtypes and non-null counts of the full NAV history.
nav_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5531 entries, 0 to 5530
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     5531 non-null   object 
 1   SPY_NAV  5531 non-null   float64
dtypes: float64(1), object(1)
memory usage: 129.6+ KB


In [4]:
import pandas as pd

# NAV window to keep; both endpoints are included.
start, end = pd.Timestamp("2025-11-10"), pd.Timestamp("2025-11-18")

# Parse the 'YYYY-MM-DD' strings, keep the rows inside the window, order them
# oldest-first, then turn Date back into 'YYYY-MM-DD' strings.
nav_df = (
    nav_df.assign(Date=pd.to_datetime(nav_df["Date"]))
    .loc[lambda frame: frame["Date"].between(start, end)]
    .sort_values("Date")
    .assign(Date=lambda frame: frame["Date"].dt.strftime("%Y-%m-%d"))
)

print(nav_df.head())


         Date     SPY_NAV
6  2025-11-10  681.263847
5  2025-11-11  682.675112
4  2025-11-12  683.129392
3  2025-11-13  671.850237
2  2025-11-14  671.649115


In [5]:
# Dtypes and row count of the NAV frame after the date-window filter.
nav_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, 6 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     7 non-null      object 
 1   SPY_NAV  7 non-null      float64
dtypes: float64(1), object(1)
memory usage: 168.0+ bytes


In [6]:
# Show the filtered NAV rows (sorted oldest-first by the previous step).
nav_df

Unnamed: 0,Date,SPY_NAV
6,2025-11-10,681.263847
5,2025-11-11,682.675112
4,2025-11-12,683.129392
3,2025-11-13,671.850237
2,2025-11-14,671.649115
1,2025-11-17,665.567433
0,2025-11-18,660.127046


In [7]:
# --------------------------------------------
# Tickers to download (target series)
# --------------------------------------------
symbols = ["SPY"]

In [8]:
# -----------------------------
# Date range passed to the price download below.
# NOTE(review): the downloaded data shown in this notebook ends on 2025-11-18,
# so END looks exclusive — confirm against the yfinance docs.
# -----------------------------
START, END = "2025-11-10", "2025-11-19"

In [9]:
# Fetch daily bars for the tickers in `symbols`; auto_adjust=False is passed
# through to yfinance unchanged.
print("[1/2] 시계열 데이터 수집 중 ...")
data = yf.download(
    symbols,
    start=START,
    end=END,
    interval="1d",
    auto_adjust=False,
)

[1/2] 시계열 데이터 수집 중 ...


[*********************100%***********************]  1 of 1 completed


In [10]:
# Relabel every daily bar with the New York session open (09:30 America/New_York)
# of the bar's own calendar date.
#
# The label is rebuilt from the wall-clock date of the current index instead of
# reinterpreting the index as UTC and shifting it forward by one day. The result
# is the same for the timezone-naive midnight index this cell receives on its
# first run, but re-running the cell (or receiving an index that is already
# timezone-aware) no longer moves every row one day ahead.
calendar_dates = pd.DatetimeIndex(data.index).tz_localize(None).normalize()

# Midnight of each session date in New York time (same meaning as before).
session_dates = calendar_dates.tz_localize("America/New_York")

# Add 09:30 while still timezone-naive, then localize, so the label is exactly
# 09:30 local time whatever the UTC offset of that date is.
session_times = (calendar_dates + pd.Timedelta(hours=9, minutes=30)).tz_localize(
    "America/New_York"
)

# Use the session-open timestamps as the index.
data.index = session_times

print("인덱스(라벨)를 실제 뉴욕 개장 시각(09:30)으로 조정 완료")
print(data.index.min(), "→", data.index.max())

인덱스(라벨)를 실제 뉴욕 개장 시각(09:30)으로 조정 완료
2025-11-10 09:30:00-05:00 → 2025-11-18 09:30:00-05:00


In [11]:
# Names of the two column levels (this run printed ['Price', 'Ticker']).
print("데이터 레벨명:", data.columns.names)

# First and last session date covered by the download.
first_session, last_session = data.index.min(), data.index.max()
print("기간:", first_session.date(), "~", last_session.date())

데이터 레벨명: ['Price', 'Ticker']
기간: 2025-11-10 ~ 2025-11-18


In [12]:
# Keep only the Close level and flatten the (field, ticker) column pairs into
# single '<ticker>_<field>' names, e.g. 'SPY_Close'.
ohlcv = data[["Close"]].copy()
ohlcv.columns = [f"{ticker}_{field}" for (field, ticker) in ohlcv.columns]

In [13]:
# Show the flattened close-price frame, indexed by session-open timestamp.
ohlcv

Unnamed: 0_level_0,SPY_Close
Date,Unnamed: 1_level_1
2025-11-10 09:30:00-05:00,681.440002
2025-11-11 09:30:00-05:00,683.0
2025-11-12 09:30:00-05:00,683.380005
2025-11-13 09:30:00-05:00,672.039978
2025-11-14 09:30:00-05:00,671.929993
2025-11-17 09:30:00-05:00,665.669983
2025-11-18 09:30:00-05:00,660.080017


In [14]:
# NAV side: turn Date into plain datetime.date objects so it can be the join key.
nav_df["Date"] = pd.to_datetime(nav_df["Date"]).dt.date

# Price side: drop the timezone from the index, move the index into a Date
# column (renamed from 'index' if the index was unnamed) and keep the date part.
ohlcv2 = (
    ohlcv.tz_localize(None)
    .reset_index()
    .rename(columns={"index": "Date"})
    .assign(Date=lambda frame: frame["Date"].dt.date)
)

# Inner join on Date: only days present on both sides survive.
final_df = nav_df.merge(ohlcv2[["Date", "SPY_Close"]], on="Date", how="inner")

# Premium of the close over NAV, in percent of NAV.
final_df["SPY_Premium_pct"] = (
    final_df["SPY_Close"].sub(final_df["SPY_NAV"]).div(final_df["SPY_NAV"]).mul(100)
)

In [15]:
# Show NAV, close and the derived premium side by side.
final_df

Unnamed: 0,Date,SPY_NAV,SPY_Close,SPY_Premium_pct
0,2025-11-10,681.263847,681.440002,0.025857
1,2025-11-11,682.675112,683.0,0.04759
2,2025-11-12,683.129392,683.380005,0.036686
3,2025-11-13,671.850237,672.039978,0.028242
4,2025-11-14,671.649115,671.929993,0.041819
5,2025-11-17,665.567433,665.669983,0.015408
6,2025-11-18,660.127046,660.080017,-0.007124


In [16]:
# Drop the raw NAV column now that the premium has been derived from it.
# errors="ignore" keeps this cell re-runnable: on a second run SPY_NAV is already
# gone, and a plain drop would raise KeyError.
final_df = final_df.drop(columns=["SPY_NAV"], errors="ignore")

# Quick check of the remaining columns and the frame size.
print(final_df.head())
print(final_df.shape)

         Date   SPY_Close  SPY_Premium_pct
0  2025-11-10  681.440002         0.025857
1  2025-11-11  683.000000         0.047590
2  2025-11-12  683.380005         0.036686
3  2025-11-13  672.039978         0.028242
4  2025-11-14  671.929993         0.041819
(7, 3)


### X데이터 수집 (환경변수)

In [17]:
# --------------------------------------------
# Tickers to download (environment variables); replaces the earlier `symbols`.
# --------------------------------------------
symbols = ["^VIX", "GC=F", "SHY"]

In [18]:
# -----------------------------
# Date range passed to the price download below (same values as for SPY).
# NOTE(review): the downloaded data shown in this notebook ends on 2025-11-18,
# so END looks exclusive — confirm against the yfinance docs.
# -----------------------------
START, END = "2025-11-10", "2025-11-19"

In [19]:
# Fetch daily bars for the tickers in `symbols`; this overwrites the `data`
# frame downloaded earlier. auto_adjust=False is passed through unchanged.
print("[1/2] 시계열 데이터 수집 중 ...")
data = yf.download(
    symbols,
    start=START,
    end=END,
    interval="1d",
    auto_adjust=False,
)

[1/2] 시계열 데이터 수집 중 ...


[*********************100%***********************]  3 of 3 completed


In [20]:
# Relabel every daily bar with the New York session open (09:30 America/New_York)
# of the bar's own calendar date.
#
# The label is rebuilt from the wall-clock date of the current index instead of
# reinterpreting the index as UTC and shifting it forward by one day. The result
# is the same for the timezone-naive midnight index this cell receives on its
# first run, but re-running the cell (or receiving an index that is already
# timezone-aware) no longer moves every row one day ahead.
calendar_dates = pd.DatetimeIndex(data.index).tz_localize(None).normalize()

# Midnight of each session date in New York time (same meaning as before).
session_dates = calendar_dates.tz_localize("America/New_York")

# Add 09:30 while still timezone-naive, then localize, so the label is exactly
# 09:30 local time whatever the UTC offset of that date is.
session_times = (calendar_dates + pd.Timedelta(hours=9, minutes=30)).tz_localize(
    "America/New_York"
)

# Use the session-open timestamps as the index.
data.index = session_times

print("인덱스(라벨)를 실제 뉴욕 개장 시각(09:30)으로 조정 완료")
print(data.index.min(), "→", data.index.max())

인덱스(라벨)를 실제 뉴욕 개장 시각(09:30)으로 조정 완료
2025-11-10 09:30:00-05:00 → 2025-11-18 09:30:00-05:00


In [21]:
# Names of the two column levels (this run printed ['Price', 'Ticker']).
print("데이터 레벨명:", data.columns.names)

# First and last session date covered by the download.
first_session, last_session = data.index.min(), data.index.max()
print("기간:", first_session.date(), "~", last_session.date())

데이터 레벨명: ['Price', 'Ticker']
기간: 2025-11-10 ~ 2025-11-18


In [22]:
# Keep the High, Low and Volume levels and flatten the (field, ticker) column
# pairs into single '<ticker>_<field>' names, e.g. '^VIX_High'.
ohlcv = data[["High", "Low", "Volume"]].copy()
ohlcv.columns = [f"{ticker}_{field}" for (field, ticker) in ohlcv.columns]

In [23]:
# Intraday range (High - Low) per symbol, in the order of `symbols`; a symbol is
# skipped unless both of its High and Low columns are present.
volatility = pd.DataFrame(
    {
        f"{sym}_Volatility": ohlcv[f"{sym}_High"] - ohlcv[f"{sym}_Low"]
        for sym in symbols
        if f"{sym}_High" in ohlcv.columns and f"{sym}_Low" in ohlcv.columns
    },
    index=ohlcv.index,
)

# Raw columns followed by the range columns, rows ordered by timestamp.
price_features = pd.concat([ohlcv, volatility], axis=1).sort_index()

In [24]:
# Dtypes and non-null counts of the raw columns plus the derived range columns.
price_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2025-11-10 09:30:00-05:00 to 2025-11-18 09:30:00-05:00
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GC=F_High        7 non-null      float64
 1   SHY_High         7 non-null      float64
 2   ^VIX_High        7 non-null      float64
 3   GC=F_Low         7 non-null      float64
 4   SHY_Low          7 non-null      float64
 5   ^VIX_Low         7 non-null      float64
 6   GC=F_Volume      7 non-null      int64  
 7   SHY_Volume       7 non-null      int64  
 8   ^VIX_Volume      7 non-null      int64  
 9   ^VIX_Volatility  7 non-null      float64
 10  GC=F_Volatility  7 non-null      float64
 11  SHY_Volatility   7 non-null      float64
dtypes: float64(9), int64(3)
memory usage: 728.0 bytes


In [25]:
# List the column names before deciding which ones to drop.
price_features.columns

Index(['GC=F_High', 'SHY_High', '^VIX_High', 'GC=F_Low', 'SHY_Low', '^VIX_Low',
       'GC=F_Volume', 'SHY_Volume', '^VIX_Volume', '^VIX_Volatility',
       'GC=F_Volatility', 'SHY_Volatility'],
      dtype='object')

In [26]:
# Remove the raw High/Low columns and the SHY / ^VIX volumes, leaving
# GC=F_Volume and the three *_Volatility columns.
# errors="ignore" keeps this cell re-runnable: on a second run these columns are
# already gone, and a plain drop would raise KeyError.
price_features = price_features.drop(
    columns=[
        "GC=F_High", "SHY_High", "^VIX_High",
        "GC=F_Low", "SHY_Low", "^VIX_Low",
        "SHY_Volume", "^VIX_Volume",
    ],
    errors="ignore",
)

In [27]:
# Confirm which columns remain after the drop.
price_features.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2025-11-10 09:30:00-05:00 to 2025-11-18 09:30:00-05:00
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GC=F_Volume      7 non-null      int64  
 1   ^VIX_Volatility  7 non-null      float64
 2   GC=F_Volatility  7 non-null      float64
 3   SHY_Volatility   7 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 280.0 bytes


In [28]:
# Show the remaining feature columns, indexed by session-open timestamp.
price_features

Unnamed: 0_level_0,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-11-10 09:30:00-05:00,189,1.219999,51.799805,0.029999
2025-11-11 09:30:00-05:00,569,0.76,33.300293,0.040001
2025-11-12 09:30:00-05:00,396,0.959999,101.5,0.030006
2025-11-13 09:30:00-05:00,167,3.799999,65.0,0.029999
2025-11-14 09:30:00-05:00,483,3.470001,150.899902,0.089996
2025-11-17 09:30:00-05:00,1552,3.9,80.100098,0.020004
2025-11-18 09:30:00-05:00,1375,2.93,27.799805,0.07


### X, Y 데이터 병합

In [29]:
# Target side (final_df): turn Date into plain datetime.date objects so it can
# be the join key.
final_df["Date"] = pd.to_datetime(final_df["Date"]).dt.date

# Feature side (price_features): drop the timezone from the index, move the
# index into a Date column (renamed from 'index' if the index was unnamed) and
# keep the date part.
price_features2 = (
    price_features.tz_localize(None)
    .reset_index()
    .rename(columns={"index": "Date"})
    .assign(Date=lambda frame: frame["Date"].dt.date)
)

# Inner join on Date: features first, then the premium and the close.
merged_df = price_features2.merge(
    final_df[["Date", "SPY_Premium_pct", "SPY_Close"]],
    on="Date",
    how="inner",
)

In [30]:
# Show the joined feature / premium / close frame.
merged_df

Unnamed: 0,Date,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,SPY_Premium_pct,SPY_Close
0,2025-11-10,189,1.219999,51.799805,0.029999,0.025857,681.440002
1,2025-11-11,569,0.76,33.300293,0.040001,0.04759,683.0
2,2025-11-12,396,0.959999,101.5,0.030006,0.036686,683.380005
3,2025-11-13,167,3.799999,65.0,0.029999,0.028242,672.039978
4,2025-11-14,483,3.470001,150.899902,0.089996,0.041819,671.929993
5,2025-11-17,1552,3.9,80.100098,0.020004,0.015408,665.669983
6,2025-11-18,1375,2.93,27.799805,0.07,-0.007124,660.080017


In [31]:
# The SPY close is the prediction target; expose it under the name y_target.
merged_df = merged_df.rename(columns={"SPY_Close": "y_target"})

In [32]:
# Show the frame again with the target column renamed.
merged_df

Unnamed: 0,Date,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,SPY_Premium_pct,y_target
0,2025-11-10,189,1.219999,51.799805,0.029999,0.025857,681.440002
1,2025-11-11,569,0.76,33.300293,0.040001,0.04759,683.0
2,2025-11-12,396,0.959999,101.5,0.030006,0.036686,683.380005
3,2025-11-13,167,3.799999,65.0,0.029999,0.028242,672.039978
4,2025-11-14,483,3.470001,150.899902,0.089996,0.041819,671.929993
5,2025-11-17,1552,3.9,80.100098,0.020004,0.015408,665.669983
6,2025-11-18,1375,2.93,27.799805,0.07,-0.007124,660.080017


### SPY 로그 변환

In [33]:
import numpy as np

In [34]:
# Natural log of the target, stored next to the raw value as y_target_log.
target_log = np.log(merged_df["y_target"])
merged_df["y_target_log"] = target_log

# y: the log target.
y = merged_df["y_target_log"]

# X: every remaining column once the raw target, the log target and the premium
# are removed (the Date column is not in the removal list, so it stays in X).
non_feature_cols = ["y_target", "y_target_log", "SPY_Premium_pct"]
X = merged_df.drop(columns=non_feature_cols)

In [35]:
# -------------------------------
# 1) 자동 로그 변환 함수 (그대로 사용)
# -------------------------------
def auto_log_transform_by_value(merged_df):
    """Log-transform the numeric columns of a frame, choosing the transform per column.

    The choice is driven by each numeric column's minimum value:

    * minimum is NaN (no valid values)  -> column left untouched and skipped
    * minimum < 0                       -> column left untouched and skipped
    * 0 <= minimum < 1                  -> ``np.log1p`` is applied
    * minimum >= 1                      -> ``np.log`` is applied

    Non-numeric columns are never touched. A progress line is printed for every
    numeric column.

    NOTE: because the rule looks at the minimum of the data it is given, the same
    column can receive ``log`` on one dataset and ``log1p`` on another.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The transformed copy, and the names of the columns that were actually
        transformed, in column order.
    """
    print("--- 자동 로그 변환 시작 (값 범위 기반) ---")

    df_transformed = merged_df.copy()
    transformed_cols = []

    for col in df_transformed.select_dtypes(include=[np.number]).columns:
        col_data = df_transformed[col]
        col_min = col_data.min()

        # A NaN minimum means the column holds no valid value. Both range checks
        # below are False for NaN, so without this guard the column would be
        # reported (and counted) as log-transformed.
        if pd.isna(col_min):
            print(f"[{col}] 건너뜀: 유효한 값이 없습니다.")
            continue

        if col_min < 0:
            # The logarithm is undefined for negative values; leave the column as is.
            print(f"[{col}] 건너뜀: 음수 값이 포함되어 있습니다. (최소값: {col_min:.2f})")
            continue

        if col_min < 1:
            # Values in [0, 1) are present: log1p stays finite at 0.
            df_transformed[col] = np.log1p(col_data)
            print(f"[{col}] 변환 적용 (log1p): 최소값={col_min:.2f} (< 1)")
        else:
            df_transformed[col] = np.log(col_data)
            print(f"[{col}] 변환 적용 (log): 최소값={col_min:.2f} (>= 1)")

        transformed_cols.append(col)

    print(f"--- 총 {len(transformed_cols)}개 열 변환 완료 ---")

    return df_transformed, transformed_cols

In [36]:
# Transform the feature frame; the second value lists the columns that changed.
X_transformed, transformed_list_X = auto_log_transform_by_value(X)

# Skewness of the transformed columns only.
print("\n[X 변환된 컬럼 왜도]")
transformed_skew = X_transformed[transformed_list_X].skew()
print(transformed_skew)

--- 자동 로그 변환 시작 (값 범위 기반) ---
[GC=F_Volume] 변환 적용 (log): 최소값=167.00 (>= 1)
[^VIX_Volatility] 변환 적용 (log1p): 최소값=0.76 (< 1)
[GC=F_Volatility] 변환 적용 (log): 최소값=27.80 (>= 1)
[SHY_Volatility] 변환 적용 (log1p): 최소값=0.02 (< 1)
--- 총 4개 열 변환 완료 ---

[X 변환된 컬럼 왜도]
GC=F_Volume        0.128861
^VIX_Volatility   -0.369041
GC=F_Volatility    0.002195
SHY_Volatility     1.205010
dtype: float64


In [37]:
# Show merged_df, which still holds the untransformed features plus y_target_log.
merged_df

Unnamed: 0,Date,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,SPY_Premium_pct,y_target,y_target_log
0,2025-11-10,189,1.219999,51.799805,0.029999,0.025857,681.440002,6.524208
1,2025-11-11,569,0.76,33.300293,0.040001,0.04759,683.0,6.526495
2,2025-11-12,396,0.959999,101.5,0.030006,0.036686,683.380005,6.527051
3,2025-11-13,167,3.799999,65.0,0.029999,0.028242,672.039978,6.510318
4,2025-11-14,483,3.470001,150.899902,0.089996,0.041819,671.929993,6.510154
5,2025-11-17,1552,3.9,80.100098,0.020004,0.015408,665.669983,6.500794
6,2025-11-18,1375,2.93,27.799805,0.07,-0.007124,660.080017,6.492361


In [38]:
# Re-attach the premium column as-is: it was dropped from X before the log
# transform, so it has not been transformed.
X_transformed['SPY_Premium_pct'] = merged_df['SPY_Premium_pct']

In [39]:
# Attach the log target so that features and target sit in the same frame.
X_transformed['y_target_log'] = merged_df['y_target_log']

In [40]:
# Show the final frame: Date, log-transformed features, premium and log target.
X_transformed

Unnamed: 0,Date,GC=F_Volume,^VIX_Volatility,GC=F_Volatility,SHY_Volatility,SPY_Premium_pct,y_target_log
0,2025-11-10,5.241747,0.797507,3.947386,0.029558,0.025857,6.524208
1,2025-11-11,6.34388,0.565314,3.505566,0.039222,0.04759,6.526495
2,2025-11-12,5.981414,0.672944,4.620059,0.029565,0.036686,6.527051
3,2025-11-13,5.117994,1.568616,4.174387,0.029558,0.028242,6.510318
4,2025-11-14,6.180017,1.497389,5.016617,0.086174,0.041819,6.510154
5,2025-11-17,7.3473,1.589235,4.383277,0.019807,0.015408,6.500794
6,2025-11-18,7.226209,1.36864,3.325029,0.067658,-0.007124,6.492361


In [41]:
# Dtypes and non-null counts of the frame about to be saved.
X_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             7 non-null      object 
 1   GC=F_Volume      7 non-null      float64
 2   ^VIX_Volatility  7 non-null      float64
 3   GC=F_Volatility  7 non-null      float64
 4   SHY_Volatility   7 non-null      float64
 5   SPY_Premium_pct  7 non-null      float64
 6   y_target_log     7 non-null      float64
dtypes: float64(6), object(1)
memory usage: 524.0+ bytes


In [42]:
# Save the sample frame as CSV. to_csv writes the index by default, so the
# RangeIndex becomes an unnamed first column in the file.
# NOTE(review): confirm the consumer expects that column; otherwise pass index=False.
X_transformed.to_csv("../예측/SPYlog샘플데이터.csv")