In [1]:
import pandas as pd

# Load the four input tables. The paths are read in the order listed and
# bound, position by position, to df_env, df_stock, df_ind and y.
df_env, df_stock, df_ind, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../output_data/환경변수_통합.csv",              # -> df_env
        "../output_data/top60기업주식평균_일별지표.csv",  # -> df_stock
        "../output_data/투자지표_555_평균.csv",          # -> df_ind
        "../output_data/top60_y_target_data.csv",       # -> y
    )
)

# -------------------------------------------------------------------
# 1) Date conversion: sidestep timezone info by going through strings
#    and keeping only the calendar-date part
# -------------------------------------------------------------------

def normalize_date(series):
    """Parse a column of date-like values into date-only timestamps.

    Each value is rendered as a string and cut down to its first 10
    characters (the ``YYYY-MM-DD`` prefix), so any trailing time of day or
    UTC offset is discarded before parsing.

    Parameters
    ----------
    series : pandas.Series
        Values whose string form starts with ``YYYY-MM-DD``.

    Returns
    -------
    pandas.Series
        Datetime values; entries whose 10-character prefix does not match
        ``%Y-%m-%d`` are coerced to ``NaT`` instead of raising.
    """
    date_prefix = series.astype(str).str[:10]
    return pd.to_datetime(date_prefix, format="%Y-%m-%d", errors="coerce")

# Normalize the join key of the three feature tables to date-only timestamps
# and drop rows whose date could not be parsed. normalize_date coerces bad
# values to NaT, and pandas.merge matches null keys with each other, so NaT
# rows left in more than one frame would be cross-joined into the result.
df_stock = df_stock.assign(Date=normalize_date(df_stock["Date"])).dropna(subset=["Date"])
df_env = df_env.assign(Date=normalize_date(df_env["Date"])).dropna(subset=["Date"])
df_ind = df_ind.assign(Date=normalize_date(df_ind["Date"])).dropna(subset=["Date"])

# -------------------------------------------------------------------
# 2) df_stock(Date) + df_env(Date) inner join
# -------------------------------------------------------------------
merged_1 = pd.merge(
    df_stock,
    df_env,
    on="Date",
    how="inner"
)

# -------------------------------------------------------------------
# 3) merged_1(Date) + df_ind(Date) inner join
# -------------------------------------------------------------------
final_df = pd.merge(
    merged_1,
    df_ind,
    on="Date",
    how="inner"
)

# -------------------------------------------------------------------
# 4) Inspect the result
# -------------------------------------------------------------------
print(final_df.head())
print(final_df.shape)
print(final_df.dtypes)

# ------------------------------------------------------
# 5) Clean the dates of the y table and build the y_target column
# ------------------------------------------------------
# Same treatment as the feature tables: parse the dates, then discard rows
# with an unparseable date so they cannot match other NaT keys in the merge.
y = y.assign(Date=normalize_date(y["Date"])).dropna(subset=["Date"])

# y_target = Close_SPY
y = y.rename(columns={"Close_SPY": "y_target"})

# Keep only the join key and the target; any other columns are dropped
y = y[["Date", "y_target"]]

# ------------------------------------------------------
# 6) final_df + y_target inner join
# ------------------------------------------------------
final_df = pd.merge(
    final_df,
    y,
    on="Date",
    how="inner"
)

# ------------------------------------------------------
# 7) Inspect the result
# ------------------------------------------------------
print(final_df.head())
print(final_df.shape)
print(final_df.columns)


        Date        High         Low        Volume       Close  Volatility  \
0 2022-11-07  159.266873  155.416743  2.422511e+07  158.288672    3.850129   
1 2022-11-08  161.183501  157.012456  2.774416e+07  159.389651    4.171045   
2 2022-11-09  159.729943  155.755290  2.600689e+07  156.143013    3.974652   
3 2022-11-10  165.475252  159.964003  3.690776e+07  164.653242    5.511249   
4 2022-11-11  168.213936  162.340353  3.221336e+07  165.934455    5.873583   

   DX-Y.NYB_Close   GC=F_Close   GLD_Close  SHY_Close  ...     PER(배)  \
0      110.120003  1676.500000  155.850006  80.610001  ...  29.475254   
1      109.620003  1712.099976  159.449997  80.690002  ...  29.475254   
2      110.550003  1710.099976  158.649994  80.820000  ...  29.475254   
3      108.209999  1750.300049  163.479996  81.260002  ...  29.475254   
4      106.290001  1766.000000  164.559998  81.239998  ...  29.475254   

      PBR(배)     PCR(배)    SPS(달러)   DPS(달러)  EPS(달러)    BPS(달러)    CPS(달러)  \
0  11.644286 

In [2]:
# Bare last-line expression: the notebook renders final_df as the cell output
final_df

Unnamed: 0,Date,High,Low,Volume,Close,Volatility,DX-Y.NYB_Close,GC=F_Close,GLD_Close,SHY_Close,...,PBR(배),PCR(배),SPS(달러),DPS(달러),EPS(달러),BPS(달러),CPS(달러),ROA(%),ROIC(%),y_target
0,2022-11-07,159.266873,155.416743,2.422511e+07,158.288672,3.850129,110.120003,1676.500000,155.850006,80.610001,...,11.644286,29.772373,55.195254,2.177966,6.406780,38.125593,12.635254,8.800000,17.843103,379.950012
1,2022-11-08,161.183501,157.012456,2.774416e+07,159.389651,4.171045,109.620003,1712.099976,159.449997,80.690002,...,11.644286,29.772373,55.195254,2.177966,6.406780,38.125593,12.635254,8.800000,17.843103,382.000000
2,2022-11-09,159.729943,155.755290,2.600689e+07,156.143013,3.974652,110.550003,1710.099976,158.649994,80.820000,...,11.644286,29.772373,55.195254,2.177966,6.406780,38.125593,12.635254,8.800000,17.843103,374.130005
3,2022-11-10,165.475252,159.964003,3.690776e+07,164.653242,5.511249,108.209999,1750.300049,163.479996,81.260002,...,11.644286,29.772373,55.195254,2.177966,6.406780,38.125593,12.635254,8.800000,17.843103,394.690002
4,2022-11-11,168.213936,162.340353,3.221336e+07,165.934455,5.873583,106.290001,1766.000000,164.559998,81.239998,...,11.644286,29.772373,55.195254,2.177966,6.406780,38.125593,12.635254,8.800000,17.843103,398.510010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,2025-11-03,327.134669,318.054334,1.983944e+07,322.874002,9.080335,99.870003,4000.300049,368.779999,82.720001,...,14.815893,26.317288,69.308136,2.895254,10.008814,49.845593,11.579831,12.547458,26.120339,683.340027
750,2025-11-04,324.485566,315.981418,2.154276e+07,319.568666,8.504149,100.220001,3947.699951,362.320007,82.750000,...,14.815893,26.317288,69.308136,2.895254,10.008814,49.845593,11.579831,12.547458,26.120339,675.239990
751,2025-11-05,325.360668,316.155665,1.882375e+07,321.918001,9.205003,100.199997,3980.300049,366.510010,82.690002,...,14.815893,26.317288,69.308136,2.895254,10.008814,49.845593,11.579831,12.547458,26.120339,677.580017
752,2025-11-06,324.613499,315.241166,2.138675e+07,319.116336,9.372333,99.730003,3979.899902,366.070007,82.790001,...,14.815893,26.317288,69.308136,2.895254,10.008814,49.845593,11.579831,12.547458,26.120339,670.309998


In [3]:
# Promote "Date" to the index. The guard keeps this cell safe to re-run:
# once "Date" is the index it is no longer a column, so repeating the
# conversion unguarded would raise KeyError on final_df["Date"].
if final_df.index.name != "Date":
    final_df = (
        final_df
        .assign(Date=pd.to_datetime(final_df["Date"]))  # new frame; no in-place column write
        .set_index("Date")
    )

In [4]:
# Inspect the index to confirm "Date" is now the index of final_df
final_df.index

DatetimeIndex(['2022-11-07', '2022-11-08', '2022-11-09', '2022-11-10',
               '2022-11-11', '2022-11-14', '2022-11-15', '2022-11-16',
               '2022-11-17', '2022-11-18',
               ...
               '2025-10-27', '2025-10-28', '2025-10-29', '2025-10-30',
               '2025-10-31', '2025-11-03', '2025-11-04', '2025-11-05',
               '2025-11-06', '2025-11-07'],
              dtype='datetime64[ns]', name='Date', length=754, freq=None)

In [5]:
# Save the merged frame to CSV. index=True writes the index as the first
# column; the utf-8-sig encoding prefixes the file with a BOM.
final_df.to_csv(
    "../output_data/top60_final_merged_y_close.csv",
    index=True,
    encoding="utf-8-sig",
)