In [1]:
import numpy as np
import pandas as pd
import os

def calculate_trend_slope(data, window_size):
    """Return the linear-trend slope of one window, scaled by its mean magnitude.

    A first-degree polynomial is fitted to the samples against their
    positions ``0 .. window_size - 1``; the fitted slope is then divided by
    the mean absolute value of the samples so that series with different
    scales become comparable.

    NaN samples are dropped (together with their positions) *before* the
    fit, so the regression is computed exactly once and only on valid data.

    Args:
        data: One window of samples. Must provide ``astype`` (e.g. a pandas
            Series or a NumPy array). It is expected to hold exactly
            ``window_size`` samples; longer input is rejected by NumPy
            because the sample and position lengths no longer match.
        window_size: Number of positions in the window.

    Returns:
        The standardized slope as a float, or ``None`` when it is undefined:
        fewer than two non-NaN samples remain, or the mean absolute value of
        the samples is zero.

    Raises:
        ValueError: If ``data`` has fewer than ``window_size`` samples.
    """
    if len(data) < window_size:
        raise ValueError("데이터 길이가 구간 크기보다 작습니다.")

    # Work on a plain ndarray so the boolean mask below is purely positional.
    values = np.asarray(data.astype(float))
    positions = np.arange(window_size)

    # Remove missing samples first; fitting on NaNs only yields NaN (or an
    # error) and would have to be thrown away anyway.
    valid = ~np.isnan(values)
    if not valid.all():
        values = values[valid]
        positions = positions[valid]

    # A straight line needs at least two points to be determined.
    if values.size < 2:
        return None

    abs_mean = np.mean(np.abs(values))
    if abs_mean == 0:
        # Every sample is zero: the normalisation would divide by zero.
        return None

    slope = np.polyfit(positions, values, 1)[0]
    return slope / abs_mean

def calculate_trend_slope_daily(data, window_size):
    """Compute rolling standardized trend slopes for every column of ``data``.

    For each column, a window of ``window_size`` consecutive rows is slid
    one row at a time from the top of the frame to the bottom, and
    ``calculate_trend_slope`` is evaluated on each window.

    Args:
        data: DataFrame whose columns are processed independently.
        window_size: Number of consecutive rows in each window.

    Returns:
        A dict mapping each column name to the list of slopes, one per
        window position (``len(data) - window_size + 1`` entries; empty when
        the frame has fewer rows than ``window_size``).
    """
    n_windows = len(data) - window_size + 1
    slopes_by_column = {}
    for name in data.columns.tolist():
        series = data[name]
        slopes_by_column[name] = [
            calculate_trend_slope(
                series.iloc[start:start + window_size], window_size
            )
            for start in range(n_windows)
        ]
    return slopes_by_column

# Load the data: the first column of 'Sheet1' becomes the index.
raw = r'feature_database_0628_ver.xlsx'
df = pd.read_excel(raw, sheet_name='Sheet1', index_col=0)
df.index = pd.to_datetime(df.index)  # convert the index to datetimes
df = df.dropna(axis=0)  # drop every row that contains a NaN value

# Compute rolling trend slopes for each column.
# The first slope of a window of size w covers rows 0..w-1, so the results
# are labelled with the index starting at row w-1 (the last row of each window).
window_size_20 = 20
result_20 = calculate_trend_slope_daily(df, window_size_20)
new_df_20 = df.iloc[window_size_20-1:]
result_df_20 = pd.DataFrame(result_20)
result_df_20.index = new_df_20.index

# Same computation with a 60-row window.
window_size_60 = 60
result_60 = calculate_trend_slope_daily(df, window_size_60)
new_df_60 = df.iloc[window_size_60-1:]
result_df_60 = pd.DataFrame(result_60)
result_df_60.index = new_df_60.index

# Join the raw features with both slope tables on the index; slope columns
# get a '_20' / '_60' suffix. pd.merge defaults to an inner join, so only
# dates present in all three frames are kept.
merge_inner = pd.merge(df, result_df_20.add_suffix('_20'), left_index=True, right_index=True)
merge_inner = pd.merge(merge_inner, result_df_60.add_suffix('_60'), left_index=True, right_index=True)

# Reduce to one row per month, taking the first data of each month.
# NOTE(review): GroupBy.first() returns the first non-null value per column,
# which can differ from the month's first row when a slope is missing
# (calculate_trend_slope returns None for an all-zero window) — confirm this
# is the intended behaviour.
merge_inner['month'] = merge_inner.index.to_period('M')  # map each index entry to its month
df_first_day_of_month = merge_inner.groupby('month').first()  # pick the first data of each month
# NOTE(review): to_timestamp() labels each row with the start of the month,
# not with the date of the observation actually selected — confirm.
df_first_day_of_month.index = df_first_day_of_month.index.to_timestamp().strftime('%Y-%m-%d')  # format the index as 'YYYY-MM-DD' strings
print(df_first_day_of_month)

# Save the result to an Excel file.
# base_filename[:-5] strips the '.xlsx' extension; a numeric suffix _0, _1, ...
# is appended and incremented until a file name that does not exist yet is found,
# so earlier outputs are never overwritten.
base_filename = "Feature_data_base_20_60_1.xlsx"
i = 0
while os.path.exists(f"{base_filename[:-5]}_{i}.xlsx"):
    i += 1
new_filename = f"{base_filename[:-5]}_{i}.xlsx"
df_first_day_of_month.to_excel(new_filename, index=True)

# List of keyword names; not referenced anywhere in the code shown above.
keywords = ["auto","construct","capital_market","chemicals","equipment",
            "transport","semi","bank","steel",
            "telecom","staples","discretionary","kospi"]


                  WTI  DGS2  DGS10  TIPS    VIX      PPI      코스피    per  \
month                                                                      
2010-04-01  84.530000  1.05   3.89  1.61  17.47  172.200  1719.17  24.13   
2010-05-01  86.190000  1.00   3.72  1.32  20.19  173.900  1721.21  15.23   
2010-06-01  72.700000  0.78   3.29  1.32  35.54  175.200  1630.40  14.75   
2010-07-01  72.950000  0.63   2.96  1.21  32.86  176.100  1686.24  15.06   
2010-08-01  81.250000  0.56   2.99  1.13  22.01  174.800  1782.27  15.23   
...               ...   ...    ...   ...    ...      ...      ...    ...   
2024-02-01  74.360000  4.20   1.68  1.68  13.88  242.618  2542.46  18.67   
2024-03-01  79.670000  4.61   1.88  1.88  13.49  244.078  2674.27  19.66   
2024-04-01  84.540000  4.72   1.98  1.98  13.65  248.720  2747.86  20.21   
2024-05-01  80.250484  4.82   4.47  2.15  13.23  249.646  2734.36  21.09   
2024-06-01  76.894733  4.82   4.41  2.08  13.11  251.543  2682.52  20.70   

           