In [15]:
import numpy as np
import pandas as pd
import os

In [16]:
def calculate_trend_slope(data, window_size):
    if len(data) < window_size:
        raise ValueError("데이터 길이가 구간 크기보다 작습니다.")
    # float으로 data를 우선 만든다 그 이유는 소수점으로 나올 듯 하기 때문  좀더 정확하게 계산 하기 위해서
    window_data = data.astype(float)
    # x는 window size 만큼의 배열을 생성 
    # 이렇게 생성된 배열 X는 추세 선을 적합하기 위해 사용될 데이터 포인트의 인덱스를 나타낸다
    # 추세 선의 기울기를 결정하는데 사용됩니다.
    X = np.arange(window_size)
    # 함수는 다항식을 피팅하기 위해 주어진 데이터에 대해 최소 제곱 방법을 사용하여 다항식의 계수를 계산합니다.
    coefficients = np.polyfit(X, window_data, 1)  
    slope = coefficients[0]  # 추세선의 기울기를 추출합니다.
    nan_indices = np.isnan(window_data)
    # 데이터의 절대값의 평균을 계산합니다.
    abs_mean = np.mean(np.abs(window_data)) 
    # abs_mean이 0인 경우 예외를 발생시킵니다.
    if np.any(nan_indices):
        window_data = window_data[~nan_indices]
        X = X[~nan_indices]
        coefficients = np.polyfit(X, window_data, 1)
        slope = coefficients[0]
    abs_mean = np.mean(np.abs(window_data))
    
    if abs_mean == 0:
        return None
    else:
        standardized_slope = slope / abs_mean
        return standardized_slope

def calculate_trend_slope_daily(data, window_size):
    result = {}
    # 여기가 매우 중요한 부분 우리가 원하는 column을 tolist로 생성해야 한다
    column_names = data.columns.tolist()
    # 생성후 차례차례 column으로 넘겨서 각 column마다 돌아서 하기 위해 2중 for문 사용
    for column in column_names:
        # 계속해서 list를 생성할 수 없으니 for문 column마다 초기화
        trend_slope_result = []
        # 우리가 원하는 값이 data의 값과 window size값의 차이만큼을 원하므로 이런식으로 사용
        for i in range(len(data) - window_size + 1):
            #window_data에 iloc을 사용해서 i 부터 i+ window_size 만큼 준다 그 이후 window_data의 값을 선형회기 함수에 
            # 이 값을 result에 append 하고 result [column]에 넣고 처음으로 다시 돌아간다
            window_data = data[column].iloc[i:i+window_size]
            trend_slope = calculate_trend_slope(window_data, window_size)
            trend_slope_result.append(trend_slope)
        result[column] = trend_slope_result
    return result

In [17]:
# 데이터 불러오기 우리가 원하는 값들을 불러오면됨
raw = r'feature_database_0628_ver.xlsx'
df = pd.read_excel(raw, sheet_name='Sheet1', index_col=0)
df.index = df.index.strftime('%Y-%m-%d')

In [18]:
df.head()

Unnamed: 0_level_0,WTI,DGS2,DGS10,TIPS,VIX,PPI,코스피,per,pbr,원달러,...,발틱운임,scfi,미 소맥 선물,미 대두 선물,항공 여객,소비자심리지수,ICT 전망 BSI,IT 월별 수출현황,SOX,메모리 수출금액
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,,,,,,,,,,,...,0.0,1072.58,540.75,1048.63,6210262.0,116.9,90,11722815985,,1390919
2010-01-02,,,,,,,,,,,...,0.0,1072.58,,,6210262.0,116.9,90,11722815985,,1390919
2010-01-03,,,,,,,,,,,...,0.0,1072.58,,,6210262.0,116.9,90,11722815985,,1390919
2010-01-04,81.52,1.09,3.85,1.47,20.04,170.8,1696.14,23.87,23.87,1154.8,...,3140.0,1072.58,557.63,1058.13,6210262.0,116.9,90,11722815985,366.1,1390919
2010-01-05,81.74,1.01,3.77,1.43,19.35,170.8,1690.62,23.8,23.8,1140.5,...,3270.0,1072.58,552.63,1059.88,6210262.0,116.9,90,11722815985,366.4,1390919


In [19]:
# 3. NaN 값이 있는 열 삭제
df= df.dropna(axis=0)

# 결과 출력
print(df)

                  WTI  DGS2  DGS10  TIPS    VIX      PPI      코스피    per  \
date                                                                       
2010-01-05  81.740000  1.01   3.77  1.43  19.35  170.800  1690.62  23.80   
2010-01-06  83.120000  1.01   3.85  1.48  19.16  170.800  1705.32  24.01   
2010-01-07  82.600000  1.03   3.85  1.44  19.06  170.800  1683.45  23.71   
2010-01-08  82.740000  0.96   3.83  1.41  18.13  170.800  1695.26  23.88   
2010-01-11  82.540000  0.95   3.85  1.47  17.55  170.800  1694.12  23.85   
...               ...   ...    ...   ...    ...      ...      ...    ...   
2024-06-21  81.986841  4.70   4.25  2.02  13.20  248.953  2784.26  18.66   
2024-06-24  82.724723  4.71   4.25  2.03  13.33  248.953  2764.73  18.52   
2024-06-25  82.062925  4.65   4.23  2.01  12.84  248.953  2774.39  18.59   
2024-06-26  82.120369  4.71   4.32  2.06  12.55  248.953  2792.05  18.71   
2024-06-27  82.810180  4.70   4.29  2.03  12.24  248.953  2784.06  18.66   

           

In [20]:
df.head()

Unnamed: 0_level_0,WTI,DGS2,DGS10,TIPS,VIX,PPI,코스피,per,pbr,원달러,...,발틱운임,scfi,미 소맥 선물,미 대두 선물,항공 여객,소비자심리지수,ICT 전망 BSI,IT 월별 수출현황,SOX,메모리 수출금액
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,81.74,1.01,3.77,1.43,19.35,170.8,1690.62,23.8,23.8,1140.5,...,3270.0,1072.58,552.63,1059.88,6210262.0,116.9,90,11722815985,366.4,1390919
2010-01-06,83.12,1.01,3.85,1.48,19.16,170.8,1705.32,24.01,24.01,1136.4,...,3259.0,1072.58,567.13,1058.13,6210262.0,116.9,90,11722815985,366.3,1390919
2010-01-07,82.6,1.03,3.85,1.44,19.06,170.8,1683.45,23.71,23.71,1135.4,...,3149.0,1106.28,556.63,1026.13,6210262.0,116.9,90,11722815985,362.3,1390919
2010-01-08,82.74,0.96,3.83,1.41,18.13,170.8,1695.26,23.88,23.88,1130.5,...,3140.0,1106.28,568.13,1019.38,6210262.0,116.9,90,11722815985,367.7,1390919
2010-01-11,82.54,0.95,3.85,1.47,17.55,170.8,1694.12,23.85,23.85,1119.8,...,3148.0,1106.28,574.25,1009.88,6210262.0,116.9,90,11288314689,366.6,1390919


In [21]:
df.info

<bound method DataFrame.info of                   WTI  DGS2  DGS10  TIPS    VIX      PPI      코스피    per  \
date                                                                       
2010-01-05  81.740000  1.01   3.77  1.43  19.35  170.800  1690.62  23.80   
2010-01-06  83.120000  1.01   3.85  1.48  19.16  170.800  1705.32  24.01   
2010-01-07  82.600000  1.03   3.85  1.44  19.06  170.800  1683.45  23.71   
2010-01-08  82.740000  0.96   3.83  1.41  18.13  170.800  1695.26  23.88   
2010-01-11  82.540000  0.95   3.85  1.47  17.55  170.800  1694.12  23.85   
...               ...   ...    ...   ...    ...      ...      ...    ...   
2024-06-21  81.986841  4.70   4.25  2.02  13.20  248.953  2784.26  18.66   
2024-06-24  82.724723  4.71   4.25  2.03  13.33  248.953  2764.73  18.52   
2024-06-25  82.062925  4.65   4.23  2.01  12.84  248.953  2774.39  18.59   
2024-06-26  82.120369  4.71   4.32  2.06  12.55  248.953  2792.05  18.71   
2024-06-27  82.810180  4.70   4.29  2.03  12.24  248.953

In [22]:
window_size_20=20

result_20 = calculate_trend_slope_daily(df, window_size_20)

new_df_20 = df.iloc[window_size_20-1:]
result_df_20 = pd.DataFrame(result_20)


result_df_20.index=new_df_20.index

print(result_df_20)


                 WTI      DGS2     DGS10      TIPS       VIX           PPI  \
date                                                                         
2010-02-02 -0.006935 -0.009921 -0.003012 -0.006833  0.017654 -1.199605e-17   
2010-02-03 -0.006354 -0.008857 -0.002775 -0.006683  0.016993 -1.199605e-17   
2010-02-04 -0.006123 -0.008868 -0.002612 -0.006136  0.018869 -1.199605e-17   
2010-02-05 -0.006175 -0.008676 -0.002470 -0.005264  0.020143 -1.199605e-17   
2010-02-08 -0.005834 -0.008923 -0.002190 -0.004272  0.020425 -1.199605e-17   
...              ...       ...       ...       ...       ...           ...   
2024-06-21  0.000383 -0.002082 -0.003088 -0.003343  0.000455 -5.821254e-04   
2024-06-24  0.000876 -0.002387 -0.003338 -0.003620  0.001771 -6.523171e-04   
2024-06-25  0.001389 -0.002940 -0.003838 -0.004345  0.001317 -7.070420e-04   
2024-06-26  0.002023 -0.003110 -0.003845 -0.004458 -0.000035 -7.462760e-04   
2024-06-27  0.002639 -0.003306 -0.003954 -0.004825 -0.001951 -7.

In [23]:
window_size_60=60
result_60 = calculate_trend_slope_daily(df, window_size_60)
new_df_60 = df.iloc[window_size_60-1:]
result_df_60 = pd.DataFrame(result_60)
result_df_60.index=new_df_60.index
print(result_df_60)


                 WTI      DGS2     DGS10      TIPS       VIX       PPI  \
date                                                                     
2010-04-01  0.001020  0.001886  0.000202  0.002811 -0.003742  0.000251   
2010-04-05  0.001237  0.002512  0.000351  0.003084 -0.004067  0.000245   
2010-04-06  0.001483  0.003055  0.000521  0.003383 -0.004470  0.000240   
2010-04-07  0.001698  0.003488  0.000649  0.003483 -0.004842  0.000233   
2010-04-08  0.001903  0.003838  0.000776  0.003575 -0.005295  0.000226   
...              ...       ...       ...       ...       ...       ...   
2024-06-21 -0.000746  0.000838  0.015951  0.001765 -0.002374  0.000292   
2024-06-24 -0.000805  0.000707  0.015265  0.001515 -0.002419  0.000252   
2024-06-25 -0.000882  0.000564  0.014560  0.001261 -0.002461  0.000212   
2024-06-26 -0.000967  0.000464  0.013885  0.001066 -0.002690  0.000171   
2024-06-27 -0.000997  0.000370  0.013192  0.000868 -0.002962  0.000128   

                 코스피       per       

In [24]:
merge_inner = pd.merge(df, result_df_20.add_suffix('_20'), left_index=True, right_index=True)
merge_inner = pd.merge(merge_inner, result_df_60.add_suffix('_60'), left_index=True, right_index=True)
print(merge_inner)
print(len(merge_inner.columns))

                  WTI  DGS2  DGS10  TIPS    VIX      PPI      코스피    per  \
date                                                                       
2010-04-01  84.530000  1.05   3.89  1.61  17.47  172.200  1719.17  24.13   
2010-04-05  86.360000  1.18   4.01  1.70  17.02  172.200  1724.99  24.22   
2010-04-06  86.540000  1.14   3.98  1.68  16.23  172.200  1726.09  24.24   
2010-04-07  85.640000  1.06   3.89  1.55  16.62  172.200  1726.60  24.26   
2010-04-08  85.170000  1.09   3.91  1.58  16.48  172.200  1733.78  24.42   
...               ...   ...    ...   ...    ...      ...      ...    ...   
2024-06-21  81.986841  4.70   4.25  2.02  13.20  248.953  2784.26  18.66   
2024-06-24  82.724723  4.71   4.25  2.03  13.33  248.953  2764.73  18.52   
2024-06-25  82.062925  4.65   4.23  2.01  12.84  248.953  2774.39  18.59   
2024-06-26  82.120369  4.71   4.32  2.06  12.55  248.953  2792.05  18.71   
2024-06-27  82.810180  4.70   4.29  2.03  12.24  248.953  2784.06  18.66   

           

In [25]:
# base_filename = r'C:\Users\siim2\sic_project_final\sic\auto_raw_20_60_{}.xlsx'

# # 해당 디렉토리에 이미 존재하는 파일들을 확인
# existing_files = os.listdir(r'C:\Users\siim2\sic_project_final\sic')
# if existing_files is True:
#     print("already")
# else:
#     print("none")

# # 가장 최신 버전 찾기
# latest_version = 0
# for filename in existing_files:
#     if filename.startswith('result_version_') and filename.endswith('.xlsx'):
#         version_number = int(filename.split('_')[2].split('.')[0])
#         if version_number > latest_version:
#             latest_version = version_number


In [26]:
import os

# 기본 파일 이름
base_filename = "Feature_data_base_20_60_1.xlsx"

# 숫자 초기화
i = 0

# 파일 이름이 존재하지 않을 때까지 반복
while os.path.exists(f"{base_filename[:-5]}_{i}.xlsx"):
    i += 1

# 새 파일 이름 생성
new_filename = f"{base_filename[:-5]}_{i}.xlsx"

# DataFrame을 Excel 파일로 저장
merge_inner.to_excel("Feature_data_base_20_60_final_06_28.xlsx", index=True)