In [91]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# 데이터 로드
#df = pd.read_csv('/Users/smooooth/Downloads/이상치모두제거.csv', index_col=0)
#df.drop(columns=['연차', 'BusinessNum'], inplace=True)

In [161]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = pd.read_csv(path + '원자료변수_이상치모두제거.csv', index_col=0)
df.drop(columns=['BusinessNum'], inplace=True)

In [162]:
df2 = pd.read_csv(path + '원자료변수_재무통합_전처리.csv', index_col=0)
df2 = df2.loc[df2['stYear']==2021.0]
df2.drop(columns=['BusinessNum'], inplace=True)
df2 = df2.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

In [144]:
df.columns

Index(['stYear', '당기순이익(손실)', '매출액', '영업이익', '유동부채', '유동자산', '자기자본', '총자산',
       '매출총이익', '매출원가', '매출채권', '종업원수', '대분류코드', '항목명', '성장단계', '주소'],
      dtype='object')

In [145]:
df2.columns

Index(['stYear', '당기순이익(손실)', '매출액', '영업이익', '유동부채', '유동자산', '자기자본', '총자산',
       '매출총이익', '매출원가', '매출채권', '종업원수', '대분류코드', '항목명', '성장단계', '주소'],
      dtype='object')

In [156]:
cond1 = df['stYear'] == 2021.0
df3 = df.loc[~cond1]


df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler(), RobustScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []
results2 = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_r2': train_r2,
                'test_r2': test_r2
            })

            # df2에 대한 성능 평가
            if target in df2.columns:
                X_df2 = df2.drop(columns=target)
                Y_df2 = df2[target]
                Y_df2_pred = model_pipe.predict(X_df2)
                df2_mse = mean_squared_error(Y_df2, Y_df2_pred)
                df2_r2 = r2_score(Y_df2, Y_df2_pred)


                results2.append({
                    'target': target,
                    'scaler': scaler.__class__.__name__,
                    'model': model_name,
                    'df2_mse': df2_mse,
                    'df2_r2': train_r2
                })

# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)
results2_df = pd.DataFrame(results2)

# int형으로 변환
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))
results2_df['df2_mse'] = results2_df['df2_mse'].apply(lambda x: int(float(x)))

In [55]:
# 파생변수
# results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse
0,매출액,StandardScaler,XGBoost,325679771972,2947917693579
1,매출액,StandardScaler,ElasticNet,4797079760252,4557023833415
2,매출액,StandardScaler,GradientBoostingRegressor,2466627963942,2783371933655
3,매출액,RobustScaler,XGBoost,331649322390,2894180982395
4,매출액,RobustScaler,ElasticNet,4706766581338,4403952335066
5,매출액,RobustScaler,GradientBoostingRegressor,2466627963942,2779794974869
6,영업이익,StandardScaler,XGBoost,1175861603,7567468298
7,영업이익,StandardScaler,ElasticNet,9799229720,9287005133
8,영업이익,StandardScaler,GradientBoostingRegressor,6280306325,7204909244
9,영업이익,RobustScaler,XGBoost,1175861603,7567468298


In [123]:
#원자료변수
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,139822445,6104208291,0.999862,0.993609
1,매출액,StandardScaler,ElasticNet,112421198797,107129207510,0.888942,0.887836
2,매출액,StandardScaler,GradientBoostingRegressor,1671032664,7868325844,0.998349,0.991762
3,매출액,RobustScaler,XGBoost,139822445,6104208451,0.999862,0.993609
4,매출액,RobustScaler,ElasticNet,60762129525,60384729398,0.939975,0.936777
5,매출액,RobustScaler,GradientBoostingRegressor,1671032664,7826101945,0.998349,0.991806
6,영업이익,StandardScaler,XGBoost,17373060,516356980,0.998779,0.961033
7,영업이익,StandardScaler,ElasticNet,6183543330,5802106692,0.565236,0.562139
8,영업이익,StandardScaler,GradientBoostingRegressor,441139188,696037883,0.968984,0.947473
9,영업이익,RobustScaler,XGBoost,17373060,516364043,0.998779,0.961032


In [157]:
#논문자료변수
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,769231931,20575367896,0.999276,0.980833
1,매출액,StandardScaler,ElasticNet,139942446173,143603716135,0.868308,0.866225
2,매출액,StandardScaler,GradientBoostingRegressor,11528827185,30238184615,0.989151,0.971832
3,매출액,RobustScaler,XGBoost,375416967,18806542489,0.999647,0.982481
4,매출액,RobustScaler,ElasticNet,138725159971,142012611716,0.869453,0.867708
5,매출액,RobustScaler,GradientBoostingRegressor,11528827185,30516392037,0.989151,0.971572
6,영업이익,StandardScaler,XGBoost,6202884108,27394583243,0.898901,0.431731
7,영업이익,StandardScaler,ElasticNet,49105126246,37966390091,0.199649,0.212431
8,영업이익,StandardScaler,GradientBoostingRegressor,24138845990,28227982644,0.606567,0.414443
9,영업이익,RobustScaler,XGBoost,6345715140,26665661158,0.896573,0.446852


In [98]:
# 원자료변수 + 이상치변환
#results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,380249872137,1620248440364,0.961342,0.832741
1,매출액,StandardScaler,ElasticNet,3617120716978,3642967103861,0.632267,0.623934
2,매출액,StandardScaler,GradientBoostingRegressor,1920606696534,2135985174275,0.804742,0.779501
3,매출액,RobustScaler,XGBoost,380249872137,1620305998924,0.961342,0.832735
4,매출액,RobustScaler,ElasticNet,3318051454728,3357491770874,0.662672,0.653404
5,매출액,RobustScaler,GradientBoostingRegressor,1920606696534,2136670644612,0.804742,0.77943
6,영업이익,StandardScaler,XGBoost,2517756618,6886207529,0.909564,0.749278
7,영업이익,StandardScaler,ElasticNet,21563573041,21599825213,0.225448,0.213565
8,영업이익,StandardScaler,GradientBoostingRegressor,11620922120,12366536739,0.582583,0.549743
9,영업이익,RobustScaler,XGBoost,2517756618,6885951567,0.909564,0.749287


In [124]:
# 원자료변수
results2_df

Unnamed: 0,target,scaler,model,df2_mse,df2_r2
0,매출액,StandardScaler,XGBoost,50746822199951000,0.999862
1,매출액,StandardScaler,ElasticNet,83869803129097248,0.888942
2,매출액,StandardScaler,GradientBoostingRegressor,50689235445846024,0.998349
3,매출액,RobustScaler,XGBoost,50746822019340384,0.999862
4,매출액,RobustScaler,ElasticNet,32956246967907276,0.939975
5,매출액,RobustScaler,GradientBoostingRegressor,50686480408195640,0.998349
6,영업이익,StandardScaler,XGBoost,2165282805274501,0.998779
7,영업이익,StandardScaler,ElasticNet,2100988563835355,0.565236
8,영업이익,StandardScaler,GradientBoostingRegressor,2165226281598264,0.968984
9,영업이익,RobustScaler,XGBoost,2165282804348882,0.998779


In [158]:
# 논문자료변수
results2_df

Unnamed: 0,target,scaler,model,df2_mse,df2_r2
0,매출액,StandardScaler,XGBoost,50803221458992744,0.999276
1,매출액,StandardScaler,ElasticNet,5429921268329194,0.868308
2,매출액,StandardScaler,GradientBoostingRegressor,50668197938699992,0.989151
3,매출액,RobustScaler,XGBoost,50814530039543544,0.999647
4,매출액,RobustScaler,ElasticNet,4955982120944182,0.869453
5,매출액,RobustScaler,GradientBoostingRegressor,50687514898672528,0.989151
6,영업이익,StandardScaler,XGBoost,2177350682412965,0.898901
7,영업이익,StandardScaler,ElasticNet,2502253504354956,0.199649
8,영업이익,StandardScaler,GradientBoostingRegressor,2176725897529438,0.606567
9,영업이익,RobustScaler,XGBoost,2179143419233644,0.896573


In [100]:
# 원자료변수 + 이상치변환
#results2_df

Unnamed: 0,target,scaler,model,df2_mse,df2_r2
0,매출액,StandardScaler,XGBoost,51237656119843616,0.961342
1,매출액,StandardScaler,ElasticNet,32440357949804632,0.632267
2,매출액,StandardScaler,GradientBoostingRegressor,50613531560042720,0.804742
3,매출액,RobustScaler,XGBoost,51237645493825832,0.961342
4,매출액,RobustScaler,ElasticNet,16325546210387058,0.662672
5,매출액,RobustScaler,GradientBoostingRegressor,50644526326526496,0.804742
6,영업이익,StandardScaler,XGBoost,2166788731919113,0.909564
7,영업이익,StandardScaler,ElasticNet,1615818576805678,0.225448
8,영업이익,StandardScaler,GradientBoostingRegressor,2165064195258085,0.582583
9,영업이익,RobustScaler,XGBoost,2166788718330880,0.909564


## 베스트 모델 선택 후 하이퍼 파라미터 튜닝

In [163]:
cond1 = df['stYear'] == 2021.0
df3 = df.loc[~cond1]

df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])
df2 = df2.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

In [164]:
# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

In [171]:
# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']
results = []

In [173]:
# 하이퍼파라미터 그리드 설정
param_grid = {
    'gradientboostingregressor__n_estimators': [50, 100, 200],
    'gradientboostingregressor__max_depth': [3, 4, 5, 6],
    'gradientboostingregressor__learning_rate': [0.01, 0.05, 0.1],
    'gradientboostingregressor__subsample': [0.7, 0.8, 0.9]
}

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
    category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
    prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                          (category_pipe, category_list))
        
    model_pipe = make_pipeline(prepro_pipe, GradientBoostingRegressor(random_state=42))
    
    grid_search = GridSearchCV(model_pipe, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, Y_train)
    
    best_model = grid_search.best_estimator_
    
    Y_train_pred = best_model.predict(X_train)
    Y_test_pred = best_model.predict(X_test)
    
    train_mse = mean_squared_error(Y_train, Y_train_pred)
    test_mse = mean_squared_error(Y_test, Y_test_pred)
    train_r2 = r2_score(Y_train, Y_train_pred)
    test_r2 = r2_score(Y_test, Y_test_pred)
    
    results.append({
        'target': target,
        'train_mse': train_mse,
        'test_mse': test_mse,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'best_params': grid_search.best_params_        
    })

In [174]:
results_df = pd.DataFrame(results)
results_df
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))
results_df

Unnamed: 0,target,train_mse,test_mse,train_r2,test_r2,best_params
0,매출액,314722315,7670183339,0.999689,0.991969,{'gradientboostingregressor__learning_rate': 0...
1,영업이익,13585743,370913514,0.999045,0.972009,{'gradientboostingregressor__learning_rate': 0...
2,당기순이익(손실),308052691,1402688244,0.973616,0.859579,{'gradientboostingregressor__learning_rate': 0...


In [167]:
results_df = pd.DataFrame(results)
results_df
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))
results_df

Unnamed: 0,target,train_mse,test_mse,train_r2,test_r2,best_params
0,매출액,504331824,6908128118,0.999502,0.992767,"{'xgbregressor__colsample_bytree': 0.9, 'xgbre..."
1,영업이익,33266178,403474849,0.997661,0.969551,"{'xgbregressor__colsample_bytree': 0.9, 'xgbre..."
2,당기순이익(손실),360436132,1451241022,0.969129,0.854719,"{'xgbregressor__colsample_bytree': 0.9, 'xgbre..."


In [141]:
for i in results_df['best_params'] :
    print(i)

{'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 300, 'xgbregressor__subsample': 0.8}
{'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 300, 'xgbregressor__subsample': 0.6}
{'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 7, 'xgbregressor__n_estimators': 300, 'xgbregressor__subsample': 0.6}


In [179]:
results2 = []

In [180]:
for target in targets:
    X_df2 = df2.drop(columns=target)
    Y_df2 = df2[target]
    Y_df2_pred = best_model.predict(X_df2)
    df2_mse = mean_squared_error(Y_df2, Y_df2_pred)
    df2_r2 = r2_score(Y_df2, Y_df2_pred)


    results2.append({
        'target': target,
        'scaler': scaler.__class__.__name__,
        'model': model_name,
        'df2_mse': df2_mse,
        'df2_r2': train_r2 })

In [181]:
results_df2 = pd.DataFrame(results2)
results_df2
results_df2['df2_mse'] = results_df2['df2_mse'].apply(lambda x: int(float(x)))
results_df2

Unnamed: 0,target,scaler,model,df2_mse,df2_r2
0,매출액,RobustScaler,GradientBoostingRegressor,51098253117395296,0.973616
1,영업이익,RobustScaler,GradientBoostingRegressor,2165825058782721,0.973616
2,당기순이익(손실),RobustScaler,GradientBoostingRegressor,1474310239634679,0.973616
