In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

In [3]:
# 데이터 로드
#df = pd.read_csv('/Users/smooooth/Downloads/이상치모두제거.csv', index_col=0)
#df.drop(columns=['연차', 'BusinessNum'], inplace=True)

# 영업이익 모델

In [45]:
# Load the 영업이익 dataset, drop the BusinessNum column, and keep only the
# rows where the target column (영업이익) is present.
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = (
    pd.read_csv(path + '영업이익용_이상치모두제거.csv', index_col=0)
    .drop(columns=['BusinessNum'])
    .dropna(subset=['영업이익'])
)

In [46]:
# Compare every (scaler, model) combination for one target column.
# Rows with stYear == 2021 are held out as df2; all other rows (df3) are
# split randomly into train/test.
target = '영업이익'

cond1 = df['stYear'] == 2021.0
# Rows without a target value cannot be scored, so drop them from both parts.
df3 = df.loc[~cond1].dropna(subset=[target])
df2 = df.loc[cond1].dropna(subset=[target])

# Feature column lists by dtype (the target is excluded from the numeric features)
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop([target])

# Scalers to compare
scalers = [StandardScaler(), RobustScaler()]

# Models to compare. The tree ensembles are seeded (same seed as the tuning
# cell) so that re-running this cell reproduces the same scores.
models = {
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}


results = []   # train/test MSE per (scaler, model)
results2 = []  # MSE on the 2021 hold-out per (scaler, model)


Y = df3[target]
X = df3.drop(columns=target)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# Hold-out features/target do not change inside the loops, so build them once.
X_df2 = df2.drop(columns=target)
Y_df2 = df2[target]

for scaler in scalers:
    numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
    category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

    prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                          (category_pipe, category_list))

    for model_name, model in models.items():
        model_pipe = make_pipeline(prepro_pipe, model)

        model_pipe.fit(X_train, Y_train)

        Y_train_pred = model_pipe.predict(X_train)
        Y_test_pred = model_pipe.predict(X_test)

        train_mse = mean_squared_error(Y_train, Y_train_pred)
        test_mse = mean_squared_error(Y_test, Y_test_pred)

        results.append({
            'target': target,
            'scaler': scaler.__class__.__name__,
            'model': model_name,
            'train_mse': train_mse,
            'test_mse': test_mse
        })

        # Score the 2021 hold-out; skipped when there are no 2021 rows,
        # because predicting on an empty frame would raise.
        if not df2.empty:
            Y_df2_pred = model_pipe.predict(X_df2)
            df2_mse = mean_squared_error(Y_df2, Y_df2_pred)

            results2.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'df2_mse': df2_mse
            })

# Collect the results into DataFrames. The explicit column list keeps the
# 'df2_mse' column available even when the hold-out set was empty.
results_df = pd.DataFrame(results)
results2_df = pd.DataFrame(results2, columns=['target', 'scaler', 'model', 'df2_mse'])

In [47]:
# Truncate the float MSE values to whole numbers so the table is easier to read.
for mse_col in ('train_mse', 'test_mse'):
    results_df[mse_col] = results_df[mse_col].apply(lambda mse: int(float(mse)))

In [48]:
# Display the train/test MSE table for every (scaler, model) combination.
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse
0,영업이익,StandardScaler,XGBoost,16902863,418346432
1,영업이익,StandardScaler,ElasticNet,1866396709,2199877461
2,영업이익,StandardScaler,GradientBoostingRegressor,310963488,499642840
3,영업이익,RobustScaler,XGBoost,16902863,418343118
4,영업이익,RobustScaler,ElasticNet,1649976514,1989067939
5,영업이익,RobustScaler,GradientBoostingRegressor,310963488,499674831


In [49]:
# Truncate the hold-out MSE values to whole numbers for display.
df2_mse_as_int = results2_df['df2_mse'].apply(lambda mse: int(float(mse)))
results2_df['df2_mse'] = df2_mse_as_int

In [50]:
# Display the MSE on the stYear == 2021 hold-out (df2) for every (scaler, model) combination.
results2_df

Unnamed: 0,target,scaler,model,df2_mse
0,영업이익,StandardScaler,XGBoost,456480335
1,영업이익,StandardScaler,ElasticNet,2518612044
2,영업이익,StandardScaler,GradientBoostingRegressor,534103340
3,영업이익,RobustScaler,XGBoost,456488532
4,영업이익,RobustScaler,ElasticNet,2233752245
5,영업이익,RobustScaler,GradientBoostingRegressor,536842519


### 베스트 모델 선택 후 하이퍼 파라미터 튜닝

In [5]:
# Rows with stYear == 2021 go to df2; every other row goes to df3,
# restricted to rows that have a 영업이익 value.
cond1 = df['stYear'].eq(2021.0)
df2 = df[cond1]
df3 = df[~cond1].dropna(subset=['영업이익'])

# Column lists by dtype: object columns are treated as categorical, number
# columns as numeric features (with the target column removed).
category_list = df3.select_dtypes(include='object').columns
numeric_list = df3.select_dtypes(include=np.number).columns.drop('영업이익')

In [6]:
# Accumulator for the tuning results, and the list of target variables.
results = list()
targets = [
    '매출액',
    '영업이익',
    '당기순이익(손실)',
]

In [7]:
# Hyperparameter grid. Each key is prefixed with 'xgbregressor__' so that it
# is routed to the XGBRegressor step of the pipeline built below.
param_grid = {
    'xgbregressor__n_estimators': [100, 200, 300],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__learning_rate': [0.01, 0.1, 0.3],
    'xgbregressor__subsample': [0.6, 0.8, 1.0],
    'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
}

# Separate features from the target and make a seeded random train/test split.
X = df3.drop(columns='영업이익')
Y = df3['영업이익']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# Preprocessing: median imputation + standard scaling for numeric columns,
# most-frequent imputation + one-hot encoding for categorical columns.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)
category_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)
prepro_pipe = make_column_transformer(
    (numeric_pipe, numeric_list),
    (category_pipe, category_list),
)

model_pipe = make_pipeline(prepro_pipe, xgb.XGBRegressor(random_state=42))

# 3-fold grid search over the whole pipeline, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=model_pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_

# Score the best estimator found by the search on both splits.
Y_train_pred = best_model.predict(X_train)
train_mse = mean_squared_error(Y_train, Y_train_pred)

Y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(Y_test, Y_test_pred)

tuning_record = {
    'target': '영업이익',
    'train_mse': train_mse,
    'test_mse': test_mse,
    'best_params': grid_search.best_params_,
}
results.append(tuning_record)

매출액	2,466,627,963,942	2,783,371,933,655  파생변수 칼럼통일 하이퍼파라미터전 이상치제거
       1,596,093,496,266   2,669,083,067,718  파생변수 칼럼통일 하이퍼파라미터후 이상치제거
              92,126,937	   1,277,428,827  원변수   칼럼따로 이상치제거
             139,822,445	   6,104,208,291  원변수   칼럼통일 이상치제거
             139,822,445	 6104208291	0.999862	0.993609
             103,800,684	   6,397,147,945  원변수   칼럼통일 이상치제거 하이퍼파라미터
             769,231,931    	  20,575,367,896  논문변수 칼럼통일 이상치제거
         380,249,872,137	   1,620,248,440,364  원변수   칼럼통일 이상치변환
          27,619,746,687	410,726,268,745	   0.999046	0.986168
       

영업이익 6,280,306,325	7,204,909,244
         4,357,454,501	6,966,769,559
            16,902,863	  418,346,432
            17,373,060	  516,356,980
            21,180,373	  356,758,515
         6,202,884,108 27,394,583,243
         2,517,756,618	6,886,207,529	


당기순이익(손실) 3,974,955,742	 4,961,512,963
                 1,724,069,869	4,390,779,966
                 1,469,786,218	21,700,909,392
                 129,605,633	1,625,527,362
                  55,651,933    1,492,895,149
                 5,762,994,439	26,699,366,862
                 3,948,226,257	9,706,814,963	

In [10]:
# Build the tuning results table and truncate the float MSE values to whole
# numbers for readability; the frame is displayed as the cell's last expression.
results_df = pd.DataFrame(results)
for mse_col in ('train_mse', 'test_mse'):
    results_df[mse_col] = results_df[mse_col].apply(lambda mse: int(float(mse)))
results_df

Unnamed: 0,target,train_mse,test_mse,best_params
0,매출액,1596093496266,2669083067718,"{'xgbregressor__colsample_bytree': 0.8, 'xgbre..."
1,영업이익,4357454501,6966769559,"{'xgbregressor__colsample_bytree': 1.0, 'xgbre..."
2,당기순이익(손실),1724069869,4390779966,"{'xgbregressor__colsample_bytree': 1.0, 'xgbre..."


In [13]:
# Print each best-parameter dict on its own line (the table view truncates them).
for best_params in results_df['best_params'].tolist():
    print(best_params)

{'xgbregressor__colsample_bytree': 0.8, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 1.0}
{'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 0.8}
{'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 200, 'xgbregressor__subsample': 0.6}


# 당기순이익 모델

In [39]:
# Load the 당기순이익 dataset, drop the BusinessNum column, and keep only the
# rows where the target column (당기순이익(손실)) is present.
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = (
    pd.read_csv(path + '당기순이익_이상치모두제거.csv', index_col=0)
    .drop(columns=['BusinessNum'])
    .dropna(subset=['당기순이익(손실)'])
)

In [40]:
# Compare every (scaler, model) combination for one target column.
# Rows with stYear == 2021 are held out as df2; all other rows (df3) are
# split randomly into train/test.
target = '당기순이익(손실)'

cond1 = df['stYear'] == 2021.0
# Rows without a target value cannot be scored, so drop them from both parts.
df3 = df.loc[~cond1].dropna(subset=[target])
df2 = df.loc[cond1].dropna(subset=[target])

# Feature column lists by dtype (the target is excluded from the numeric features)
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop([target])

# Scalers to compare
scalers = [StandardScaler(), RobustScaler()]

# Models to compare. The tree ensembles are seeded so that re-running this
# cell reproduces the same scores.
models = {
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}


results = []   # train/test MSE per (scaler, model)
results2 = []  # MSE on the 2021 hold-out per (scaler, model)


Y = df3[target]
X = df3.drop(columns=target)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# Hold-out features/target do not change inside the loops, so build them once.
X_df2 = df2.drop(columns=target)
Y_df2 = df2[target]

for scaler in scalers:
    numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
    category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

    prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                          (category_pipe, category_list))

    for model_name, model in models.items():
        model_pipe = make_pipeline(prepro_pipe, model)

        model_pipe.fit(X_train, Y_train)

        Y_train_pred = model_pipe.predict(X_train)
        Y_test_pred = model_pipe.predict(X_test)

        train_mse = mean_squared_error(Y_train, Y_train_pred)
        test_mse = mean_squared_error(Y_test, Y_test_pred)

        results.append({
            'target': target,
            'scaler': scaler.__class__.__name__,
            'model': model_name,
            'train_mse': train_mse,
            'test_mse': test_mse
        })

        # Score the 2021 hold-out; skipped when there are no 2021 rows,
        # because predicting on an empty frame would raise.
        if not df2.empty:
            Y_df2_pred = model_pipe.predict(X_df2)
            df2_mse = mean_squared_error(Y_df2, Y_df2_pred)

            results2.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'df2_mse': df2_mse
            })

# Collect the results into DataFrames. The explicit column list keeps the
# 'df2_mse' column available even when the hold-out set was empty.
results_df = pd.DataFrame(results)
results2_df = pd.DataFrame(results2, columns=['target', 'scaler', 'model', 'df2_mse'])

In [41]:
# Truncate the float MSE values to whole numbers so the table is easier to read.
for mse_col in ('train_mse', 'test_mse'):
    results_df[mse_col] = results_df[mse_col].apply(lambda mse: int(float(mse)))

In [42]:
# Display the train/test MSE table for every (scaler, model) combination.
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse
0,당기순이익(손실),StandardScaler,XGBoost,1469786218,21700909392
1,당기순이익(손실),StandardScaler,ElasticNet,12651547052,21569568377
2,당기순이익(손실),StandardScaler,GradientBoostingRegressor,5425475143,20635213994
3,당기순이익(손실),RobustScaler,XGBoost,1395166233,21839358355
4,당기순이익(손실),RobustScaler,ElasticNet,11808287896,20751347288
5,당기순이익(손실),RobustScaler,GradientBoostingRegressor,5425475143,20615116812


In [43]:
# Truncate the hold-out MSE values to whole numbers for display.
df2_mse_as_int = results2_df['df2_mse'].apply(lambda mse: int(float(mse)))
results2_df['df2_mse'] = df2_mse_as_int

In [44]:
# Display the MSE on the stYear == 2021 hold-out (df2) for every (scaler, model) combination.
results2_df

Unnamed: 0,target,scaler,model,df2_mse
0,당기순이익(손실),StandardScaler,XGBoost,16573510971
1,당기순이익(손실),StandardScaler,ElasticNet,17665584986
2,당기순이익(손실),StandardScaler,GradientBoostingRegressor,16938160879
3,당기순이익(손실),RobustScaler,XGBoost,16322931837
4,당기순이익(손실),RobustScaler,ElasticNet,16751548235
5,당기순이익(손실),RobustScaler,GradientBoostingRegressor,16905372762


# 매출액 모델

In [34]:
# Load the 매출액 dataset, drop the BusinessNum column, and keep only the
# rows where the target column (매출액) is present.
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = (
    pd.read_csv(path + '매출액_이상치모두제거.csv', index_col=0)
    .drop(columns=['BusinessNum'])
    .dropna(subset=['매출액'])
)

In [35]:
# Compare every (scaler, model) combination for one target column.
# Rows with stYear == 2021 are held out as df2; all other rows (df3) are
# split randomly into train/test.
target = '매출액'

cond1 = df['stYear'] == 2021.0
# Rows without a target value cannot be scored, so drop them from both parts.
df3 = df.loc[~cond1].dropna(subset=[target])
df2 = df.loc[cond1].dropna(subset=[target])

# Feature column lists by dtype (the target is excluded from the numeric features)
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop([target])

# Scalers to compare
scalers = [StandardScaler(), RobustScaler()]

# Models to compare. The tree ensembles are seeded so that re-running this
# cell reproduces the same scores.
models = {
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}


results = []   # train/test MSE per (scaler, model)
results2 = []  # MSE on the 2021 hold-out per (scaler, model)


Y = df3[target]
X = df3.drop(columns=target)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

# Hold-out features/target do not change inside the loops, so build them once.
X_df2 = df2.drop(columns=target)
Y_df2 = df2[target]

for scaler in scalers:
    numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
    category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

    prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                          (category_pipe, category_list))

    for model_name, model in models.items():
        model_pipe = make_pipeline(prepro_pipe, model)

        model_pipe.fit(X_train, Y_train)

        Y_train_pred = model_pipe.predict(X_train)
        Y_test_pred = model_pipe.predict(X_test)

        train_mse = mean_squared_error(Y_train, Y_train_pred)
        test_mse = mean_squared_error(Y_test, Y_test_pred)

        results.append({
            'target': target,
            'scaler': scaler.__class__.__name__,
            'model': model_name,
            'train_mse': train_mse,
            'test_mse': test_mse
        })

        # Score the 2021 hold-out; skipped when there are no 2021 rows,
        # because predicting on an empty frame would raise.
        if not df2.empty:
            Y_df2_pred = model_pipe.predict(X_df2)
            df2_mse = mean_squared_error(Y_df2, Y_df2_pred)

            results2.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'df2_mse': df2_mse
            })

# Collect the results into DataFrames. The explicit column list keeps the
# 'df2_mse' column available even when the hold-out set was empty.
results_df = pd.DataFrame(results)
results2_df = pd.DataFrame(results2, columns=['target', 'scaler', 'model', 'df2_mse'])

In [36]:
# Truncate the float MSE values to whole numbers so the table is easier to read.
for mse_col in ('train_mse', 'test_mse'):
    results_df[mse_col] = results_df[mse_col].apply(lambda mse: int(float(mse)))

In [37]:
# Display the train/test MSE table for every (scaler, model) combination.
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse
0,매출액,StandardScaler,XGBoost,92126937,1277428827
1,매출액,StandardScaler,ElasticNet,66196060102,59775777498
2,매출액,StandardScaler,GradientBoostingRegressor,1638273322,1646668470
3,매출액,RobustScaler,XGBoost,92126937,1280370526
4,매출액,RobustScaler,ElasticNet,49260869294,44556669547
5,매출액,RobustScaler,GradientBoostingRegressor,1638273322,1664658623


In [27]:
# Truncate the hold-out MSE values to whole numbers for display.
df2_mse_as_int = results2_df['df2_mse'].apply(lambda mse: int(float(mse)))
results2_df['df2_mse'] = df2_mse_as_int

In [28]:
# Display the MSE on the stYear == 2021 hold-out (df2) for every (scaler, model) combination.
# NOTE(review): the saved output under this cell lists target '영업이익' even though
# this is the 매출액 section, and this cell's execution count is lower than that of
# the 매출액 training cell above. The output looks stale; re-run the section in
# order to refresh it (TODO confirm).
results2_df

Unnamed: 0,target,scaler,model,df2_mse
0,영업이익,StandardScaler,XGBoost,456480335
1,영업이익,StandardScaler,ElasticNet,2518612044
2,영업이익,StandardScaler,GradientBoostingRegressor,535029889
3,영업이익,RobustScaler,XGBoost,456488532
4,영업이익,RobustScaler,ElasticNet,2233752245
5,영업이익,RobustScaler,GradientBoostingRegressor,533532396
