In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# 데이터 로드
#df = pd.read_csv('/Users/smooooth/Downloads/이상치모두제거.csv', index_col=0)
#df.drop(columns=['연차', 'BusinessNum'], inplace=True)

# 이상치제거

### 원변수

In [14]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치제거_원변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))

In [15]:
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,57675981,1061958267,0.999884,0.997908
1,매출액,StandardScaler,ElasticNet,58707654224,60337562003,0.88144,0.881121
2,매출액,StandardScaler,GradientBoostingRegressor,799495006,2478386001,0.998385,0.995117
3,영업이익,StandardScaler,XGBoost,9390721,131515328,0.998514,0.980254
4,영업이익,StandardScaler,ElasticNet,2972236068,3213781634,0.52974,0.517478
5,영업이익,StandardScaler,GradientBoostingRegressor,140217178,270472216,0.977815,0.959391
6,당기순이익(손실),StandardScaler,XGBoost,66306033,624123771,0.986968,0.883146
7,당기순이익(손실),StandardScaler,ElasticNet,2548299297,2745883496,0.499133,0.485889
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,516514616,740254375,0.898479,0.861402


### 논문변수

In [16]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치제거_논문변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))

In [17]:
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,276504346562,878326282653,0.856287,0.491167
1,매출액,StandardScaler,ElasticNet,1110264157679,1010589803501,0.42294,0.414544
2,매출액,StandardScaler,GradientBoostingRegressor,838305996399,905813433318,0.564291,0.475243
3,영업이익,StandardScaler,XGBoost,3251051943,6852905992,0.638964,0.218378
4,영업이익,StandardScaler,ElasticNet,7556999522,7371834489,0.160781,0.15919
5,영업이익,StandardScaler,GradientBoostingRegressor,6451159713,6673934808,0.283586,0.238791
6,당기순이익(손실),StandardScaler,XGBoost,2551754223,5180221678,0.647674,0.255559
7,당기순이익(손실),StandardScaler,ElasticNet,5975301200,5738332922,0.174978,0.175354
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,4984172190,5058504306,0.311825,0.273051


### 상관관계변수

In [18]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치제거_상관관계변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))

In [19]:
results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,56753668,2339405803,0.999888,0.995151
1,매출액,StandardScaler,ElasticNet,60095599903,60835759421,0.881459,0.873895
2,매출액,StandardScaler,GradientBoostingRegressor,727248892,2223806537,0.998565,0.99539
3,영업이익,StandardScaler,XGBoost,10220934,126155911,0.998463,0.980105
4,영업이익,StandardScaler,ElasticNet,3163347408,3018347511,0.524199,0.523993
5,영업이익,StandardScaler,GradientBoostingRegressor,144247627,276776378,0.978304,0.956351
6,당기순이익(손실),StandardScaler,XGBoost,72324475,654431054,0.986405,0.868637
7,당기순이익(손실),StandardScaler,ElasticNet,2676304899,2530783888,0.496929,0.491998
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,566194788,773797246,0.893571,0.844676


# 원변수사용

### 이상치 3행이상 제거

In [20]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치3행이상_원변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))

results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,2317279869,22748772702,0.999705,0.997228
1,매출액,StandardScaler,ElasticNet,863844446707,813415816666,0.890127,0.900882
2,매출액,StandardScaler,GradientBoostingRegressor,48895109351,28188538323,0.993781,0.996565
3,영업이익,StandardScaler,XGBoost,290384577,2077101772,0.996061,0.968418
4,영업이익,StandardScaler,ElasticNet,50203407913,43482881208,0.31905,0.338843
5,영업이익,StandardScaler,GradientBoostingRegressor,6803885242,5565056733,0.907713,0.915383
6,당기순이익(손실),StandardScaler,XGBoost,1916847466,10051630494,0.96946,0.822724
7,당기순이익(손실),StandardScaler,ElasticNet,43883303701,38919150261,0.300836,0.313601
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,11690436650,12979708610,0.813744,0.771083


### 이상치 bound 변환

In [21]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치bound_원변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_mse'] = results_df['train_mse'].apply(lambda x: int(float(x)))
results_df['test_mse'] = results_df['test_mse'].apply(lambda x: int(float(x)))

results_df

Unnamed: 0,target,scaler,model,train_mse,test_mse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,42700268496,302426047979,0.998556,0.989666
1,매출액,StandardScaler,ElasticNet,3083156412491,2955801160696,0.895753,0.899004
2,매출액,StandardScaler,GradientBoostingRegressor,406303023134,430223946558,0.986262,0.9853
3,영업이익,StandardScaler,XGBoost,4773759258,13965773622,0.948929,0.849242
4,영업이익,StandardScaler,ElasticNet,49965476426,49727403759,0.465459,0.463203
5,영업이익,StandardScaler,GradientBoostingRegressor,19711650924,20872325934,0.789121,0.774688
6,당기순이익(손실),StandardScaler,XGBoost,8827614590,21098940908,0.883739,0.720074
7,당기순이익(손실),StandardScaler,ElasticNet,41893711130,42704386365,0.448255,0.433429
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,22918375498,24152068527,0.698163,0.679568


# 모델 확인 및 2021년 예측값  RMSE 확인

In [54]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = pd.read_csv(path + '이상치제거_원변수.csv', index_col=0)
df.drop(columns=['BusinessNum', 'stYear'], inplace=True)
df = df.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

df2 = pd.read_csv(path + '2022_예측데이터.csv')
df2.drop(columns=['stYear'], inplace=True)

# NaN, infinity, 또는 너무 큰 값 처리 함수 정의
def preprocess_data(df):
    # 무한대 값을 NaN으로 변환
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # 숫자형 열에 대해서만 너무 큰 값(예: 1e10 이상)을 NaN으로 변환
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].applymap(lambda x: np.nan if x > 1e10 else x)
    return df

# df와 df2 데이터 전처리
df = preprocess_data(df)
df2 = preprocess_data(df2)

# 카테고리와 숫자형 변수 리스트 생성
category_list = df.select_dtypes(include=['object']).columns
numeric_list = df.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 전처리 파이프라인 정의
numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                      (category_pipe, category_list))

# 전처리 파이프라인을 학습 데이터에 대해 fit 및 transform
X = df.drop(columns=['매출액', '영업이익', '당기순이익(손실)'])
prepro_pipe.fit(X)
X_transformed = prepro_pipe.transform(X)

# df2 전처리
X_new = df2.drop(columns=['BusinessNum'])
X_new_transformed = prepro_pipe.transform(X_new)

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []
predictions = df2[['BusinessNum']].copy()

for target in targets:
    Y = df[target]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_transformed, Y, random_state=42)
    
    # 모델 정의
    model = xgb.XGBRegressor()
    
    # 모델 학습
    model.fit(X_train, Y_train)

    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    train_mse = mean_squared_error(Y_train, Y_train_pred)
    test_mse = mean_squared_error(Y_test, Y_test_pred)
    train_r2 = r2_score(Y_train, Y_train_pred)
    test_r2 = r2_score(Y_test, Y_test_pred)

    results.append({
        'target': target,
        'train_mse': train_mse,
        'test_mse': test_mse,
        'train_r2': train_r2,
        'test_r2': test_r2
    })
    
    # 예측값을 predictions 데이터프레임에 추가
    df2_predictions = model.predict(X_new_transformed)
    predictions[f'{target}_pred'] = df2_predictions
    
    # df2에 실제 값이 존재하는 경우
    if target in df2.columns:
        Y_new = df2[target]
        
        # NaN, infinity, 또는 너무 큰 값 처리
        Y_new = Y_new.replace([np.inf, -np.inf], np.nan).fillna(0)
        df2_predictions = np.where(np.isinf(df2_predictions) | np.isnan(df2_predictions), 0, df2_predictions)
        
        # df2에 대한 MSE와 R2 점수 계산
        new_mse = mean_squared_error(Y_new, df2_predictions)
        new_r2 = r2_score(Y_new, df2_predictions)
        
        results.append({
            'target': target,
            'new_mse': new_mse,
            'new_r2': new_r2
        })

# 결과 출력
results_df = pd.DataFrame(results)
print(results_df)
print(predictions.head())

      target     train_mse      test_mse  train_r2   test_r2       new_mse  \
0        매출액  5.229117e+07  1.474608e+09  0.999894  0.997095           NaN   
1        매출액           NaN           NaN       NaN       NaN  1.215556e+17   
2       영업이익  9.069707e+06  1.321075e+08  0.998565  0.980165           NaN   
3       영업이익           NaN           NaN       NaN       NaN  1.905565e+15   
4  당기순이익(손실)  6.518514e+07  6.086597e+08  0.987188  0.886041           NaN   
5  당기순이익(손실)           NaN           NaN       NaN       NaN  1.629132e+15   

     new_r2  
0       NaN  
1 -0.006045  
2       NaN  
3 -0.001791  
4       NaN  
5 -0.000321  
   BusinessNum       매출액_pred     영업이익_pred  당기순이익(손실)_pred
0   1010109319  174344.265625  11237.064453    16760.875000
1   1010204456  956111.812500  56336.636719    52047.363281
2   1010600385  190726.281250  20659.576172    28317.880859
3   1010607727  344559.750000  41499.949219    42662.675781
4   1010709848  190726.281250  20659.576172    28317.88

In [2]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = pd.read_csv(path + '이상치3행이상_원변수.csv', index_col=0)
df.drop(columns=['BusinessNum', 'stYear'], inplace=True)
df = df.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

df2 = pd.read_csv(path + '2022_예측데이터.csv')
df2.drop(columns=['stYear'], inplace=True)

# NaN, infinity, 또는 너무 큰 값 처리 함수 정의
def preprocess_data(df):
    # 무한대 값을 NaN으로 변환
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # 숫자형 열에 대해서만 너무 큰 값(예: 1e10 이상)을 NaN으로 변환
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].applymap(lambda x: np.nan if x > 1e10 else x)
    return df

# df와 df2 데이터 전처리
df = preprocess_data(df)
df2 = preprocess_data(df2)

# 카테고리와 숫자형 변수 리스트 생성
category_list = df.select_dtypes(include=['object']).columns
numeric_list = df.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 전처리 파이프라인 정의
numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                      (category_pipe, category_list))

# 전처리 파이프라인을 학습 데이터에 대해 fit 및 transform
X = df.drop(columns=['매출액', '영업이익', '당기순이익(손실)'])
prepro_pipe.fit(X)
X_transformed = prepro_pipe.transform(X)

# df2 전처리
X_new = df2.drop(columns=['BusinessNum'])
X_new_transformed = prepro_pipe.transform(X_new)

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []
predictions = df2[['BusinessNum']].copy()

for target in targets:
    Y = df[target]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_transformed, Y, random_state=42)
    
    # 모델 정의
    model = xgb.XGBRegressor()
    
    # 모델 학습
    model.fit(X_train, Y_train)

    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    train_mse = mean_squared_error(Y_train, Y_train_pred)
    test_mse = mean_squared_error(Y_test, Y_test_pred)
    train_r2 = r2_score(Y_train, Y_train_pred)
    test_r2 = r2_score(Y_test, Y_test_pred)

    results.append({
        'target': target,
        'train_rmse': np.sqrt(train_mse),
        'test_rmse': np.sqrt(test_mse),
        'train_r2': train_r2,
        'test_r2': test_r2
    })
    
    # 예측값을 predictions 데이터프레임에 추가
    df2_predictions = model.predict(X_new_transformed)
    predictions[f'{target}_pred'] = df2_predictions
    
    # df2에 실제 값이 존재하는 경우
    if target in df2.columns:
        Y_new = df2[target]
        
        # NaN, infinity, 또는 너무 큰 값 처리
        Y_new = Y_new.replace([np.inf, -np.inf], np.nan).fillna(0)
        df2_predictions = np.where(np.isinf(df2_predictions) | np.isnan(df2_predictions), 0, df2_predictions)
        
        # df2에 대한 MSE와 R2 점수 계산
        new_mse = mean_squared_error(Y_new, df2_predictions)
        new_r2 = r2_score(Y_new, df2_predictions)
        
        results.append({
            'target': target,
            'new_rmse': np.sqrt(new_mse),
            'new_r2': new_r2
        })

# 결과 출력
results_df = pd.DataFrame(results)
print(results_df)
print(predictions.head())

      target    train_rmse      test_rmse  train_r2   test_r2      new_rmse  \
0        매출액  49428.533977  164728.518796  0.999689  0.996693           NaN   
1        매출액           NaN            NaN       NaN       NaN  3.458631e+08   
2       영업이익  16221.789345   46214.897527  0.996431  0.967525           NaN   
3       영업이익           NaN            NaN       NaN       NaN  4.363628e+07   
4  당기순이익(손실)  42543.663175   99657.013807  0.971163  0.824842           NaN   
5  당기순이익(손실)           NaN            NaN       NaN       NaN  4.035716e+07   

     new_r2  
0       NaN  
1  0.009964  
2       NaN  
3 -0.001034  
4       NaN  
5 -0.000056  
   BusinessNum      매출액_pred     영업이익_pred  당기순이익(손실)_pred
0   1010109319  1.592396e+05  20936.533203    -4636.151367
1   1010204456  9.673761e+05  62721.511719    45126.843750
2   1010600385  1.034933e+06  41344.503906    33452.093750
3   1010607727  3.386771e+05  30907.162109    32278.207031
4   1010709848  1.034933e+06  41344.503906    33452.0

In [3]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = pd.read_csv(path + '이상치bound_원변수.csv', index_col=0)
df.drop(columns=['BusinessNum', 'stYear'], inplace=True)
df = df.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

df2 = pd.read_csv(path + '2022_예측데이터.csv')
df2.drop(columns=['stYear'], inplace=True)

# NaN, infinity, 또는 너무 큰 값 처리 함수 정의
def preprocess_data(df):
    # 무한대 값을 NaN으로 변환
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # 숫자형 열에 대해서만 너무 큰 값(예: 1e10 이상)을 NaN으로 변환
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].applymap(lambda x: np.nan if x > 1e10 else x)
    return df

# df와 df2 데이터 전처리
df = preprocess_data(df)
df2 = preprocess_data(df2)

# 카테고리와 숫자형 변수 리스트 생성
category_list = df.select_dtypes(include=['object']).columns
numeric_list = df.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 전처리 파이프라인 정의
numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                      (category_pipe, category_list))

# 전처리 파이프라인을 학습 데이터에 대해 fit 및 transform
X = df.drop(columns=['매출액', '영업이익', '당기순이익(손실)'])
prepro_pipe.fit(X)
X_transformed = prepro_pipe.transform(X)

# df2 전처리
X_new = df2.drop(columns=['BusinessNum'])
X_new_transformed = prepro_pipe.transform(X_new)

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []
predictions = df2[['BusinessNum']].copy()

for target in targets:
    Y = df[target]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_transformed, Y, random_state=42)
    
    # 모델 정의
    model = xgb.XGBRegressor()
    
    # 모델 학습
    model.fit(X_train, Y_train)

    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    train_mse = mean_squared_error(Y_train, Y_train_pred)
    test_mse = mean_squared_error(Y_test, Y_test_pred)
    train_r2 = r2_score(Y_train, Y_train_pred)
    test_r2 = r2_score(Y_test, Y_test_pred)

    results.append({
        'target': target,
        'train_rmse': np.sqrt(train_mse),
        'test_rmse': np.sqrt(test_mse),
        'train_r2': train_r2,
        'test_r2': test_r2
    })
    
    # 예측값을 predictions 데이터프레임에 추가
    df2_predictions = model.predict(X_new_transformed)
    predictions[f'{target}_pred'] = df2_predictions
    
    # df2에 실제 값이 존재하는 경우
    if target in df2.columns:
        Y_new = df2[target]
        
        # NaN, infinity, 또는 너무 큰 값 처리
        Y_new = Y_new.replace([np.inf, -np.inf], np.nan).fillna(0)
        df2_predictions = np.where(np.isinf(df2_predictions) | np.isnan(df2_predictions), 0, df2_predictions)
        
        # df2에 대한 MSE와 R2 점수 계산
        new_mse = mean_squared_error(Y_new, df2_predictions)
        new_r2 = r2_score(Y_new, df2_predictions)
        
        results.append({
            'target': target,
            'new_rmse': np.sqrt(new_mse),
            'new_r2': new_r2
        })

# 결과 출력
results_df = pd.DataFrame(results)
print(results_df)
print(predictions.head())

      target     train_rmse      test_rmse  train_r2   test_r2      new_rmse  \
0        매출액  200078.044548  537240.535463  0.998646  0.990138           NaN   
1        매출액            NaN            NaN       NaN       NaN  3.477106e+08   
2       영업이익   70490.655469  117153.793533  0.946841  0.851841           NaN   
3       영업이익            NaN            NaN       NaN       NaN  4.363585e+07   
4  당기순이익(손실)   91824.147724  143945.046174  0.888954  0.725099           NaN   
5  당기순이익(손실)            NaN            NaN       NaN       NaN  4.034884e+07   

     new_r2  
0       NaN  
1 -0.000641  
2       NaN  
3 -0.001014  
4       NaN  
5  0.000356  
   BusinessNum      매출액_pred     영업이익_pred  당기순이익(손실)_pred
0   1010109319  1.988485e+05   7784.440918    11173.652344
1   1010204456  9.572129e+05  59599.863281    75845.796875
2   1010600385  2.916406e+06  42327.734375    67710.789062
3   1010607727  3.681526e+05  43321.761719    52980.148438
4   1010709848  2.916406e+06  42327.734375    

In [6]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치제거_원변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_rmse': np.sqrt(train_mse),
                'test_rmse': np.sqrt(test_mse),
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_rmse'] = results_df['train_rmse'].apply(lambda x: int(float(x)))
results_df['test_rmse'] = results_df['test_rmse'].apply(lambda x: int(float(x)))

In [7]:
results_df

Unnamed: 0,target,scaler,model,train_rmse,test_rmse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,7594,32587,0.999884,0.997908
1,매출액,StandardScaler,ElasticNet,242296,245637,0.88144,0.881121
2,매출액,StandardScaler,GradientBoostingRegressor,28275,49715,0.998385,0.99513
3,영업이익,StandardScaler,XGBoost,3064,11468,0.998514,0.980254
4,영업이익,StandardScaler,ElasticNet,54518,56690,0.52974,0.517478
5,영업이익,StandardScaler,GradientBoostingRegressor,11841,16479,0.977815,0.959224
6,당기순이익(손실),StandardScaler,XGBoost,8142,24982,0.986968,0.883146
7,당기순이익(손실),StandardScaler,ElasticNet,50480,52401,0.499133,0.485889
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,22726,27166,0.898479,0.861817


In [8]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치3행이상_원변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_rmse': np.sqrt(train_mse),
                'test_rmse': np.sqrt(test_mse),
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_rmse'] = results_df['train_rmse'].apply(lambda x: int(float(x)))
results_df['test_rmse'] = results_df['test_rmse'].apply(lambda x: int(float(x)))
results_df

Unnamed: 0,target,scaler,model,train_rmse,test_rmse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,48138,150826,0.999705,0.997228
1,매출액,StandardScaler,ElasticNet,929432,901895,0.890127,0.900882
2,매출액,StandardScaler,GradientBoostingRegressor,221122,166561,0.993781,0.996619
3,영업이익,StandardScaler,XGBoost,17040,45575,0.996061,0.968418
4,영업이익,StandardScaler,ElasticNet,224061,208525,0.31905,0.338843
5,영업이익,StandardScaler,GradientBoostingRegressor,82485,74598,0.907713,0.915386
6,당기순이익(손실),StandardScaler,XGBoost,43781,100257,0.96946,0.822724
7,당기순이익(손실),StandardScaler,ElasticNet,209483,197279,0.300836,0.313601
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,108122,113897,0.813744,0.771207


In [9]:
# 데이터로드
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df3 = pd.read_csv(path + '이상치bound_원변수.csv',index_col=0)
df3.drop(columns=['BusinessNum'], inplace=True)
df3 = df3.dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])

# 카테고리와 숫자형 변수 리스트 생성
category_list = df3.select_dtypes(include=['object']).columns
numeric_list = df3.select_dtypes(include=[np.number]).columns.drop(['매출액', '영업이익', '당기순이익(손실)'])

# 스케일러 목록
scalers = [StandardScaler()]

# 모델 목록
models = {
    'XGBoost': xgb.XGBRegressor(),
    'ElasticNet' : ElasticNet(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# 타겟 변수 목록
targets = ['매출액', '영업이익', '당기순이익(손실)']

results = []

for target in targets:
    Y = df3[target]
    X = df3.drop(columns=target)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
    
    for scaler in scalers:
        numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), scaler)
        category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))
        
        prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                            (category_pipe, category_list))
        
        for model_name, model in models.items():
            model_pipe = make_pipeline(prepro_pipe, model)
            
            model_pipe.fit(X_train, Y_train)
            
            Y_train_pred = model_pipe.predict(X_train)
            Y_test_pred = model_pipe.predict(X_test)
            
            train_mse = mean_squared_error(Y_train, Y_train_pred)
            test_mse = mean_squared_error(Y_test, Y_test_pred)
            train_r2 = r2_score(Y_train, Y_train_pred)
            test_r2 = r2_score(Y_test, Y_test_pred)
            
            results.append({
                'target': target,
                'scaler': scaler.__class__.__name__,
                'model': model_name,
                'train_rmse': np.sqrt(train_mse),
                'test_rmse': np.sqrt(test_mse),
                'train_r2': train_r2,
                'test_r2': test_r2
            })



# 결과를 데이터프레임으로 변환
results_df = pd.DataFrame(results)

# int형으로 변환
results_df['train_rmse'] = results_df['train_rmse'].apply(lambda x: int(float(x)))
results_df['test_rmse'] = results_df['test_rmse'].apply(lambda x: int(float(x)))
results_df

Unnamed: 0,target,scaler,model,train_rmse,test_rmse,train_r2,test_r2
0,매출액,StandardScaler,XGBoost,206640,549932,0.998556,0.989666
1,매출액,StandardScaler,ElasticNet,1755891,1719244,0.895753,0.899004
2,매출액,StandardScaler,GradientBoostingRegressor,637419,655593,0.986262,0.985314
3,영업이익,StandardScaler,XGBoost,69092,118176,0.948929,0.849242
4,영업이익,StandardScaler,ElasticNet,223529,222996,0.465459,0.463203
5,영업이익,StandardScaler,GradientBoostingRegressor,140398,144458,0.789121,0.774732
6,당기순이익(손실),StandardScaler,XGBoost,93955,145254,0.883739,0.720074
7,당기순이익(손실),StandardScaler,ElasticNet,204679,206650,0.448255,0.433429
8,당기순이익(손실),StandardScaler,GradientBoostingRegressor,151388,155409,0.698163,0.679568
