In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the training data: read the CSV with its first column as the index,
# discard the two columns that are not used downstream, and keep only rows
# where all three target columns are present.
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'
df = (
    pd.read_csv(path + '원자료변수_이상치_bound변환.csv', index_col=0)
    .drop(columns=['BusinessNum', 'stYear'])
    .dropna(subset=['매출액', '영업이익', '당기순이익(손실)'])
)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Load the rows to predict on (first CSV column becomes the index) and
# discard the 'stYear' column; 'BusinessNum' is kept in this frame.
df2 = pd.read_csv(path + 'predict_ex.csv', index_col=0).drop(columns=['stYear'])

In [None]:
# Print a summary of the prediction-input frame (columns, dtypes, non-null counts).
df2.info()

In [None]:
# Baseline: one default XGBRegressor per target, evaluated on a hold-out split,
# then used to predict the rows in df2.

# Load data
path = 'C:/Users/MSI/Desktop/study/Data/기업정보/'

# Target columns (one independent model is trained for each)
targets = ['매출액', '영업이익', '당기순이익(손실)']

df = (
    pd.read_csv(path + '원자료변수_이상치_bound변환.csv', index_col=0)
    .drop(columns=['BusinessNum', 'stYear'])
    .dropna(subset=targets)
)
df2 = pd.read_csv(path + 'predict_ex.csv', index_col=0).drop(columns=['stYear'])

# Categorical and numeric feature column lists (targets are not features)
category_list = df.select_dtypes(include=['object']).columns
numeric_list = df.select_dtypes(include=[np.number]).columns.drop(targets)

# Preprocessing: median-impute + standardize numeric columns,
# mode-impute + one-hot encode categorical columns.
numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                      (category_pipe, category_list))

# Split BEFORE fitting the preprocessor so that imputation values, scaling
# statistics and one-hot categories are learned from the training rows only;
# fitting on the full frame would leak test-set information into the features.
# Features and all targets are split together, so every target shares the same
# row partition and the split is done once instead of once per target.
X = df.drop(columns=targets)
X_train_raw, X_test_raw, Y_train_all, Y_test_all = train_test_split(
    X, df[targets], random_state=42
)

X_train = prepro_pipe.fit_transform(X_train_raw)
X_test = prepro_pipe.transform(X_test_raw)

# Preprocess the rows to predict with the train-fitted transformer
X_new = df2.drop(columns=['BusinessNum'])
X_new_transformed = prepro_pipe.transform(X_new)

results = []
predictions = df2[['BusinessNum']].copy()

for target in targets:
    Y_train = Y_train_all[target]
    Y_test = Y_test_all[target]

    # Define and fit the model
    model = xgb.XGBRegressor()
    model.fit(X_train, Y_train)

    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    results.append({
        'target': target,
        'train_mse': mean_squared_error(Y_train, Y_train_pred),
        'test_mse': mean_squared_error(Y_test, Y_test_pred),
        'train_r2': r2_score(Y_train, Y_train_pred),
        'test_r2': r2_score(Y_test, Y_test_pred)
    })

    # Add this target's predictions to the predictions DataFrame
    predictions[f'{target}_pred'] = model.predict(X_new_transformed)

# Show the per-target train/test metrics as the cell output
results_df = pd.DataFrame(results)
results_df


In [None]:
# Write the predictions frame to CSV in the data directory. `index` is left at
# its default, so the DataFrame index is written as the first column.
predictions.to_csv(path + '최종의최종의최종의predict.csv')

In [None]:
# Read '(2024_bigdata)2022_predict.csv' from the data directory, decoding it as cp949.
df_2022 = pd.read_csv(path+'(2024_bigdata)2022_predict.csv', encoding='cp949')

In [None]:
# Display the loaded frame as the cell output.
df_2022

# 하이퍼파라미터 튜닝 후

In [None]:
# Tuned models: grid-search an XGBRegressor (with preprocessing inside the
# pipeline) per target, evaluate on a hold-out split, then predict the df2 rows.

# Target columns (one independent model is tuned for each)
targets = ['매출액', '영업이익', '당기순이익(손실)']

# Categorical and numeric feature column lists (targets are not features)
category_list = df.select_dtypes(include=['object']).columns
numeric_list = df.select_dtypes(include=[np.number]).columns.drop(targets)

# Preprocessing: median-impute + standardize numeric columns,
# mode-impute + one-hot encode categorical columns.
numeric_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
category_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

prepro_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                      (category_pipe, category_list))

# Raw (untransformed) feature frames. The preprocessor is the first step of the
# model pipeline below and selects its columns by name, so the pipeline must be
# given DataFrames; feeding it an already-transformed matrix makes the column
# lookup fail and would preprocess twice.
X = df.drop(columns=targets)
X_new = df2.drop(columns=['BusinessNum'])

predictions = df2[['BusinessNum']].copy()

# Hyperparameter grid. make_pipeline names each step after its lowercased
# class name, hence the 'xgbregressor__' prefix.
param_grid = {
    'xgbregressor__n_estimators': [100, 200, 300],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__learning_rate': [0.01, 0.1, 0.3],
    'xgbregressor__subsample': [0.6, 0.8, 1.0],
    'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0]
}

# Features and all targets are split together, so every target shares the same
# row partition and the split is done once instead of once per target.
X_train, X_test, Y_train_all, Y_test_all = train_test_split(
    X, df[targets], random_state=42
)

tuned_results = []

for target in targets:
    Y_train = Y_train_all[target]
    Y_test = Y_test_all[target]

    # Full pipeline: preprocessing + model. Because preprocessing lives inside
    # the pipeline, it is refit on the training part of every CV fold.
    model_pipe = make_pipeline(prepro_pipe, xgb.XGBRegressor())

    # Grid search configuration
    grid_search = GridSearchCV(model_pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

    # Fit on the raw training frame
    grid_search.fit(X_train, Y_train)

    # Predict with the best pipeline found by the search
    best_model = grid_search.best_estimator_
    Y_train_pred = best_model.predict(X_train)
    Y_test_pred = best_model.predict(X_test)

    tuned_results.append({
        'target': target,
        'train_mse': mean_squared_error(Y_train, Y_train_pred),
        'test_mse': mean_squared_error(Y_test, Y_test_pred),
        'train_r2': r2_score(Y_train, Y_train_pred),
        'test_r2': r2_score(Y_test, Y_test_pred),
        'best_params': grid_search.best_params_
    })

    # Add this target's predictions to the predictions DataFrame
    predictions[f'{target}_pred'] = best_model.predict(X_new)

# Show the per-target metrics and selected hyperparameters as the cell output
pd.DataFrame(tuned_results)