# setting

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import shap

## target

In [2]:
# Column of the loaded data set that the models are fitted to predict.
target: str = 'EV_EBITDA'

## corporation

In [3]:
# Corporation whose CSV is loaded; interpolated into the data file path.
corp: str = 'Accenture'

## model type

In [4]:
# Baseline option: ordinary least squares with default hyper-parameters.
# NOTE: the following cell reassigns both names, so this choice only takes
# effect when that cell is skipped.
model_type, model_kwargs = 'LinearRegression', dict()

In [5]:
# Gradient-boosted trees; the keyword arguments below are forwarded
# unchanged to the XGBRegressor constructor.
model_type = 'XGBRegressor'
model_kwargs = dict(
    n_estimators=30,
    max_depth=10,
    max_leaves=0,
    colsample_bytree=0.5,
)

In [6]:
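# Alternative configuration (disabled): uncomment to use a random forest.
# NOTE: in scikit-learn an integer `max_samples` is an absolute sample count,
#       so `1` makes every tree draw a single sample; use the float `1.0`
#       to draw as many samples as there are training rows.
# model_type = 'RandomForestRegressor'
# model_kwargs = {
#     'n_estimators': 10,
#     'max_depth': 10,
#     'max_samples': 1,
#     'random_state': 123
# }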

## feature

In [7]:
# Predictor columns, in the order they are fed to the model.
# The strings must match the CSV column headers exactly.
features = [
    '매출액증가율(YoY)(연도)',
    'Gross Investment (GI)',
    '무형자산회전율s',
    '매출총이익(천원)',
    'EBITDA2마진율(비율)',
    '유동비율(비율)',
    '유동자산(천원)',
]

## etc.

In [8]:
# Cross-validation settings: number of folds, and the seed passed to the
# shuffling KFold splitter so the splits are reproducible.
cv_k, cv_rs = 5, 123

# main

## load data

In [9]:
# Earlier data location, kept for reference:
# datdir = f"../Data/clean_filter_v1.3/ITS_v1.3/{corp}_non_shift_clean_v1.{4 if corp=='Tcs' else 3}.csv"
# The file-name version suffix is 4 for 'Tcs' and 3 for every other corporation.
datdir = f"../01. Data/01. 재무지표1/ITS/{corp}_non_shift_clean_v1.{4 if corp=='Tcs' else 3}.csv"
data = pd.read_csv(datdir)

# Parse the 'Date_x' column (read as 'YYYY-MM' text) in one vectorized call
# and use the resulting timestamps as the row index.
date = pd.to_datetime(data['Date_x'].astype(str), format='%Y-%m')
data.index = date

# Remove the date, price and valuation columns first so none of them can be
# selected as a predictor, then keep only the configured features.
excluded_columns = ['Date_x', 'close', 'close_weighted', 'EV_EBITDA', 'PBR']
X = data.drop(columns=excluded_columns)[features]
y = data[target]

## random split (k-fold)

In [10]:
# K-fold cross-validated SHAP attribution.
#
# For every fold: standardize the features using statistics from the training
# rows only, fit the configured model, and compute SHAP values on the held-out
# rows. The per-fold SHAP values are stacked into `SHAP` (one row per held-out
# observation, plus a 'fold' column) and summarized in `mshap` as the mean
# absolute SHAP value of each feature.

# Not populated in this cell; kept because other cells may rely on them.
r2_train, r2_test, coef = [], [], []

# Explicit lookup instead of eval() on a formatted string: no arbitrary code
# execution, and an unsupported name fails immediately with a clear message.
model_classes = {
    'LinearRegression': LinearRegression,
    'XGBRegressor': XGBRegressor,
    'RandomForestRegressor': RandomForestRegressor,
}
if model_type not in model_classes:
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected one of {sorted(model_classes)}"
    )

# split train/test
shap_frames = []  # one DataFrame per fold, concatenated once after the loop
kf = KFold(n_splits=cv_k, random_state=cv_rs, shuffle=True)
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):

    # KFold yields integer positions. `y` carries a DatetimeIndex, so it must
    # be indexed positionally with .iloc rather than with y[...].
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]

    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]

    # scale (fitted on the training rows only, then applied to the test rows)
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    # train
    model = model_classes[model_type](**model_kwargs).fit(X_train, y_train)

    # Not used further in this cell; kept because other cells may rely on them.
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # The linear explainer takes the training data as background;
    # the tree explainer works from the fitted model alone.
    if model_type == 'LinearRegression':
        explainer = shap.LinearExplainer(model, X_train)
    else:
        explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    fold_shap = pd.DataFrame(shap_values, index=X_test.index, columns=X_test.columns)
    fold_shap['fold'] = float(fold)  # appended last, so feature columns come first
    shap_frames.append(fold_shap)

# DataFrame.append was removed in pandas 2.0; build the result with one concat.
SHAP = pd.concat(shap_frames)

# Mean absolute SHAP value per feature; the index follows the column order of SHAP.
mshap = SHAP.drop(['fold'], axis='columns').abs().mean().to_frame()


ntree_limit is deprecated, use `iteration_range` or model slicing instead.
ntree_limit is deprecated, use `iteration_range` or model slicing instead.
ntree_limit is deprecated, use `iteration_range` or model slicing instead.
ntree_limit is deprecated, use `iteration_range` or model slicing instead.
ntree_limit is deprecated, use `iteration_range` or model slicing instead.


In [13]:
# Give the single summary column a descriptive header, then render the table.
shap_column_label = 'mean(|shap|)'
mshap.columns = [shap_column_label]
display(mshap)

Unnamed: 0,mean(|shap|)
매출액증가율(YoY)(연도),0.265455
Gross Investment (GI),1.041568
무형자산회전율s,0.65984
매출총이익(천원),1.203795
EBITDA2마진율(비율),0.161434
유동비율(비율),0.171655
유동자산(천원),1.380974
