### type B XGBoost 모델 학습 + 결과 시각화 + SHAP 시각화

In [None]:
import xgboost
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd

# Load the type-B dataset (CP949-encoded CSV) and drop the date column,
# which is not used as a model input.
df = pd.read_csv("/content/drive/MyDrive/DS/캡스톤디자인 II/xai/최최종데이터_b_지난주추가.csv", encoding = "cp949")
df = df.drop(columns=['week_start_date'])

# Columns to scale: the target ('flw_get') followed by the features.
# The order of this list is the order of scaler.center_ / scaler.scale_.
columns = ['flw_get','last_flw_get','height_pl','num_leaf','len_leaf','width_leaf','unit_len','dim_pipe',
           'temp_ex_day','temp_ex_night','sr_ex_day','temp_int_day','temp_int_night','hum_int_day','hum_int_night',
           'CO2_day','CO2_night','soil_temp_day','soil_temp_night']

# Chronological split by week number. range() excludes its upper bound, so
# training covers weeks 5..25 and testing covers weeks 26..30.
train_weeks = list(range(5, 26))
test_weeks = list(range(26, 31))

# RobustScaler object
scaler = RobustScaler()

# Fit the scaler on the training weeks only, so that statistics of the
# held-out test weeks do not leak into preprocessing, then apply that same
# transformation to every row. 'week' is not in `columns`, so it is unscaled.
train_mask = df['week'].isin(train_weeks)
scaler.fit(df.loc[train_mask, columns])
df[columns] = scaler.transform(df[columns])

# Full-dataset feature/target views (not used for the training below).
X = df.drop(columns=['flw_get'])  # Independent variables
y = df['flw_get']  # Dependent variable

# Group data by week
week_groups = df.groupby('week')

# Training rows: every weekly group whose week number is in train_weeks
train_data = pd.concat([group for week, group in week_groups if week in train_weeks])

# Test rows: every weekly group whose week number is in test_weeks
test_data = pd.concat([group for week, group in week_groups if week in test_weeks])

# Separate features and target; the identifier/bookkeeping columns
# ('id', 'week', 'obj_num') are excluded from the model inputs.
X_train = train_data.drop(columns=['flw_get', 'id', 'week', 'obj_num'])  # Independent variables
y_train = train_data['flw_get']  # Dependent variable
X_test = test_data.drop(columns=['flw_get', 'id', 'week', 'obj_num'])  # Independent variables
y_test = test_data['flw_get']  # Dependent variable

# Baseline XGBoost regressor with hand-picked hyperparameters.
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                                 colsample_bytree=1, max_depth=7)

xgb_model.fit(X_train,y_train)

In [None]:
# Hyperparameter grid (newly chosen values). Every entry holds a single
# candidate, so the search evaluates exactly one parameter combination.
param_grid = dict(
    n_estimators=[1000],
    max_depth=[6],
    learning_rate=[0.03],
    subsample=[0.8],
    gamma=[0.01],
    min_child_weight=[5],
)

# Build the grid search: 3-fold cross-validation scored by negative MSE.
base_regressor = xgboost.XGBRegressor(random_state=100, n_jobs=-1)
grid_search = GridSearchCV(
    estimator=base_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the selected hyperparameters.
print("Best Parameters:", grid_search.best_params_)

# Take the best estimator found by the search and predict the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the best model on the test split.
mse = mean_squared_error(y_test, y_pred)
print(f"MSE (Best Model): {mse}")

# Plot actual vs. predicted target values against the sample position.
plt.figure(figsize=(15, 10))
sample_axis = range(len(y_test))
for series, colour, tag in ((y_test, 'blue', 'Actual'), (y_pred, 'red', 'Predicted')):
    plt.plot(sample_axis, series, color=colour, label=tag)
# Dotted vertical guide lines every 20 samples, at x = 20, 40, ..., 200.
for boundary in range(20, 201, 20):
    plt.axvline(x=boundary, linestyle='dotted', color='gray')
plt.title("Actual vs. Predicted")
plt.legend()
plt.show()

In [None]:
# 설치
!pip install shap

import shap

# Build a masker from the training features to serve as SHAP background data.
masker = shap.maskers.Independent(data=X_train.values)

# Explain the baseline model (xgb_model) using that masker.
explainer = shap.Explainer(xgb_model, masker)

# SHAP values for every test row; indexed below as [row, feature].
shap_values = explainer.shap_values(X_test.values)

# NOTE(review): the plot title below says "XGBoost A" while this notebook
# works on the type-B data - confirm which label is intended.
for selected_feature_index, selected_feature_name in enumerate(X_test.columns):
    feature_values_scaled = X_test[selected_feature_name].values

    # Undo the RobustScaler transform (x_scaled * scale + center) so the
    # x-axis shows original units. scaler.center_ / scaler.scale_ follow the
    # order of `columns` (the list the scaler was fitted on), so look the
    # feature up there instead of relying on a fixed offset into df.columns,
    # which breaks if the CSV column layout changes.
    if selected_feature_name in columns:
        scaler_index = columns.index(selected_feature_name)
        center = scaler.center_[scaler_index]
        scale = scaler.scale_[scaler_index]
        feature_values_original = feature_values_scaled * scale + center
    else:
        # The feature was never scaled, so it is already in original units.
        feature_values_original = feature_values_scaled

    # Dependence plot: original feature value vs. its SHAP value, with
    # points coloured by the (scaled) actual target.
    plt.figure(figsize=(10, 6))

    plt.scatter(feature_values_original, shap_values[:, selected_feature_index], c=y_test)
    plt.xlabel(selected_feature_name)
    plt.ylabel('SHAP Value')
    plt.title(f'XGBoost A with SHAP - Dependence Plot for {selected_feature_name}')
    plt.colorbar(label='Actual Target Value')
    plt.savefig(f"{selected_feature_name}_dependence_plot_original.png")
    plt.show()

In [None]:
from xgboost import plot_tree
from matplotlib.pylab import rcParams  # no longer used here; kept so the name stays available to other cells

# Tree visualisation of the baseline model.
# Draw on an explicitly sized Axes (100 x 200 inches) rather than changing
# rcParams['figure.figsize'], which would alter the default size of every
# figure created later in the session.
_, tree_ax = plt.subplots(figsize=(100, 200))
plot_tree(xgb_model, ax=tree_ax)
plt.show()

In [None]:
# ---- summary plot --------------------------------------------------
import shap

# Create one explainer for the baseline model and reuse it for every plot
# in this cell.
explainer = shap.Explainer(xgb_model)

# SHAP values for the test split, visualised as a summary plot.
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

# ---- bar plot ------------------------------------------------------
# Calling the explainer directly yields the object that shap.plots.bar
# takes; `shap_values` is rebound to it from here on.
shap_values = explainer(X_test)

shap.plots.bar(shap_values)

# ---- interaction plot ----------------------------------------------
# Feature-interaction values, computed on the training split.
shap_interaction_values = explainer.shap_interaction_values(X_train)

shap.summary_plot(shap_interaction_values, X_train)

plt.show()