## **실습 4. AI 모델링 최적화**
## 본 실습파일은 <u><b>학습자용</u> 입니다.
* 본 과정에서는 웹페이지에서 추출한 Feature(특징) 기반으로 악성사이트를 탐지하는 머신러닝 분류문제를 예제코드를 통해서 해결할 것입니다.
---


### **[실습 프로세스]**
### 0. 데이터 불러오기
### 1. 데이터 전처리
### 2. train_test_split을 이용하여, train_x, test_x, train_y, test_y로 데이터 분리
### 3. GridSearch 활용 AI모델링



In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [12]:
# 0. 데이터 불러오기
data = pd.read_csv('x_train.csv')

In [14]:
# 1. 데이터 전처리
data.head()

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,url_entropy,url_port,html_num_tags('iframe'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('form'),html_num_tags('a')
0,33.0,2.0,0.0,33.0,33.0,2.0,0.0,0.0,0.0,0.0,4.193943,0.0,0.0,0.0,0.0,303.0,1.0,3.0,198.0
1,16.0,1.0,1.0,15.0,15.0,1.0,0.0,0.0,0.0,0.0,4.110093,0.0,0.0,0.0,0.0,73.0,1.0,1.0,9.0
2,53.0,0.0,33.0,20.0,20.0,3.0,1.0,0.0,0.0,0.0,4.372906,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,43.0,0.0,32.0,11.0,11.0,2.0,1.0,0.0,0.0,0.0,4.241077,0.0,0.0,0.0,0.0,45.0,1.0,1.0,275.0
4,14.0,1.0,0.0,14.0,14.0,1.0,0.0,0.0,0.0,0.0,3.913977,0.0,0.0,0.0,0.0,251.0,1.0,2.0,131.0


In [15]:
data.columns

Index(['url_len', 'url_num_hyphens_dom', 'url_path_len', 'url_domain_len',
       'url_hostname_len', 'url_num_dots', 'url_num_underscores',
       'url_query_len', 'url_num_query_para', 'url_ip_present', 'url_entropy',
       'url_port', 'html_num_tags('iframe')', 'html_num_tags('embed')',
       'html_num_tags('object')', 'html_num_tags('div')',
       'html_num_tags('head')', 'html_num_tags('form')', 'html_num_tags('a')'],
      dtype='object')

In [16]:
data.isna().sum()

url_len                    0
url_num_hyphens_dom        0
url_path_len               0
url_domain_len             0
url_hostname_len           0
url_num_dots               0
url_num_underscores        0
url_query_len              0
url_num_query_para         0
url_ip_present             0
url_entropy                0
url_port                   0
html_num_tags('iframe')    0
html_num_tags('embed')     0
html_num_tags('object')    0
html_num_tags('div')       0
html_num_tags('head')      0
html_num_tags('form')      0
html_num_tags('a')         0
dtype: int64

> x_train은 결측치 제거, 열 제거해 준 것

In [24]:
x_train = data
y_train = pd.read_csv('y_train.csv')
x_test = pd.read_csv('x_test.csv')

In [25]:
# KNN, SVM 정규화
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_re = scaler.fit_transform(x_train)

In [None]:
# x_test.drop('id')

# <b>Step 0. 라이브러리 import 및 데이터 불러오기
### **가. 라이브러리 import**

* 데이터 프레임 관련 라이브러리

### **나.  학습데이터 불러오기**

### **다.  데이터 전처리**

### **라. train_test_split을 이용하여 train/test  데이터 분리**

- test_size = 0.3
- random_state = 2021

In [9]:
### 2. train_test_split을 이용하여, train_x, test_x, train_y, test_y로 데이터 분리


((2261, 21), (970, 21), (2261,), (970,))

### **마. Confusion Matrix 함수 정의**
#### Confusion Matrix란 Training 을 통한 Prediction 성능을 측정하기 위해 예측 value와 실제 value를 비교하기 위한 표입니다.
#### 아래 함수는 이번 과제에서 confusion matrix 결과를 보기 쉽게 표현한 것으로 사용 예를 참고하여 모델 결과 확인에 사용하시기 바랍니다.

**<span style="color:green">[참고링크] 공식 Document**</span>
 
* confusion matrix(https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)

In [17]:
from sklearn.metrics import classification_report as creport
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [18]:
def plot_confusion_matrix(ax, matrix, labels = ['malicious','benign'], title='Confusion matrix', fontsize=9):
    ax.set_xticks([x for x in range(len(labels))])
    ax.set_yticks([y for y in range(len(labels))])

    # Place labels on minor ticks
    ax.set_xticks([x + 0.5 for x in range(len(labels))], minor=True)
    ax.set_xticklabels(labels, rotation='90', fontsize=fontsize, minor=True)
    ax.set_yticks([y + 0.5 for y in range(len(labels))], minor=True)
    ax.set_yticklabels(labels[::-1], fontsize=fontsize, minor=True)

    # Hide major tick labels
    ax.tick_params(which='major', labelbottom='off', labelleft='off')

    # Finally, hide minor tick marks
    ax.tick_params(which='minor', width=0)

    # Plot heat map
    proportions = [1. * row / sum(row) for row in matrix]
    ax.pcolor(np.array(proportions[::-1]), cmap=plt.cm.Blues)

    # Plot counts as text
    for row in range(len(matrix)):
        for col in range(len(matrix[row])):
            confusion = matrix[::-1][row][col]
            if confusion != 0:
                ax.text(col + 0.5, row + 0.5, int(confusion),
                        fontsize=fontsize,
                        horizontalalignment='center',
                        verticalalignment='center')

    # Add finishing touches
    ax.grid(True, linestyle=':')
    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel('prediction', fontsize=fontsize)
    ax.set_ylabel('actual', fontsize=fontsize)

    plt.show()

### <span style="color:blue">[예시] Confusion Matrix 사용 방법<span>

- 샘플
#### > confusion = confusion_matrix(test_y, dt_pred)
#### > fig, ax = plt.subplots(figsize=(10,3))
#### > plot_confusion_matrix(ax, confusion, fontsize=30)

---

In [1]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score


In [21]:
# 모델 선언
xgb_model = XGBClassifier()
lgbm_model = LGBMClassifier()
dt_model = DecisionTreeClassifier(random_state=2021)
rf_model = RandomForestClassifier()
knn_model = KNeighborsClassifier()
lr_model = LogisticRegression()
svc_model = SVC()

In [55]:
# 모델 학습
xgb_model.fit(x_train, y_train)
lgbm_model.fit(x_train, y_train)
dt_model.fit(x_train, y_train)
rf_model.fit(x_train, y_train)
knn_model.fit(x_train_re, y_train)
lr_model.fit(x_train, y_train)
svc_model.fit(x_train_re, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  rf_model.fit(x_train, y_train)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


In [28]:
# 성능 예측
xgb_cv_score = cross_val_score(xgb_model, x_train, y_train, cv=10)
lgbm_cv_score = cross_val_score(lgbm_model, x_train, y_train, cv=10)
dt_cv_score = cross_val_score(xgb_model, x_train, y_train, cv=10)
rf_cv_score = cross_val_score(xgb_model, x_train, y_train, cv=10)
knn_cv_score = cross_val_score(xgb_model, x_train, y_train, cv=10)
lr_cv_score = cross_val_score(xgb_model, x_train, y_train, cv=10)
svc_cv_score = cross_val_score(xgb_model, x_train, y_train, cv=10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [29]:
# 결과 확인
print('XGB :', xgb_cv_score.mean())
print('lgbm :', lgbm_cv_score.mean())
print('dt :', dt_cv_score.mean())
print('rf :', rf_cv_score.mean())
print('knn :', knn_cv_score.mean())
print('lr :', lr_cv_score.mean())
print('svc :', svc_cv_score.mean())


XGB : 0.9547589372568094
lgbm : 0.9570966074902725
dt : 0.9547589372568094
rf : 0.9547589372568094
knn : 0.9547589372568094
lr : 0.9547589372568094
svc : 0.9547589372568094


# <b>RandomForest GridSearchCV
### 만족할만한 하이퍼파라미터 조합을 찾는 단순한 방법은 수동으로 하이퍼파라미터를 조정하면서 찾는 방법입니다.
### GridSearchcv는 자동으로 복수개의 내부 모형을 생성하고 이를 모두 실행시켜서 최적의 하이퍼파라미터를 탐색해 줍니다.
### 탐색하고자 하는 하이퍼파라미터를 지정하면 가능한 모든 하이퍼파라미터 조합에 대해 교차 검증을 사용해 평가하게 됩니다.


* 주요 파라미터<br>
<table align="left">
    <tr>
        <td align="center">파라미터 명</td><td align="center">설명</td>
    </tr>
     <tr>
        <td align="center">param_grid</td><td>파라미터 딕셔너리</td>
    </tr>
    <tr>
        <td align="center">scoring</td><td>예측 성능을 측정할 평가 방법</td>
    </tr>
    <tr>
        <td align="center">cv</td><td>교차 검증을 위해 분할되는 폴드 수</td>
    </tr>
</table>

**<span style="color:green">[참고링크] 공식 Document**</span>
 
* GridSearchCV(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
* model evaluation(https://scikit-learn.org/stable/modules/model_evaluation.html)

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [39]:
# Grid Search 선언
param = {'max_depth': range(21, 50)} 
  
xgb_grid_model = GridSearchCV(xgb_model,   # 기본 모델 이름
                     param,       # 앞에서 선언한 튜닝용 파라미터 변수
                     cv = 5,      # k-fold Cross validation(default = 5)
                     scoring='f1' # 평가 방법
                     )

lgbm_grid_model = GridSearchCV(lgbm_model,   # 기본 모델 이름
                     param,       # 앞에서 선언한 튜닝용 파라미터 변수
                     cv = 5,      # k-fold Cross validation(default = 5)
                     scoring='f1' # 평가 방법
                     )

dt_grid_model = GridSearchCV(dt_model,   # 기본 모델 이름
                     param,       # 앞에서 선언한 튜닝용 파라미터 변수
                     cv = 5,      # k-fold Cross validation(default = 5)
                     scoring='f1' # 평가 방법
                     )

rf_grid_model = GridSearchCV(rf_model,   # 기본 모델 이름
                     param,       # 앞에서 선언한 튜닝용 파라미터 변수
                     cv = 5,      # k-fold Cross validation(default = 5)
                     scoring='f1' # 평가 방법
                     )

# knn_grid_model = GridSearchCV(knn_model,   # 기본 모델 이름
#                      param,       # 앞에서 선언한 튜닝용 파라미터 변수
#                      cv = 5,      # k-fold Cross validation(default = 5)
#                      scoring='f1' # 평가 방법
#                      )

lr_grid_model = GridSearchCV(lr_model,   # 기본 모델 이름
                     param,       # 앞에서 선언한 튜닝용 파라미터 변수
                     cv = 5,      # k-fold Cross validation(default = 5)
                     scoring='f1' # 평가 방법
                     )

# svc_grid_model = GridSearchCV(svc_model,   # 기본 모델 이름
#                      param,       # 앞에서 선언한 튜닝용 파라미터 변수
#                      cv = 5,      # k-fold Cross validation(default = 5)
#                      scoring='f1' # 평가 방법
#                      )

In [41]:
# 학습하기
xgb_grid_model.fit(x_train, y_train)
lgbm_grid_model.fit(x_train, y_train)
dt_grid_model.fit(x_train, y_train)
rf_grid_model.fit(x_train, y_train)
# knn_grid_model.fit(x_train_re, y_train)
# lr_grid_model.fit(x_train, y_train)
# svc_grid_model.fit(x_train_re, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.clas

ValueError: Invalid parameter 'max_depth' for estimator LogisticRegression(). Valid parameters are: ['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'].

In [42]:
# 중요 정보 확인
# print(model.cv_results_['mean_test_score'])
print('-' * 80)
print('xgb 최적파라미터:', xgb_grid_model.best_params_)
print('xgb 최고성능:', xgb_grid_model.best_score_)
print('=' * 80)
print('lgbm 최적파라미터:', lgbm_grid_model.best_params_)
print('lgbm 최고성능:', lgbm_grid_model.best_score_)
print('=' * 80)
print('dt 최적파라미터:', dt_grid_model.best_params_)
print('dt 최고성능:', dt_grid_model.best_score_)
print('=' * 80)
print('rf 최적파라미터:', rf_grid_model.best_params_)
print('rf 최고성능:', rf_grid_model.best_score_)
print('=' * 80)
# print('knn 최적파라미터:', knn_grid_model.best_params_)
# print('knn 최고성능:', knn_grid_model.best_score_)
# print('=' * 80)
# print('lr 최적파라미터:', lr_grid_model.best_params_)
# print('lr 최고성능:', lr_grid_model.best_score_)
# print('=' * 80)
# print('svc 최적파라미터:', svc_grid_model.best_params_)
# print('svc 최고성능:', svc_grid_model.best_score_)
# print('=' * 80)

--------------------------------------------------------------------------------
xgb 최적파라미터: {'max_depth': 21}
xgb 최고성능: 0.9503355032376908
lgbm 최적파라미터: {'max_depth': 22}
lgbm 최고성능: 0.9533209070848256
dt 최적파라미터: {'max_depth': 21}
dt 최고성능: 0.922826006092458
rf 최적파라미터: {'max_depth': 33}
rf 최고성능: 0.9549832785811102


#### 성능 평가

In [45]:
x_test.drop('Unnamed: 0', axis=1, inplace=True)

In [46]:
# 예측하기
xgb_y_pred = xgb_grid_model.predict(x_test)
lgbm_y_pred = lgbm_grid_model.predict(x_test)
dt_y_pred = dt_grid_model.predict(x_test)
rf_y_pred = rf_grid_model.predict(x_test)
# knn_y_pred = knn_grid_model.predict(x_test)
# lr_y_pred = lr_grid_model.predict(x_test)
# svc_y_pred = svc_grid_model.predict(x_test)

In [47]:
xgb_y_pred

array([0, 0, 0, ..., 0, 1, 1])

In [52]:
# xgb_y_pred = pd.DataFrame({'id': x_test.index, 'label' : xgb_y_pred})
# xgb_y_pred['label'].replace([0, 1], ['benign', 'malicious'] , inplace=True)

lgbm_y_pred = pd.DataFrame({'id': x_test.index, 'label' : lgbm_y_pred})
lgbm_y_pred['label'].replace([0, 1], ['benign', 'malicious'] , inplace=True)

dt_y_pred = pd.DataFrame({'id': x_test.index, 'label' : dt_y_pred})
dt_y_pred['label'].replace([0, 1], ['benign', 'malicious'] , inplace=True)

rf_y_pred = pd.DataFrame({'id': x_test.index, 'label' : rf_y_pred})
rf_y_pred['label'].replace([0, 1], ['benign', 'malicious'] , inplace=True)

In [53]:
lgbm_y_pred

Unnamed: 0,id,label
0,0,benign
1,1,benign
2,2,benign
3,3,malicious
4,4,malicious
...,...,...
2436,2436,benign
2437,2437,malicious
2438,2438,benign
2439,2439,malicious


In [54]:
xgb_y_pred.to_csv('xgb_y_pred_성능튜닝.csv', index=False)
lgbm_y_pred.to_csv('lgbm_y_pred_성능튜닝.csv', index=False)
dt_y_pred.to_csv('dt_y_pred_성능튜닝.csv', index=False)
rf_y_pred.to_csv('rf_y_pred_성능튜닝.csv', index=False)
