In [3]:
import optuna

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier

In [2]:
conda install -c conda-forge lightgbm

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\NTX550\anaconda3

  added / updated specs:
    - lightgbm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    lightgbm-3.2.1             |   py39hd77b12b_0         754 KB
    ------------------------------------------------------------
                                           Total:         754 KB

The following NEW packages will be INSTALLED:

  lightgbm           pkgs/main/win-64::lightgbm-3.2.1-py39hd77b12b_0 None



Downloading and Extracting Packages

lightgbm-3.2.1       | 754 KB    |            |   0% 
lightgbm-3.2.1       | 754 KB    | #2         |  13% 
lightgbm-3.2.1       | 754 KB    | ########## | 100% 
lightgbm-3.2.1       | 754 KB    | ########## | 100% 
Preparing transaction: ...working... done
Verifying trans

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [None]:
##### data label split

In [5]:
# Load the raw table from total.csv (path is relative to the working directory).
data = pd.read_csv("total.csv")

In [10]:
# Show column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  1464 non-null   object
 1   brand_name    1462 non-null   object
 2   season        1464 non-null   object
 3   gender        1464 non-null   int64 
 4   rating        1464 non-null   int64 
 5   review_cnt    1464 non-null   int64 
 6   exit          1464 non-null   object
 7   goodsnum      1464 non-null   int64 
 8   price         1464 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 103.1+ KB


In [14]:
# Drop the product_name column (column 0 of the loaded frame) by label.
# The original positional slice `iloc[:, 1:]` removed one more column every time
# the cell was re-run; dropping by name with errors="ignore" is idempotent.
data = data.drop(columns=["product_name"], errors="ignore")

In [15]:
# Re-check the schema after removing the first column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   brand_name  1462 non-null   object
 1   season      1464 non-null   object
 2   gender      1464 non-null   int64 
 3   rating      1464 non-null   int64 
 4   review_cnt  1464 non-null   int64 
 5   exit        1464 non-null   object
 6   goodsnum    1464 non-null   int64 
 7   price       1464 non-null   int64 
dtypes: int64(5), object(3)
memory usage: 91.6+ KB


In [17]:
# 2) Missing-value handling
# Input: df --> fill missing values --> output: df

def check_fillna(df):
    """Fill missing ``brand_name`` entries with the placeholder "RISINGWAVE".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``brand_name`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with no missing values left in ``brand_name``.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form acts on a temporary object and is
    # not guaranteed to update ``df`` (it has no effect under pandas Copy-on-Write).
    df["brand_name"] = df["brand_name"].fillna("RISINGWAVE")
    return df

In [18]:
# Remove columns that are not needed for modelling.
# df --> drop columns --> df

def drop_features(df):
    """Drop the ``goodsnum`` column from ``df`` in place and return the same frame.

    Raises ``KeyError`` (from pandas) when ``goodsnum`` is not present.
    """
    unused_columns = ["goodsnum"]
    df.drop(columns=unused_columns, inplace=True)
    return df

In [19]:
def encode_feature(df):
    """Replace the categorical text columns of ``df`` with integer label codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``brand_name``, ``season`` and ``exit`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with those three columns label-encoded.
    """
    # Columns to encode.
    features = ["brand_name", "season", "exit"]
    for f in features:
        # A fresh encoder is fitted for each column.
        le = LabelEncoder()
        # Assign through df[f] so the column is replaced and takes the integer dtype
        # of the codes. With pandas >= 2.0, `df.loc[:, f] = codes` writes into the
        # existing column and leaves it as dtype object.
        df[f] = le.fit_transform(df[f])
    return df

In [20]:
### Preprocessing wrapped into a single function

def musinsa_preprocess(df):
    """Run the preprocessing steps on ``df`` in order and return the result.

    Steps: check_fillna, then drop_features, then encode_feature.
    """
    for step in (check_fillna, drop_features, encode_feature):
        df = step(df)
    return df
#### ######

In [22]:
# Target: the rating column.
# (Translated from the original note: run separately for each target — like / binned rating.)
y_data = data["rating"]
# Features: every column except the target.
X_data = data.drop("rating", axis=1)
X_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   brand_name  1462 non-null   object
 1   season      1464 non-null   object
 2   gender      1464 non-null   int64 
 3   review_cnt  1464 non-null   int64 
 4   exit        1464 non-null   object
 5   goodsnum    1464 non-null   int64 
 6   price       1464 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 80.2+ KB


In [23]:
# Rebuild the feature frame from `data` on every run so this cell is idempotent.
# Feeding X_data back into itself failed on a second run: drop_features raises
# KeyError once "goodsnum" has already been removed.
X_data = musinsa_preprocess(data.drop(columns=["rating"]))
X_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   brand_name  1464 non-null   int32
 1   season      1464 non-null   int32
 2   gender      1464 non-null   int64
 3   review_cnt  1464 non-null   int64
 4   exit        1464 non-null   int32
 5   price       1464 non-null   int64
dtypes: int32(3), int64(3)
memory usage: 51.6 KB


In [None]:
#### train_test split & kfold 선언

In [24]:
# Hold out 20% of the rows as the test split; the seed is fixed for repeatability.
# (Original note: if the target is categorised, use stratify.)
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=1111,
)
# Shuffled 5-fold splitter used by the cross-validation runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=1111)

In [None]:
#### RF base line

In [25]:
# Baseline: a random forest with default hyper-parameters, scored by cross-validation.
rf = RandomForestClassifier(random_state=1111, n_jobs=-1)
scores = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=kfold)

# Report the accuracy of each fold, then the mean over all folds.
for fold_no, acc in enumerate(scores, start=1):
    print("RF {0}번째 검증 ACC : {1:.4f}".format(fold_no, acc))
print("RF 평균 ACC : ", scores.mean())

RF 1번째 검증 ACC : 0.8723
RF 2번째 검증 ACC : 0.8034
RF 3번째 검증 ACC : 0.8761
RF 4번째 검증 ACC : 0.8590
RF 5번째 검증 ACC : 0.8333
RF 평균 ACC :  0.8488270594653574


In [None]:
#### optuna 돌리기 위한 설정

In [26]:
# Candidate random-forest hyper-parameter values.
# NOTE(review): in the visible cells this dict is not read by rf_objective and is
# reassigned before the grid search — it looks unused; confirm before removing.
parameters = {
    "n_estimators" : [10,50,100, 500],
    "max_features" : ["sqrt", "log2"],
    "max_depth" : [2,3,5,10,30,50],
    "min_samples_split" : [2,4,6,10,30],
    "min_samples_leaf" : [1,2,3,5,10]
}

In [27]:
def rf_objective(trial):
    """Optuna objective: mean cross-validated accuracy of a random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean accuracy over the folds of the notebook-level ``kfold`` splitter,
        evaluated on ``X_train`` / ``y_train``.
    """
    params = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "n_estimators": trial.suggest_int("n_estimators", 100, 3000),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 15),
        # An integer max_features is a number of columns, bounded by the feature count.
        "max_features": trial.suggest_int("max_features", 1, X_train.shape[1]),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 1000),
    }
    rf = RandomForestClassifier(n_jobs=-1, random_state=1111, **params)
    # No explicit rf.fit() here: cross_val_score clones and fits the estimator for
    # every fold, so a prior fit on the full training set only wasted time.
    scores = cross_val_score(rf, X_train, y_train, cv=kfold, scoring="accuracy")
    return scores.mean()

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across kernel restarts (an unseeded study gives a different search every run).
rf_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1111),
)
rf_study.optimize(rf_objective, n_trials=50)

[32m[I 2022-11-17 11:06:34,520][0m A new study created in memory with name: no-name-2ad35105-e0ff-4015-8009-dda1c81cdcc8[0m
[32m[I 2022-11-17 11:06:58,771][0m Trial 0 finished with value: 0.8616402982360428 and parameters: {'criterion': 'gini', 'n_estimators': 2177, 'min_samples_leaf': 15, 'max_features': 2, 'max_depth': 39, 'max_leaf_nodes': 265}. Best is trial 0 with value: 0.8616402982360428.[0m
[32m[I 2022-11-17 11:07:06,584][0m Trial 1 finished with value: 0.8565157301327513 and parameters: {'criterion': 'entropy', 'n_estimators': 573, 'min_samples_leaf': 9, 'max_features': 3, 'max_depth': 13, 'max_leaf_nodes': 744}. Best is trial 0 with value: 0.8616402982360428.[0m
[32m[I 2022-11-17 11:07:26,246][0m Trial 2 finished with value: 0.860785597381342 and parameters: {'criterion': 'gini', 'n_estimators': 1852, 'min_samples_leaf': 14, 'max_features': 2, 'max_depth': 11, 'max_leaf_nodes': 968}. Best is trial 0 with value: 0.8616402982360428.[0m
[32m[I 2022-11-17 11:07:35,31

In [None]:
# Hyper-parameters of the best random-forest trial.
rf_study.best_params

In [None]:
### 중요 파라미터 확인 및 이를 이용한 그리드서치

In [None]:
# Plot the hyper-parameter importances estimated from the random-forest study.
optuna.visualization.plot_param_importances(rf_study)

In [None]:
# Hand-picked grid for an exhaustive random-forest search.
parameters = {
    "n_estimators": [95, 100, 105, 110, 300],
    "max_features": ["sqrt"],
    "max_depth": [3, 4, 5, 6],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [2, 3, 4],
}
rf = RandomForestClassifier(random_state=1111, n_jobs=-1)
rf_kf_gs = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring="accuracy",
    cv=kfold,
    n_jobs=-1,
)
rf_kf_gs.fit(X_train, y_train)

In [None]:
# Score the best estimator found by the grid search on the held-out test split.
rf_kf_gs_best = rf_kf_gs.best_estimator_
rf_kf_gs_best_ypred = rf_kf_gs_best.predict(X_test)
rf_kf_gs_best_acc = accuracy_score(y_true=y_test, y_pred=rf_kf_gs_best_ypred)
print("RF 그리드서치 정확도 : ", rf_kf_gs_best_acc)

In [None]:
# Persist the tuned random forest to disk with joblib.
# NOTE(review): the file name says "sephora" while the preprocessing helper is
# named musinsa_preprocess — confirm the name is intentional.
joblib.dump(rf_kf_gs_best, "RF_model_sephora.pkl")

In [None]:
##################################################

In [None]:
### xgboost base line

In [None]:
# XGBoost classifier with default hyper-parameters, used as the boosting baseline.
xgbc = XGBClassifier(
    use_label_encoder=False,
    random_state=1111,
    n_jobs=-1,
)
# Shuffled 5-fold splitter (same settings as the one declared after the train/test split).
kfold = KFold(n_splits=5, shuffle=True, random_state=1111)

In [None]:
# Cross-validated accuracy of the baseline XGBoost classifier.
scores = cross_val_score(xgbc, X_train, y_train, scoring="accuracy", cv=kfold)

# Report the accuracy of each fold, then the mean over all folds.
for fold_no, acc in enumerate(scores, start=1):
    print("xgbc {0}번째 검증 ACC : {1:.4f}".format(fold_no, acc))
print("xgbc 평균 ACC : ", scores.mean())

In [None]:
### xgboost optuna

In [None]:
def xgbc_objective(trial):
    """Optuna objective: mean cross-validated accuracy of an XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean accuracy over the folds of the notebook-level ``kfold`` splitter,
        evaluated on ``X_train`` / ``y_train``.
    """
    # 1) Search space: hyper-parameter values sampled for this trial.
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "n_estimators": trial.suggest_int("n_estimators", 50, 2500),
        # Sampled from [0.2, 1.0] on a 0.05 grid, mirroring colsample_bytree.
        # The original call passed (0.2, 0.05) as (low, high); Optuna rejects
        # low > high, so every trial failed before training.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0, step=0.05),
        # suggest_float(step=...) replaces the deprecated suggest_discrete_uniform.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0, step=0.05),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.003, 100),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.001, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.8),
    }
    # 2) Model that receives the sampled parameters.
    # NOTE(review): "binary:logistic" presumes a two-class target — confirm that
    # `rating` is binary.
    xgbc = XGBClassifier(
        n_jobs=-1,
        random_state=1111,
        objective="binary:logistic",
        eval_metric="error",
        **params,
    )
    # 3) Cross-validation. No explicit xgbc.fit() first: cross_val_score clones and
    #    fits the estimator per fold, so the extra full fit only wasted time.
    scores = cross_val_score(xgbc, X_train, y_train, cv=kfold, scoring="accuracy")
    # 4) Value the study maximises: the mean fold accuracy.
    return scores.mean()

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across kernel restarts (an unseeded study gives a different search every run).
xgbc_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1111),
)
xgbc_study.optimize(xgbc_objective, n_trials=50)

In [None]:
# Hyper-parameters of the best XGBoost trial.
xgbc_study.best_params

In [None]:
# 중요 파라미터 확인 및 그리드 서치

In [None]:
# Plot the hyper-parameter importances estimated from the XGBoost study.
optuna.visualization.plot_param_importances(xgbc_study)

In [None]:
# Hand-picked grid for an exhaustive XGBoost search.
parameters = {
    # Number of boosted models built one after another.
    "n_estimators": [40, 50],
    # Step size: correct the error in small steps or in large ones.
    "learning_rate": [0.09, 0.11],
    # Controls on the individual trees.
    "max_depth": [7, 8],
    "min_child_weight": [0.4, 0.5],
    "gamma": [0, 0.1],
    # Row / column sampling controls.
    "subsample": [0.3, 0.85],
    "colsample_bytree": [0.3, 0.85],
    # Regularisation, since boosting tends to overfit.
    "reg_alpha": [0, 0.005],
    "reg_lambda": [9, 10],
}
xgb_kf_gs = GridSearchCV(
    estimator=xgbc,
    param_grid=parameters,
    scoring="accuracy",
    cv=kfold,
    n_jobs=-1,
)
xgb_kf_gs.fit(X_train, y_train)

In [None]:
# Score the best estimator found by the grid search on the held-out test split.
xgb_kf_gs_best = xgb_kf_gs.best_estimator_
xgb_kf_gs_best_ypred = xgb_kf_gs_best.predict(X_test)
xgb_kf_gs_best_acc = accuracy_score(y_true=y_test, y_pred=xgb_kf_gs_best_ypred)
print("XGB 그리드서치 정확도 : ", xgb_kf_gs_best_acc)

In [None]:
#############################################################

In [None]:
### lgbm base line

In [None]:
# LightGBM classifier with default hyper-parameters, used as the baseline.
lgbm = LGBMClassifier(n_jobs=-1, random_state=1111)