In [36]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

import pandas as pd
from lightgbm import plot_importance
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier
import torch.nn.functional as F

In [5]:
from process import input_data, modeling, preprocess
data = pd.read_csv('storage/churn.csv')

In [7]:
class Data_load:
    def __init__(self, path):
        self.path = path #데이터 위치 경로 입력
    
    
    # 데이터 불러오기
    def read_data(self):
        df = pd.read_csv(self.path) 
        var_list = df.columns.tolist() #전체 변수리스트 추출
        num_var = df.select_dtypes(include='float').columns.tolist() + df.select_dtypes(include='int').columns.tolist() #수치형 변수 추출
        obj_var = [x for x in df.columns if x not in num_var] #문자형 변수 추출
        return df, var_list, num_var, obj_var

In [13]:
#전처리 class
class Preprocessing:
    def __init__(self, data, var_list, num_var, obj_var, target, anomaly_per, na, outlier, tree_model):
        self.df = data                                     # 데이터
        self.target = target                               # 타겟 변수
        self.var_list = var_list                           # 전체 변수 리스트
        self.num_var = num_var                             # 수치형 변수 리스트
        self.obj_var = obj_var                             # 문자형 변수 리스트
        self._anomaly_ratio = int(anomaly_per)             # 지정 이상치 범위
        self._anomaly_percentage = int(anomaly_per) / 100
        
        self.na_pre = na                                   #결측치 처리 여부
        self.outlier_pre = outlier                         #이상치 처리 여부
        self.stand_pre = tree_model                        #정규화 처리 여부 => 트리 모델의 경우 생략 가능
        
        if self.na_pre:
            self.df = self.na_preprocess(self.df, self._anomaly_ratio)
        
        if self.outlier_pre:
            self.df = self.outlier_preprocess(self.df, self.num_var)
        
        # 타겟 변수 num_var 일 때 분리(obj_var일 경우 라벨 인코딩해야 하기 때문에 제외하지 않음)
        if self.target in self.num_var:
            self.num_var.remove(self.target)
        
        # 표준화
        if self.stand_pre is False:
            self.df = self.standardize(self.df, self.num_var)
        
        # 라벨 인코딩
        self.df = self.label_encoder(self.df, self.obj_var)
        
        if self.target in self.obj_var:
            self.obj_var.remove(self.target)
    
    # 결측치 확인 및 처리
    def na_preprocess(self, df, anomaly_per):
        
        #Column별 결측치 n% 이상 있을 경우 제외
        remove_v1 = round(df.isnull().sum() / len(df)*100, 2)
        tmp_df = df[remove_v1[remove_v1 < anomaly_per].index]
        
        #Row별 결측치 n% 이상 있을 경우 제외
        idx1 = len(tmp_df.columns) * 0.7
        print('결측치 처리')
        return tmp_df.dropna(thresh=idx1, axis=0)
        
        
    # 이상치 제거
    def outlier_preprocess(self, df, num_var):
        num_data = df.loc[:, num_var]
        
        #IQR 기준
        quartile_1 = num_data.quantile(0.25)
        quartile_3 = num_data.quantile(0.75)
        IQR = quartile_3 - quartile_1

        condition = (num_data < (quartile_1 - 1.5 * IQR)) | (num_data > (quartile_3 + 1.5 * IQR)) # 1.5 수치가 바뀌어야함
        condition = condition.any(axis=1)
        search_df = df[condition]
        print('이상치 처리')
        return df.drop(search_df.index, axis=0)
        
    
        
    #트리 모델이 아닐 경우 표준화 진행, 본 사업 '고객 이탈 모형'에서는 진행 x
    def standardize(self, df, num_var):
        num_data = df.loc[:, num_var]
        non_num_data = df.drop(set(num_var), axis=1)
        
        #표준화
        std_scaler = StandardScaler()
        fitted = std_scaler.fit(num_data)
        output = std_scaler.transform(num_data)
        num_data = pd.DataFrame(output, columns = num_data.columns, index=list(num_data.index.values))
        
        tmp_df = pd.concat([non_num_data, num_data], axis=1)
        print('표준화')
        return tmp_df
        
    
    #문자형 변수를 수치형으로 변환
    def label_encoder(self, df, obj_var):
        obj_data = df.loc[:, obj_var]
        non_obj_data = df.drop(set(obj_var), axis=1)

        #인코딩
        obj_output = pd.DataFrame()
        for obj_col in obj_var:
            lb_encoder = LabelEncoder()
            output = lb_encoder.fit_transform(obj_data.loc[:, obj_col])
            output = pd.DataFrame(output, index = list(obj_data.index.values))
            obj_output = pd.concat([obj_output, output], axis=1)
        obj_output.columns = obj_var
        tmp_df = pd.concat([obj_output, non_obj_data], axis=1)
        print('수치형 변환')
        return tmp_df
    
    def get_df(self):
        return self.df
    

In [14]:
data, var_list, num_var, obj_var = Data_load('storage/churn.csv').read_data()
pp = Preprocessing(data, var_list, num_var, obj_var, target='churn', anomaly_per=10, na=False, outlier=False, tree_model=False)
df = pp.df

표준화
수치형 변환


In [16]:
X = df.drop(['churn'], axis=1)
y = df['churn']
X_test = X.copy()

In [32]:
def objective(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "binary",
        "metric": "auc",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    model = LGBMClassifier(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        #early_stopping_rounds=100,
        verbose=False,
    )

    lgb_pred = model.predict_proba(X_valid)[:,1]
    lgb_score = roc_auc_score(y_valid, lgb_pred)
    
    return lgb_score

In [33]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="maximize",
    sampler=sampler,
)

[32m[I 2023-01-20 16:10:34,405][0m A new study created in memory with name: lgbm_parameter_opt[0m


In [41]:
#study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

Best Score: 0.9118336483931948
Best trial: {'reg_alpha': 1.7560829253683595e-07, 'reg_lambda': 0.07339153040632079, 'max_depth': 15, 'num_leaves': 187, 'colsample_bytree': 0.8627622080115674, 'subsample': 0.35183125621386324, 'subsample_freq': 4, 'min_child_samples': 16, 'max_bin': 459}


In [47]:
X_train = df.drop(['churn'], axis=1).values
y_train = df['churn'].values
y_train = y_train.reshape(-1, 1)

In [48]:
from sklearn.model_selection import StratifiedKFold

In [44]:
def Objective(trial):
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                     lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type=mask_type, n_shared=n_shared,
                     scheduler_params=dict(mode="min",
                                           patience=trial.suggest_int("patienceScheduler",low=3,high=10), # changing sheduler patience to be lower than early stopping patience 
                                           min_lr=1e-5,
                                           factor=0.5,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=0,
                     ) #early stopping
    
    folds=StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    
    CV_score_array    =[]
    
    for n_fold, (train_index, val_index) in enumerate(folds.split(X_train, y_train)):

        train_x, val_x = X_train[train_index], X_train[val_index]
        train_y, val_y = y_train[train_index], y_train[val_index]
        
        classifier = TabNetClassifier(**tabnet_params)
        classifier.fit(X_train=train_x, y_train=train_y,
                  eval_set=[(val_x, val_y)],
                  patience=trial.suggest_int("patience",low=15,high=30), max_epochs=trial.suggest_int('epochs', 1, 100),
                  eval_metric=['auc'])
        CV_score_array.append(classifier.best_cost)
    avg = np.mean(CV_score_array)
    return avg

In [45]:
study = optuna.create_study(direction="maximize", study_name='TabNet optimization')
study.optimize(Objective, timeout=6*60) #5 hours

[32m[I 2023-01-20 16:22:59,559][0m A new study created in memory with name: TabNet optimization[0m
[33m[W 2023-01-20 16:22:59,564][0m Trial 0 failed with parameters: {'mask_type': 'entmax', 'n_da': 64, 'n_steps': 3, 'gamma': 1.2, 'n_shared': 3, 'lambda_sparse': 0.00016406955797675018, 'patienceScheduler': 3} because of the following error: NameError("name 'KFold' is not defined").[0m
Traceback (most recent call last):
  File "C:\Users\dpapf\anaconda3\envs\demand\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\dpapf\AppData\Local\Temp\ipykernel_1376\1448854290.py", line 20, in Objective
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
NameError: name 'KFold' is not defined
[33m[W 2023-01-20 16:22:59,565][0m Trial 0 failed with value None.[0m


NameError: name 'KFold' is not defined