In [4]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
import warnings
from imblearn.combine import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_classification
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

# 한글 설정
import matplotlib.font_manager as fm

# font_list = fm.findSystemFonts(fontpaths=None, fontext='ttf')
# ttf 폰트 전체개수
# print(len(font_list))
# [(f.name, f.fname) for f in fm.fontManager.ttflist if 'Gothic' in f.name]

#font_path = 'C:\\WINDOWS\\Fonts\\malgunsl.ttf'
#font_prop = fm.FontProperties(fname=font_path, size=12)
#font_name = font_prop.get_name()

#rc('font', family=font_name)

## Missing Values 처리 ##

In [5]:
def have_missing_value(checkData):
    """Count the missing (null) entries of every column.

    Args:
        checkData: DataFrame to inspect.

    Returns:
        The result of ``checkData.isnull().sum()``, i.e. one null count per
        column, indexed by column label.
    """
    null_mask = checkData.isnull()
    return null_mask.sum()

In [6]:
def Impute_missing_value_numeric(checkData, cont, imputeValue):
    """Replace missing values of the numeric features with ``imputeValue``.

    The frame is modified in place (as before) and is now also returned; the
    previous implementation returned the result of ``fillna(inplace=True)``,
    which is always ``None``.

    Args:
        checkData: DataFrame to impute; mutated in place.
        cont: label or list of labels of the numeric columns to fill. ``None``
            fills every column of the frame.
        imputeValue: replacement value handed to ``fillna``.

    Returns:
        ``checkData`` itself, with the selected columns imputed.
    """
    if cont is None:
        checkData.fillna(value=imputeValue, inplace=True)
        return checkData

    columns = [cont] if isinstance(cont, str) else list(cont)
    if columns:
        # Restrict the fill to the numeric features so that missing values in
        # other (e.g. categorical) columns are not overwritten with a number.
        checkData[columns] = checkData[columns].fillna(value=imputeValue)

    return checkData

In [7]:
def Impute_missing_value_categorical(checkData, cate):
    """Label-encode categorical features in place (integer codes, not one-hot).

    Each column is converted with ``astype('category').cat.codes``; pandas
    assigns the code ``-1`` to missing values. Every column is encoded exactly
    once: the previous version re-encoded all columns on every loop iteration,
    which turned the ``-1`` missing marker into ``0`` and shifted all real
    codes by one whenever more than one column was passed.

    Args:
        checkData: DataFrame holding the features; mutated in place.
        cate: label or list of labels of the columns to encode.

    Returns:
        ``checkData`` itself, with the selected columns replaced by their codes.
    """
    columns = [cate] if isinstance(cate, str) else list(cate)

    for col in columns:
        as_category = checkData[col].astype('category')

        # Show which category each integer code stands for (position == code).
        print("['" + str(col) + "'].astype('category').cat.categories")
        print(list(as_category.cat.categories))

        checkData[col] = as_category.cat.codes

    return checkData

## <font color='red'>비대칭 데이터 처리</font>
- 비대칭 데이터는 데이터 비율을 맞추면 정밀도(precision)가 향상된다.
 - 오버샘플링(Over-Sampling)  : 소수 클래스 데이터를 추가함.
 - 언더샘플링(Under-Sampling) : 다수 클래스 데이터에서 일부만 사용함 (예: Tomek links)
 - 복합샘플링(Combining Over-and Under-Sampling) : SMOTE + Tomek
 - 비대칭 데이터 확인 :  trainData.gender.value_counts()
- pip install -U imbalanced-learn

In [8]:
def under_Sampling_DecisionTree(X_train, y_train, X_test=None, y_test=None):
    """Under-sample the training set with Tomek links, fit a tree and print its report.

    Args:
        X_train: training features.
        y_train: training target.
        X_test: evaluation features. When omitted, the notebook-level global
            ``X_test`` is used (the original, implicit behaviour).
        y_test: evaluation target. When omitted, the notebook-level global
            ``y_test`` is used.

    Returns:
        None. The classification report is printed.

    Raises:
        NameError: if the evaluation data is neither passed nor defined globally.
    """
    # Function-scope imports: TomekLinks and DecisionTreeClassifier are not
    # provided by the notebook's import cell, so the original raised NameError.
    from imblearn.under_sampling import TomekLinks
    from sklearn.metrics import classification_report
    from sklearn.tree import DecisionTreeClassifier

    if X_test is None or y_test is None:
        # Legacy call style: fall back to the globals the original relied on.
        scope = globals()
        try:
            X_test = scope['X_test'] if X_test is None else X_test
            y_test = scope['y_test'] if y_test is None else y_test
        except KeyError as missing:
            raise NameError(
                "name {} is not defined; pass X_test and y_test explicitly".format(missing)
            ) from None

    # fit_resample is the current imbalanced-learn spelling of fit_sample.
    X_resampled, y_resampled = TomekLinks().fit_resample(X_train, y_train)

    tree = DecisionTreeClassifier(max_depth=4, random_state=0)
    tree.fit(X_resampled, y_resampled)

    print(classification_report(y_test, tree.predict(X_test)))

In [9]:
def combining_Sampling_DecisionTree(X_train, y_train, X_test=None, y_test=None):
    """Resample the training set with SMOTE + Tomek links, fit a tree and print its report.

    Args:
        X_train: training features.
        y_train: training target.
        X_test: evaluation features. When omitted, the notebook-level global
            ``X_test`` is used (the original, implicit behaviour).
        y_test: evaluation target. When omitted, the notebook-level global
            ``y_test`` is used.

    Returns:
        None. The classification report is printed.

    Raises:
        NameError: if the evaluation data is neither passed nor defined globally.
    """
    # Function-scope imports: DecisionTreeClassifier is not provided by the
    # notebook's import cell, so the original raised NameError.
    from imblearn.combine import SMOTETomek
    from sklearn.metrics import classification_report
    from sklearn.tree import DecisionTreeClassifier

    if X_test is None or y_test is None:
        # Legacy call style: fall back to the globals the original relied on.
        scope = globals()
        try:
            X_test = scope['X_test'] if X_test is None else X_test
            y_test = scope['y_test'] if y_test is None else y_test
        except KeyError as missing:
            raise NameError(
                "name {} is not defined; pass X_test and y_test explicitly".format(missing)
            ) from None

    # fit_resample is the current imbalanced-learn spelling of fit_sample.
    X_resampled, y_resampled = SMOTETomek(random_state=0).fit_resample(X_train, y_train)

    tree = DecisionTreeClassifier(max_depth=4, random_state=0)
    tree.fit(X_resampled, y_resampled)

    print(classification_report(y_test, tree.predict(X_test)))

## Modeling ##

In [1]:
def data_split_train_test(objData, delCols, dfy, testSize):
    """Drop the non-analysis columns, separate the target and split into train/test.

    Args:
        objData: DataFrame to split; it is not modified.
        delCols: column label(s) excluded from the analysis.
        dfy: label of the target column.
        testSize: value forwarded as ``test_size`` to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for (features, target), i.e.
        X_train, X_test, y_train, y_test. The split uses ``random_state=0``.
    """
    analysis_frame = objData.drop(delCols, axis=1)

    target = analysis_frame[dfy]
    features = analysis_frame.drop(dfy, axis=1)

    return train_test_split(features, target, test_size=testSize, random_state=0)

In [11]:
def decisionTree_modeling(X_train, X_test, y_train, y_test):
    """Fit a depth-6 decision tree and print its classification report.

    Args:
        X_train, y_train: data the tree is fitted on.
        X_test, y_test: data the report is computed on.

    Returns:
        None. The report is written to stdout.
    """
    from sklearn.metrics import classification_report
    from sklearn.tree import DecisionTreeClassifier

    classifier = DecisionTreeClassifier(max_depth=6, random_state=0)
    classifier.fit(X_train, y_train)

    report = classification_report(y_test, classifier.predict(X_test))
    print(report)

In [12]:
def svm_modeling(X_train, X_test, y_train, y_test):
    """Fit an SVC (random_state=0, otherwise default settings) and print its report.

    Args:
        X_train, y_train: data the classifier is fitted on.
        X_test, y_test: data the report is computed on.

    Returns:
        None. The report is written to stdout.
    """
    from sklearn.svm import SVC

    classifier = SVC(random_state=0)
    classifier.fit(X_train, y_train)

    report = classification_report(y_test, classifier.predict(X_test))
    print(report)

In [13]:
def neural_network_modeling(X_train, X_test, y_train, y_test):
    """Fit a multi-layer perceptron classifier and print its classification report.

    The classifier is seeded with ``random_state=0`` like the other models in
    this notebook; it was previously unseeded, so repeated runs could print
    different reports.

    Args:
        X_train, y_train: data the network is fitted on.
        X_test, y_test: data the report is computed on.

    Returns:
        None. The report is written to stdout.
    """
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(random_state=0)
    mlp.fit(X_train, y_train)

    print(classification_report(y_test, mlp.predict(X_test)))

In [14]:
def bestModel(model, nFolds, searchCV, X_data, y_data, isScaler, isPloy, isFeatureUnion, scoring, nJobs, nIter, verbose, testSize=0.2):
    """Tune a preprocessing + classifier pipeline and return its best hyper-parameters.

    The pipeline is assembled as: optional PolynomialFeatures (and, nested
    under it, an optional PCA + SelectKBest feature union) -> scaler ->
    classifier. The grids are deliberately small to keep the search affordable.

    Args:
        model: 'SVC', 'XGB' or 'LGBM'.
        nFolds: number of stratified shuffle-split iterations.
        searchCV: 'RANDOM' selects RandomizedSearchCV; anything else GridSearchCV.
        X_data, y_data: features / target passed to ``fit``.
        isScaler: 'STANDARD' -> StandardScaler; any other value -> MinMaxScaler.
        isPloy: True adds PolynomialFeatures (degree searched over [1, 2]).
        isFeatureUnion: True adds the feature union; only honoured when
            ``isPloy`` is True as well.
        scoring, nJobs, verbose: forwarded to the search object.
        nIter: number of sampled candidates (random search only).
        testSize: hold-out size of every split (default 0.2). The previous code
            passed ``nFolds`` here, i.e. the fold count as the test-set size.

    Returns:
        ``best_params_`` of the fitted search.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Not provided by the notebook's import cell, so import it where it is used.
    from sklearn.model_selection import StratifiedShuffleSplit

    steps = []
    param_grid = {}

    if isPloy == True:
        steps.append(('polynomialfeatures', PolynomialFeatures()))
        param_grid['polynomialfeatures__degree'] = [1, 2]

        # NOTE(review): the feature union is only reachable when isPloy is set.
        # Kept as in the original; confirm whether the nesting is intentional.
        if isFeatureUnion == True:
            steps.append(('features', FeatureUnion([
                ('pca', PCA(n_components=3)),
                ('univ_select', SelectKBest(k=10)),
            ])))

    if isScaler == 'STANDARD':
        steps.append(('standardscaler', StandardScaler()))
    else:
        steps.append(('minmaxscaler', MinMaxScaler()))

    if model == 'SVC':
        steps.append(('svc', SVC(random_state=0, C=100)))
        param_grid.update({
            'svc__C': [0.1, 1, 10, 100],
            'svc__gamma': [0.001, 0.01, 0.05, 0.1, 1, 10],
        })
    elif model == 'XGB':
        steps.append(('xgb', XGBClassifier(random_state=0, objective='binary:logistic')))
        param_grid.update({
            'xgb__n_estimators': [300, 500],
            'xgb__learning_rate': [0.001, 0.01],
            'xgb__subsample': [0.5, 1],
            'xgb__max_depth': [5, 6],
            # colsample_bytree is a fraction in (0, 1]; the former 1.24
            # candidate could never be fitted.
            'xgb__colsample_bytree': [0.97, 1.0],
            'xgb__min_child_weight': [1, 2],
            'xgb__gamma': [0.001, 0.005],
            'xgb__nthread': [3, 4],
            'xgb__reg_lambda': [0.5, 1.0],
            'xgb__reg_alpha': [0.01, 0.1],
        })
    elif model == 'LGBM':
        # Gradient boosting decision tree (GBDT) as implemented by LightGBM.
        steps.append(('lgbm', LGBMClassifier(random_state=0, boosting_type='gbdt', objective='binary', metric='auc')))
        # NOTE(review): num_boost_round / lambda_l1 / lambda_l2 look like
        # LightGBM aliases of n_estimators / reg_alpha / reg_lambda, so this
        # grid probably searches duplicated settings - confirm before pruning.
        param_grid.update({
            'lgbm__max_depth': [50, 100],
            'lgbm__learning_rate': [0.01, 0.05],
            'lgbm__num_leaves': [150, 200],
            'lgbm__n_estimators': [300, 400],
            'lgbm__num_boost_round': [4000, 5000],
            'lgbm__subsample': [0.5, 1],
            'lgbm__reg_alpha': [0.01, 0.1],
            'lgbm__reg_lambda': [0.01, 0.1],
            'lgbm__min_data_in_leaf': [20, 30],
            'lgbm__lambda_l1': [0.01, 0.1],
            'lgbm__lambda_l2': [0.01, 0.1],
        })
    else:
        # Without a classifier step the search cannot produce a meaningful result.
        raise ValueError("unsupported model {!r}; expected 'SVC', 'XGB' or 'LGBM'".format(model))

    pipe = Pipeline(steps=steps)
    cv = StratifiedShuffleSplit(n_splits=nFolds, test_size=testSize, random_state=0)

    if searchCV == 'RANDOM':
        # error_score must not beat real scores: the former value 3 ranked
        # failed fits above every valid candidate. NaN ranks them last.
        search = RandomizedSearchCV(pipe, param_distributions=param_grid, n_iter=nIter, scoring=scoring,
                                    error_score=np.nan, verbose=verbose, n_jobs=nJobs, cv=cv, random_state=0)
    else:
        search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=nJobs, scoring=scoring, verbose=verbose, cv=cv)

    search.fit(X_data, y_data)

    return search.best_params_

In [15]:
def bestGBDTNextModel(model, isKfold, nfold, searchCV, Xtrain, ytrain, Xtest, ytest, nIter, scoring, errScore, verbose, nJobs):
    """Search hyper-parameters for a boosted-tree classifier and report its scores.

    The grids are deliberately small to keep the search affordable.

    Args:
        model: 'LGBM' tunes an LGBMClassifier, 'XGB' an XGBClassifier. Any other
            value keeps the default XGBClassifier with an empty grid.
        isKfold: False -> StratifiedShuffleSplit (test_size=0.2); otherwise a
            shuffled KFold. Both use ``nfold`` splits and ``random_state=0``.
        nfold: number of splits.
        searchCV: 'GRID' selects GridSearchCV (slow); anything else
            RandomizedSearchCV.
        Xtrain, ytrain: data the search is fitted on.
        Xtest, ytest: hold-out data scored with the refitted best estimator.
        nIter: number of sampled candidates (random search only).
        scoring, verbose, nJobs: forwarded to the search object.
        errScore: ``error_score`` for the random search.

    Returns:
        ``best_params_`` of the fitted search. The cross-validated best score,
        the hold-out test score and the best estimator are printed.
    """
    # Not provided by the notebook's import cell, so import it where it is used.
    from sklearn.model_selection import StratifiedShuffleSplit

    param_grid = {}

    if model == 'LGBM':
        # Gradient boosting decision tree (GBDT) as implemented by LightGBM.
        classifier = LGBMClassifier(random_state=0, boosting_type='gbdt', objective='binary', metric='auc')
        # NOTE(review): num_boost_round / lambda_l1 / lambda_l2 look like
        # LightGBM aliases of n_estimators / reg_alpha / reg_lambda, so this
        # grid probably searches duplicated settings - confirm before pruning.
        param_grid.update({
            'max_depth': [50, 100],
            'learning_rate': [0.01, 0.05],
            'num_leaves': [150, 200],
            'n_estimators': [300, 400],
            'num_boost_round': [4000, 5000],
            'subsample': [0.5, 1],
            'reg_alpha': [0.01, 0.1],
            'reg_lambda': [0.01, 0.1],
            'min_data_in_leaf': [20, 30],
            'lambda_l1': [0.01, 0.1],
            'lambda_l2': [0.01, 0.1],
        })
    else:
        classifier = XGBClassifier(random_state=0, objective='binary:logistic')
        if model == 'XGB':
            param_grid.update({
                'n_estimators': [300, 500],
                'learning_rate': [0.001, 0.01],
                'subsample': [0.5, 1],
                'max_depth': [5, 6],
                # colsample_bytree is a fraction in (0, 1]; the former 1.24
                # candidate could never be fitted.
                'colsample_bytree': [0.97, 1.0],
                'min_child_weight': [1, 2],
                'gamma': [0.001, 0.005],
                'nthread': [3, 4],
                'reg_lambda': [0.5, 1.0],
                'reg_alpha': [0.01, 0.1],
            })

    if isKfold == False:
        cv = StratifiedShuffleSplit(n_splits=nfold, test_size=0.2, random_state=0)
    else:
        cv = KFold(n_splits=nfold, shuffle=True, random_state=0)

    if searchCV == 'GRID':
        # Exhaustive search: noticeably slower than the random search.
        search = GridSearchCV(classifier, param_grid=param_grid, n_jobs=nJobs, scoring=scoring, verbose=verbose, cv=cv)
    else:
        # Seeded so that the sampled candidates are the same on every run.
        search = RandomizedSearchCV(classifier, param_distributions=param_grid, n_iter=nIter, scoring=scoring,
                                    error_score=errScore, verbose=verbose, n_jobs=nJobs, cv=cv, random_state=0)

    search.fit(Xtrain, ytrain)
    test_score = search.score(Xtest, ytest)

    print("{} grid_.best_score {}".format(model, np.round(search.best_score_, 3)))
    # This line reports the hold-out score; it used to be labelled best_score too.
    print("{} grid_.test_score {}".format(model, np.round(test_score, 3)))
    print("{} best_estimator {}".format(model, search.best_estimator_))

    return search.best_params_

In [16]:
def drawSMOTETomek(X_org, y_org, X_pca, y_resampled, title1, title2, xlim, ylim, xticks, yticks):
    """Draw two side-by-side scatter plots of class 0 / class 1 samples.

    The left panel shows (X_org, y_org) and the right panel shows
    (X_pca, y_resampled). Both inputs are indexed as ``X[mask, column]``, so
    they must support that style of 2-D boolean-mask indexing; only the first
    two columns are plotted.

    Args:
        X_org, y_org: points and labels for the left panel.
        X_pca, y_resampled: points and labels for the right panel.
        title1, title2: titles of the left / right panel.
        xlim, ylim: axis limits applied to both panels, e.g. [-1, 10] / [-5, 20].
        xticks, yticks: tick positions applied to both panels,
            e.g. range(-1, 10) / range(-5, 20).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    class_labels = ((0, "Class #0"), (1, "Class #1"))

    figure, panels = plt.subplots(1, 2, figsize=(18, 8))
    left_ax, right_ax = panels

    # The handles of the left panel feed the shared figure legend below.
    legend_handles = tuple(
        left_ax.scatter(X_org[y_org == cls, 0], X_org[y_org == cls, 1], label=text, alpha=0.5)
        for cls, text in class_labels
    )
    left_ax.set_title(title1)

    for cls, text in class_labels:
        right_ax.scatter(X_pca[y_resampled == cls, 0], X_pca[y_resampled == cls, 1], label=text, alpha=0.5)
    right_ax.set_title(title2)

    # Cosmetics shared by both panels: hide the top/right spines, push the
    # remaining ones outward and apply the common limits and ticks.
    for panel in (left_ax, right_ax):
        for side in ('top', 'right'):
            panel.spines[side].set_visible(False)
        panel.get_xaxis().tick_bottom()
        panel.get_yaxis().tick_left()
        for side in ('left', 'bottom'):
            panel.spines[side].set_position(('outward', 10))
        panel.set_xlim(xlim)
        panel.set_ylim(ylim)
        panel.set_xticks(xticks)
        panel.set_yticks(yticks)

    plt.figlegend(legend_handles, ('Class #0', 'Class #1'), loc='lower center', ncol=2, labelspacing=0.)
    plt.tight_layout(pad=3)
    plt.show()