In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.datasets import make_classification

train = pd.read_csv('C:/Users/Administrator/Desktop/개인공부자료/정형데이터분석/bank.csv')
train_copy = train.copy() 

In [164]:
train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11158,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11159,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11160,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no


## EDA - target variable

In [165]:
# Distribution of the Target Column
train['deposit'].value_counts()

no     5873
yes    5289
Name: deposit, dtype: int64

## 결측값 계산

In [166]:
# 결측치 비율 정의
missing = 100 * train.isnull().sum() / len(train)

In [167]:
# 내림차순 정렬 & 상위 5개 변수
missing.sort_values(ascending=False).head(5).round(1)

deposit      0.0
loan         0.0
job          0.0
marital      0.0
education    0.0
dtype: float64

결측치를 처리하는 법
- XGBoost 사용 - can handle missing values with no need for imputation  
   (training할때 결측값을 어느 쪽의 node로 보낼지 판단함. 어느쪽으로 보내는게 loss를 최소화 할지 스스로 판단)  
   
- 결측값의 비율이 높은 column을 drop. 근데 어느 column들이 모델에 helpful할지 미리 알 수가 없으므로 여기서는 우선 모든 columns을 keep하는 방식

##  Column Types

In [168]:
# Number of each type of column
train.dtypes.value_counts()

object    10
int64      7
dtype: int64

In [169]:
# Number of unique classes i|n each object column
train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

job          12
marital       3
education     4
default       2
housing       2
loan          2
contact       3
month        12
poutcome      4
deposit       2
dtype: int64

## Encoding Categorical Variables
- 2개의 값으로만 구성되어 있는 feature는 Label Encoding을 하고, 2개 이상의 값으로 구성되어 있는 feature는 One-Hot Encoding  
- 2개 이상의 feature에 Label Encoding을 하면, 라벨을 부여하는 방식이 임의적이라는 문제 &   1, 2, 3, 4… 는 단지 범주를 구분하기 위한 건데 실제 머신러닝 모델을 돌리면 모델이 이 숫자의 크기를 간주하는 문제 (즉 ‘4가 1보다 4배 크다’라는 식으로)   
--> **따라서 2개 이상의 feature를 가진 변수는 One-Hot Encoding 을 함**    
- 어떤 방식이 더 효율적인지에 대해서는 아직도 논쟁 중  

- The only downside to one-hot encoding is that the number of features (dimensions of the data) can explode with categorical variables with many categories. To deal with this, we can perform one-hot encoding followed by PCA or other dimensionality reduction methods to reduce the number of dimensions (while still trying to preserve information).

### 1. Categorical var중에서도 범주가 2개인 변수는 label encoding만 적용 

In [170]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in train:
    # categorical var만 대상으로 함
    if train[col].dtype == 'object':
        # category(범주)가 2개 이하인 경우에만 label encoding 적용
        if len(list(train[col].unique())) <= 2:
            # Train on the training data
            le.fit(train[col])
            train[col] = le.transform(train[col])
          
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


In [171]:
# category가 2개인 변수들(총 4개)에 대해 label encoding이 수행됨을 알 수 있음
train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,0,2343,1,0,unknown,5,may,1042,1,-1,0,unknown,1
1,56,admin.,married,secondary,0,45,0,0,unknown,5,may,1467,1,-1,0,unknown,1
2,41,technician,married,secondary,0,1270,1,0,unknown,5,may,1389,1,-1,0,unknown,1
3,55,services,married,secondary,0,2476,1,0,unknown,5,may,579,1,-1,0,unknown,1
4,54,admin.,married,tertiary,0,184,0,0,unknown,5,may,673,2,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,0,1,1,0,cellular,20,apr,257,1,-1,0,unknown,0
11158,39,services,married,secondary,0,733,0,0,unknown,16,jun,83,4,-1,0,unknown,0
11159,32,technician,single,secondary,0,29,0,0,cellular,19,aug,156,2,-1,0,unknown,0
11160,43,technician,married,secondary,0,0,0,1,cellular,8,may,9,2,172,5,failure,0


In [172]:
train_copy

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11158,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11159,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11160,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no


### 2. Categorical var중에서도 범주가 2개 이상인 변수(나머지 변수)는 one-hot-encoding 적용

In [173]:
# one-hot encoding of categorical variables
train = pd.get_dummies(train)

print('Training Features shape: ', train.shape)

Training Features shape:  (11162, 49)


In [174]:
train

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,59,0,2343,1,0,5,1042,1,-1,0,...,0,0,1,0,0,0,0,0,0,1
1,56,0,45,0,0,5,1467,1,-1,0,...,0,0,1,0,0,0,0,0,0,1
2,41,0,1270,1,0,5,1389,1,-1,0,...,0,0,1,0,0,0,0,0,0,1
3,55,0,2476,1,0,5,579,1,-1,0,...,0,0,1,0,0,0,0,0,0,1
4,54,0,184,0,0,5,673,2,-1,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,0,1,1,0,20,257,1,-1,0,...,0,0,0,0,0,0,0,0,0,1
11158,39,0,733,0,0,16,83,4,-1,0,...,1,0,0,0,0,0,0,0,0,1
11159,32,0,29,0,0,19,156,2,-1,0,...,0,0,0,0,0,0,0,0,0,1
11160,43,0,0,0,1,8,9,2,172,5,...,0,0,1,0,0,0,1,0,0,0


In [175]:
### 원핫인코딩의 또다른 방식 (파통머)

# 범주형 변수들을 label encoding / one hot encoding 해주어야함
## 교수님 방식대로 label encoder --> one hot encoder
"""
categ_columns = ['poutcome'] # 범주형 변수

def dummy(data,col):
    lab=LabelEncoder() #0~c-1로 클래스 부여
    aa=lab.fit_transform(train_copy[col]).reshape(-1,1)
    ohe=OneHotEncoder(sparse=False)
    column_names=[col+'_'+ str(i) for i in lab.classes_]
    return(pd.DataFrame(ohe.fit_transform(aa),columns=column_names))
"""

"\ncateg_columns = ['poutcome'] # 범주형 변수\n\ndef dummy(data,col):\n    lab=LabelEncoder() #0~c-1로 클래스 부여\n    aa=lab.fit_transform(train_copy[col]).reshape(-1,1)\n    ohe=OneHotEncoder(sparse=False)\n    column_names=[col+'_'+ str(i) for i in lab.classes_]\n    return(pd.DataFrame(ohe.fit_transform(aa),columns=column_names))\n"

In [176]:
"""
for column in categ_columns:
    temp_df=dummy(train_copy,column)
"""

'\nfor column in categ_columns:\n    temp_df=dummy(train_copy,column)\n'

In [177]:
temp_df

Unnamed: 0,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
11157,0.0,0.0,0.0,1.0
11158,0.0,0.0,0.0,1.0
11159,0.0,0.0,0.0,1.0
11160,1.0,0.0,0.0,0.0


이 방식은 비효율적임. 지정한 categorical변수에 대해서만 one-hot-encoding이 이루어지고 출력됨  
--> 기존 변수는 지우고 인코딩된 변수로 치환해주는 작업 필요.. & categorical변수도 찾아서 지정해줘야함

**그러므로 첫번째 방식이 낫다 (if문으로 categorical변수 알아서 찾아주고 class가 2개 이하인 것들만 labelencoding해줌)  
나머지 변수들(class가 3개 이상인 것)은 pd.get_dummies 하면 one-hot-encoding 됨**

In [178]:
train.shape

(11162, 49)

## 전체 data를 training set과 test set으로 split
- 쪼개기 전에 dataframe 형식을 array 형식으로 변환해주어야 함

In [179]:
## DF.iloc[:,0].values  --> values를 붙여주면 array형식으로 바뀜

from sklearn.model_selection import train_test_split

X = train.loc[:, train.columns != 'deposit'].values  ## target var인 deposit만 제외한 모든 변수는 X로
y = train['deposit'].values                          ## target var인 deposit은 y로


X_train, X_test, y_train,y_test = \
    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [180]:
# 표준화 (X값만)

from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

In [181]:
X_train_std

array([[ 2.66467604, -0.1259472 , -0.23711337, ..., -0.22073702,
        -0.32127781,  0.58203043],
       [-0.27367123, -0.1259472 , -0.26949743, ..., -0.22073702,
        -0.32127781,  0.58203043],
       [-0.94529346, -0.1259472 , -0.30098194, ..., -0.22073702,
        -0.32127781,  0.58203043],
       ...,
       [-0.18971845, -0.1259472 , -0.31057722, ..., -0.22073702,
         3.11257104, -1.71812322],
       [-0.35762401, -0.1259472 , -0.39363635, ..., -0.22073702,
        -0.32127781,  0.58203043],
       [ 0.90166768, -0.1259472 , -0.3819421 , ..., -0.22073702,
        -0.32127781,  0.58203043]])

### 분류 모델 정의

In [184]:
# Time for Classification Models
import time


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=18),
    "Neural Net": MLPClassifier(alpha=1),
    "Naive Bayes": GaussianNB()
}

no_classifiers = len(dict_classifiers.keys())



##### train score / cross-val score / accuracy를 데이터프레임 형식으로 출력 #####
### 단점 - test score를 못구함

def batch_classify(X_train, Y_train, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,4)), columns = ['classifier', 'train_score', 'cross val mean score', 'accuracy'])
    count = 0
    for key, classifier in dict_classifiers.items():
        cross_val = cross_val_score(classifier, X_train, Y_train, cv=3)
        cross_val_mean = cross_val.mean()
        pred = cross_val_predict(classifier, X_train, Y_train, cv=3)
        classifier.fit(X_train, Y_train)
        accuracy = accuracy_score(Y_train, pred)
        train_score = classifier.score(X_train, Y_train)
        
        df_results.loc[count,'classifier'] = key
        df_results.loc[count,'train_score'] = train_score
        df_results.loc[count,'cross val mean score'] = cross_val_mean
        df_results.loc[count,'accuracy'] = accuracy
        
        count+=1
    return df_results

In [197]:
## 표준화 한 데이터로 학습

df_results = batch_classify(X_train_std, y_train)
print(df_results.sort_values(by='train_score', ascending=False))

                     classifier  train_score  cross val mean score  accuracy
4                 Decision Tree     1.000000              0.788685  0.790989
5                 Random Forest     0.997056              0.838218  0.842954
2                    Linear SVM     0.887111              0.843594  0.843594
6                    Neural Net     0.874056              0.847434  0.849994
3  Gradient Boosting Classifier     0.863049              0.845129  0.845130
1             Nearest Neighbors     0.833739              0.745170  0.745168
0           Logistic Regression     0.830027              0.828618  0.828619
7                   Naive Bayes     0.713810              0.710867  0.710867


### XGBoost Classifier

In [200]:
# XGBoost Classifier
from xgboost import XGBClassifier
from xgboost import plot_importance

xgb = XGBClassifier(learning_rate=0.1, max_depth=4)
xgb.fit(X_train_std, y_train)

train_pred = xgb.predict(X_train_std)
test_pred = xgb.predict(X_test_std)

In [201]:
from sklearn.metrics import accuracy_score 

## train / test accuracy
print(accuracy_score(y_train, train_pred))
print(accuracy_score(y_test, test_pred))

0.8776398310508128
0.8498059122126008


In [202]:
result=cross_val_score(xgb, X_train_std, y_train , cv=10, scoring='accuracy')
print('The cross validated score for XGBoost is:',result.mean())

The cross validated score for XGBoost is: 0.8531913639474606


In [191]:
## grid search  --- 돌아가는데 좀 오래걸림 (모델이 무겁다는 뜻)

from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier()

xgb_param_grid = {
    'n_estimators' : [100, 200, 400, 600],
    'learning_rate' : [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth' : [4,6,8,12],
}

xgb_grid = GridSearchCV(xgb, param_grid = xgb_param_grid, scoring = 'accuracy')
xgb_grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

In [192]:
xgb_grid.best_score_

0.8601014333543867

In [193]:
xgb_grid.best_params_

{'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 100}

In [None]:
# plot feature importance





### LightGBM Classifier

In [207]:
from lightgbm import LGBMClassifier, plot_importance

lgb = LGBMClassifier(n_estimator=400)
lgb.fit(X_train_std, y_train)

pred_train=lgb.predict(X_train_std)
pred_test=lgb.predict(X_test_std)

## train / test accuracy
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_test, pred_test))

0.9166773326507104
0.848014332636608


In [208]:
result=cross_val_score(lgb, X_train_std, y_train , cv=10, scoring='accuracy')
print('The cross validated score for LightGBM is:',result.mean())

The cross validated score for LightGBM is: 0.8617666707054698


In [None]:
# plot feature importance



