In [1]:
# Data Wrangling
import pandas as pd

from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
%matplotlib inline
rc('font', family='malgun gothic')
import seaborn as sns
%matplotlib inline

# EDA

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from random import uniform
from scipy.stats import gmean
from scipy.stats import randint

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import ExtraTreesClassifier
# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from sklearn.decomposition import PCA
import datetime

In [2]:
# Read Data
tr = pd.read_csv("data/trainset.csv")
test = pd.read_csv("data/testset_final.csv")
# Keep only the columns that also exist in the test set. The explicit .copy()
# makes `train` an independent frame, so adding the target column below is a
# plain column insert rather than a chained assignment on a slice of `tr`
# (which raises SettingWithCopyWarning; warnings are silenced in the import cell).
train = tr[test.columns].copy()
train['매칭성공여부'] = tr['매칭성공여부']

In [3]:
# Columns that exist in train but not in test, and therefore cannot be used as features
set(tr.columns).difference(test.columns)

{'고객ID', '매니저ID', '매니저주소', '매니저최초가입일', '매니저최초서비스일', '매칭성공여부'}

In [4]:
# Column names, dtypes and non-null counts of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23009 entries, 0 to 23008
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEQ         23009 non-null  object 
 1   접수일         23009 non-null  object 
 2   접수시각        3113 non-null   object 
 3   장기서비스여부     23009 non-null  int64  
 4   최초서비스일      23009 non-null  object 
 5   전체회차        23009 non-null  int64  
 6   현재회차        23009 non-null  int64  
 7   서비스일자       23009 non-null  object 
 8   서비스시작시간     23009 non-null  object 
 9   서비스종료시간     23009 non-null  object 
 10  기존고객여부      23009 non-null  int64  
 11  결재형태        23009 non-null  object 
 12  서비스주소       23009 non-null  object 
 13  주거형태        23009 non-null  object 
 14  평수          13856 non-null  object 
 15  고객가입일       23009 non-null  object 
 16  반려동물        20234 non-null  object 
 17  부재중여부       13856 non-null  float64
 18  우선청소        10979 non-null  object 
 19  쿠폰사용여부      23009 non-nul

In [5]:
# Column names, dtypes and non-null counts of the test frame
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4244 entries, 0 to 4243
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEQ         4244 non-null   object 
 1   접수일         4244 non-null   object 
 2   접수시각        900 non-null    object 
 3   장기서비스여부     4244 non-null   int64  
 4   최초서비스일      4244 non-null   object 
 5   전체회차        4244 non-null   int64  
 6   현재회차        4244 non-null   int64  
 7   서비스일자       4244 non-null   object 
 8   서비스시작시간     4244 non-null   object 
 9   서비스종료시간     4244 non-null   object 
 10  기존고객여부      4244 non-null   int64  
 11  결재형태        4238 non-null   object 
 12  서비스주소       4244 non-null   object 
 13  주거형태        4244 non-null   object 
 14  평수          2946 non-null   object 
 15  고객가입일       4084 non-null   object 
 16  반려동물        3868 non-null   object 
 17  부재중여부       2946 non-null   float64
 18  우선청소        2381 non-null   object 
 19  쿠폰사용여부      4244 non-null  

In [6]:
# 데이터 전처리 과정
def preprocess(train):
    """Fill missing values and derive time, date, region and round features.

    The work is done on a copy, so the frame passed in is left untouched;
    callers must use the returned frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw records. Despite the parameter name, the notebook passes both the
        train and the test frame through this function. The columns read are:
        CS교육이수여부, 청소교육이수여부, 부재중여부, 결재형태, 평수, 반려동물,
        매니저이동방법, 우선청소, 고객가입일, 서비스종료시간, 서비스시작시간,
        접수일, 최초서비스일, 매니저생년월일, 서비스일자, 서비스주소,
        근무가능지역, 전체회차, 현재회차, 부재중서비스가능여부.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (several of them overwritten)
        plus the derived feature columns.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.
    """
    # Copy first: the original version wrote every new column straight into
    # the caller's frame, which is a hidden side effect and a chained
    # assignment when the caller passes a slice of another frame.
    train = train.copy()

    def compact_date(value):
        """Turn 'YYYY-MM-DD' into 'YYYYMMDD' (first three '-' separated parts)."""
        parts = value.split('-')
        return parts[0] + parts[1] + parts[2]

    # Preprocessing: replace missing values with explicit sentinel values
    fill_values = {
        'CS교육이수여부': 2,
        '청소교육이수여부': 2,
        '부재중여부': 100,
        '결재형태': '미수',
        '평수': '모름',
        '반려동물': '모름',
        '매니저이동방법': '모름',
        '우선청소': '없음',
        '고객가입일': '2020-0-0',
    }
    for column, sentinel in fill_values.items():
        train[column] = train[column].fillna(sentinel)

    # Make Feature: service start/end hour and the span between them
    train['endtime'] = train['서비스종료시간'].apply(lambda x: int(x.split(':')[0]))
    train['starttime'] = train['서비스시작시간'].apply(lambda x: int(x.split(':')[0]))
    train['betweentime'] = train['endtime'] - train['starttime']

    # Request date: 'YYYY-MM' part, day part, then the compact date itself
    train['접수월'] = train['접수일'].apply(lambda x: x[:7])
    train['접수일일'] = train['접수일'].apply(lambda x: x[8:])
    train['접수일'] = train['접수일'].apply(compact_date)

    # First service date: 'YYYY-MM' part and day part
    train['고객서비스월'] = train['최초서비스일'].apply(lambda x: x[:7])
    train['고객서비스일'] = train['최초서비스일'].apply(lambda x: x[8:])

    # Customer sign-up date; non-string values are passed through unchanged
    train['고객가입월'] = train['고객가입일'].apply(lambda x: str(x)[:7])
    train['고객가입일일'] = train['고객가입일'].apply(lambda x: str(x)[8:])
    train['고객가입일'] = train['고객가입일'].apply(
        lambda x: compact_date(x) if isinstance(x, str) else x
    )

    # Manager birth year (first four characters) and its third character,
    # i.e. the decade digit of a four-digit year
    train['매니저생년월일'] = train['매니저생년월일'].apply(lambda x: str(x)[:4])
    train['나이대'] = train['매니저생년월일'].apply(lambda x: x[2])

    train['서비스일자'] = train['서비스일자'].apply(compact_date)
    train['최초서비스일'] = train['최초서비스일'].apply(compact_date)

    # Service address: characters 0-1 and 3-4 are used as the two region keys
    train['ser_add_dae'] = train['서비스주소'].apply(lambda x: x[:2])
    train['ser_add_so'] = train['서비스주소'].apply(lambda x: x[3:5])
    # Merge the paired cities so the key can equal the combined labels below
    train['ser_add_so'] = train['ser_add_so'].apply(lambda x: '천안/아산' if x in ['천안', '아산'] else x)
    train['ser_add_so'] = train['ser_add_so'].apply(lambda x: '원주/춘천' if x in ['원주', '춘천'] else x)
    # Does the manager's workable region equal either region key?
    train['so_가능'] = train['ser_add_so'] == train['근무가능지역']
    train['dae_가능'] = train['ser_add_dae'] == train['근무가능지역']
    train['지역_가능'] = train['so_가능'].astype(int) + train['dae_가능'].astype(int)

    # Remaining rounds and a flag for "this is the last round"
    train['잔여회차'] = train['전체회차'] - train['현재회차']
    train['종료여부'] = train['잔여회차'].apply(lambda x: 1 if x == 0 else 0)

    # Sum of the customer's absence flag and the manager's absence-service flag
    train['부재중청소'] = train['부재중여부'] + train['부재중서비스가능여부']

    return train

In [7]:
# Run the identical preprocessing on both the train and the test frame
train, test = preprocess(train), preprocess(test)

In [8]:
# Split the columns into categorical (obb) and numeric (ff) groups
obb = [
    '장기서비스여부', '기존고객여부', '결재형태', '서비스주소', '주거형태',
    '평수', '반려동물', '부재중여부', '우선청소', '쿠폰사용여부',
    '매니저사용휴대폰', '매니저성별', '매니저이동방법', '근무가능지역',
    'CS교육이수여부', '청소교육이수여부', '부재중서비스가능여부', '추천인여부',
    'endtime', 'starttime', 'betweentime',
    'ser_add_dae', 'ser_add_so', 'so_가능', 'dae_가능', '지역_가능',
    '접수월', '접수일일', '고객서비스월', '고객서비스일',
    '고객가입월', '고객가입일일', '나이대', '부재중청소',
    '잔여회차', '종료여부', '매니저생년월일', '고객가입일',
]
ff = [
    '전체회차',
    '현재회차',
    '서비스일자',
    '접수일',
    '최초서비스일',
]

In [9]:
def _cast_feature_groups(frame):
    """Return the `obb` columns as strings next to the `ff` columns as integers."""
    as_text = frame[obb].astype(str)
    as_number = frame[ff].astype(int)
    return pd.concat([as_text, as_number], axis=1)


train = _cast_feature_groups(train)
test = _cast_feature_groups(test)

In [10]:
# Re-inspect the training frame after the dtype cast
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23009 entries, 0 to 23008
Data columns (total 43 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   장기서비스여부      23009 non-null  object
 1   기존고객여부       23009 non-null  object
 2   결재형태         23009 non-null  object
 3   서비스주소        23009 non-null  object
 4   주거형태         23009 non-null  object
 5   평수           23009 non-null  object
 6   반려동물         23009 non-null  object
 7   부재중여부        23009 non-null  object
 8   우선청소         23009 non-null  object
 9   쿠폰사용여부       23009 non-null  object
 10  매니저사용휴대폰     23009 non-null  object
 11  매니저성별        23009 non-null  object
 12  매니저이동방법      23009 non-null  object
 13  근무가능지역       23009 non-null  object
 14  CS교육이수여부     23009 non-null  object
 15  청소교육이수여부     23009 non-null  object
 16  부재중서비스가능여부   23009 non-null  object
 17  추천인여부        23009 non-null  object
 18  endtime      23009 non-null  object
 19  starttime    23009 non-nu

# 군집화

In [11]:
# Separate the columns used for manager clustering from those used for customer clustering

cus_col = [
    '장기서비스여부', '전체회차', '현재회차',
    '기존고객여부', '결재형태', '서비스주소', '주거형태', '평수',
    '반려동물', '부재중여부', '우선청소', '쿠폰사용여부',
    '부재중서비스가능여부', '추천인여부',
    'endtime', 'starttime', 'betweentime',
    '접수월', '접수일일', '고객서비스월', '고객서비스일',
    '고객가입월', '고객가입일일', '잔여회차',
]

man_col = [
    '매니저성별', '매니저사용휴대폰', '매니저이동방법', '근무가능지역',
    'CS교육이수여부', '청소교육이수여부',
    'ser_add_dae', 'ser_add_so',
    'so_가능', 'dae_가능', '지역_가능',
    '나이대',
]

In [12]:
class KMeansFeaturizer:
    """Transform numeric data into k-means cluster membership.

    The transformer runs k-means on the input data and maps every data point
    to the id of its closest cluster. If a target variable is supplied, it is
    scaled and appended to the k-means input so that points with a similar
    target are grouped together and the clusters tend to follow the
    classification boundary.

    Parameters
    ----------
    k : int, default 100
        Number of clusters.
    target_scale : float, default 5.0
        Factor the target is multiplied by before it is appended to the data.
    random_state : int or None, default None
        Passed to the KMeans models that use the default initialisation.

    Attributes set by ``fit``
    -------------------------
    km_model : the fitted KMeans model used by ``transform``
    cluster_centers_ : centroids of ``km_model``
    inertia_ : inertia of ``km_model``
    """

    def __init__(self, k = 100, target_scale = 5.0, random_state = None):
        self.k = k
        self.target_scale = target_scale
        self.random_state = random_state

    def fit(self, X, y = None):
        """Run k-means on the input data and store the centroids.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like with one value per sample, optional
            Target variable. A NumPy array, a pandas Series or a plain list
            are all accepted.

        Returns
        -------
        self
        """
        if y is None:
            # No target: plain k-means
            km_model = KMeans(n_clusters = self.k, n_init = 20, random_state = self.random_state)
            km_model.fit(X)
        else:
            # With a target: scale it and append it as an extra column.
            # np.asarray keeps this working for a pandas Series or a list;
            # the former `y[:, np.newaxis]` only works on NumPy arrays.
            scaled_target = np.asarray(y).reshape(-1, 1) * self.target_scale
            data_with_target = np.hstack((X, scaled_target))

            # Pre-train k-means on data + target
            km_model_pretrain = KMeans(n_clusters = self.k, n_init = 20, random_state = self.random_state)
            km_model_pretrain.fit(data_with_target)

            # Second k-means in the original space (without the target),
            # initialised with the pre-trained centroids minus their target
            # coordinate. One iteration recomputes the cluster assignment and
            # the centroids.
            initial_centers = km_model_pretrain.cluster_centers_[:, :-1]
            km_model = KMeans(n_clusters = self.k, init = initial_centers, n_init = 1, max_iter = 1)
            km_model.fit(X)

        self.inertia_ = km_model.inertia_
        self.km_model = km_model
        self.cluster_centers_ = km_model.cluster_centers_
        return self

    def transform(self, X, y = None):
        """Return the id of the closest cluster for every input row.

        The result is a column vector of shape (n_samples, 1). ``y`` is
        accepted for API symmetry and ignored. ``fit`` must have been called
        first, otherwise ``self.km_model`` does not exist.
        """
        clusters = self.km_model.predict(X)
        return clusters[:, np.newaxis]

    def fit_transform(self, X, y = None):
        """Fit on ``X`` (and optionally ``y``), then return the cluster ids of ``X``."""
        self.fit(X, y)
        return self.transform(X, y)

In [13]:
# Featurizer with 150 clusters and a fixed random_state (forwarded to KMeans inside fit)
km = KMeansFeaturizer(k=150,random_state = 20182830)

In [14]:
# Customer feature block with every column cast to string, so that the one-hot
# encoding step treats all of them, including the integer round counts, as
# categorical.
# .astype on the selection builds a new frame in one step. The former
# per-column `.loc[:, [col]] = ...astype("str")` loop wrote into a slice of
# `train`/`test` and relied on an in-place assignment changing the dtype.
cus_train = train[cus_col].astype("str")
cus_test = test[cus_col].astype("str")

In [15]:
# Manager feature block with every column cast to string.
# .astype on the selection builds a new frame in one step. The former
# per-column `.loc[:, [col]] = ...astype("str")` loop wrote into a slice of
# `train`/`test` and relied on an in-place assignment changing the dtype.
man_train = train[man_col].astype("str")
man_test = test[man_col].astype("str")

In [16]:
# One-hot encoding of the four feature blocks
cus_train, cus_test, man_train, man_test = (
    pd.get_dummies(block)
    for block in (cus_train, cus_test, man_train, man_test)
)

In [17]:
# Customer dummy columns that exist in train but not in test
cus_only_in_train = [name for name in cus_train.columns if name not in cus_test]
for name in cus_only_in_train:
    print(name)

서비스주소_경남 창원시
서비스주소_부산 동래구
endtime_22
endtime_23
endtime_3
starttime_0
betweentime_7


In [18]:
# Add every dummy column that exists in train but not in test as an all-zero
# column. The names are computed from the two frames instead of being copied
# by hand from the printout above, so the cell keeps working when the data
# (and therefore the set of missing categories) changes. Columns are added in
# the order they appear in cus_train.
for column in cus_train.columns:
    if column not in cus_test.columns:
        cus_test[column] = 0

In [19]:
# Multi-valued entries: fold each combined pet dummy into its single-pet dummies
combined_pets = {
    "반려동물_개,고양이": ("반려동물_고양이", "반려동물_개"),
    "반려동물_개,기타": ("반려동물_개", "반려동물_기타"),
    "반려동물_고양이,기타": ("반려동물_고양이", "반려동물_기타"),
}
for combined_col, single_cols in combined_pets.items():
    # The combined column itself is never written to, so one mask serves both targets
    has_combination = cus_test[combined_col] == 1
    for single_col in single_cols:
        cus_test.loc[has_combination, [single_col]] = 1

# The combined columns are no longer needed once folded in
cus_test = cus_test.drop(columns=list(combined_pets))

# Same columns in the same order as the training frame
cus_test = cus_test[cus_train.columns]

In [20]:
# Manager dummy columns that exist in train but not in test
man_only_in_train = [name for name in man_train.columns if name not in man_test]
for name in man_only_in_train:
    print(name)

ser_add_dae_경남
ser_add_so_동래
ser_add_so_창원


In [21]:
# Add every dummy column that exists in train but not in test as an all-zero
# column. The names are computed from the two frames instead of being copied
# by hand from the printout above, so the cell keeps working when the data
# changes. Columns are added in the order they appear in man_train.
for column in man_train.columns:
    if column not in man_test.columns:
        man_test[column] = 0

In [22]:
# Same columns in the same order as the manager training frame
man_test = man_test.loc[:, man_train.columns]

In [23]:
# k-means clustering
# The single `km` object is fitted twice, so statement order matters: the
# customer test rows must be transformed before `km` is refitted on the
# manager features, because fit() replaces km.km_model.
cus_train_kmean = km.fit_transform(cus_train)
cus_test_kmean = km.transform(cus_test)

man_train_kmean = km.fit_transform(man_train)
man_test_kmean = km.transform(man_test)

In [24]:
# Put the customer and manager cluster ids side by side, one frame per data set.
# Each k-means result is a single-column array, so column 0 holds the ids.
kmean_train = pd.DataFrame({
    "cus": cus_train_kmean[:, 0],
    "man": man_train_kmean[:, 0],
})

kmean_test = pd.DataFrame({
    "cus": cus_test_kmean[:, 0],
    "man": man_test_kmean[:, 0],
})

In [25]:
# Cast the customer and manager cluster ids to strings
for cluster_frame in (kmean_train, kmean_test):
    for id_col in ("cus", "man"):
        cluster_frame[id_col] = cluster_frame[id_col].astype("str")

In [26]:
# Combine the customer cluster id and the manager cluster id into one label
for cluster_frame in (kmean_train, kmean_test):
    cus_label = cluster_frame["cus"].astype("str")
    man_label = cluster_frame["man"].astype("str")
    cluster_frame["cus_man"] = cus_label + "_" + man_label

#  -----------------------------------------------------------------------

In [27]:
# Manager-side columns of the training frame
manager_columns = [
    '매니저성별', '매니저이동방법', '근무가능지역', '부재중서비스가능여부',
    '매니저생년월일', 'CS교육이수여부', '청소교육이수여부',
    'so_가능', 'dae_가능', '지역_가능', '부재중청소', '나이대',
]
train_manager = train[manager_columns]

In [28]:
# Manager-side columns of the test frame
manager_columns = [
    '매니저성별', '매니저이동방법', '근무가능지역', '부재중서비스가능여부',
    '매니저생년월일', 'CS교육이수여부', '청소교육이수여부',
    'so_가능', 'dae_가능', '지역_가능', '부재중청소', '나이대',
]
test_manager = test[manager_columns]

In [29]:
# Customer-side columns of the training frame
customer_columns = [
    "장기서비스여부", '기존고객여부', '결재형태',
    '서비스주소', '주거형태',
    'endtime', 'starttime', 'betweentime',
    'ser_add_dae', 'ser_add_so',
    '접수월', '접수일일', '잔여회차', '종료여부', '고객가입일',
    '고객서비스월', '고객서비스일', '고객가입월', '고객가입일일',
    '평수', '반려동물', '우선청소', '부재중여부',
]
train_customer = train[customer_columns]

In [30]:
# Customer-side columns of the test frame
customer_columns = [
    "장기서비스여부", '기존고객여부', '결재형태',
    '서비스주소', '주거형태',
    'endtime', 'starttime', 'betweentime',
    'ser_add_dae', 'ser_add_so',
    '접수월', '접수일일', '잔여회차', '종료여부', '고객가입일',
    '고객서비스월', '고객서비스일', '고객가입월', '고객가입일일',
    '평수', '반려동물', '우선청소', '부재중여부',
]
test_customer = test[customer_columns]

In [31]:
# Plain copies of the manager frames. Despite the `_dummy` suffix no one-hot
# encoding is applied here (the pd.get_dummies call was commented out).
train_manager_dummy, test_manager_dummy = train_manager.copy(), test_manager.copy()

In [32]:
# Print any manager column that exists in train but not in test
absent_manager_cols = [name for name in train_manager_dummy.columns if name not in test_manager_dummy]
for name in absent_manager_cols:
    print(name)

# Same columns in the same order as the training frame
test_manager_dummy = test_manager_dummy[train_manager_dummy.columns]

In [33]:
# Plain copies of the customer frames. Despite the `_dummy` suffix no one-hot
# encoding is applied here (the pd.get_dummies call was commented out).
train_customer_dummy, test_customer_dummy = train_customer.copy(), test_customer.copy()

In [34]:
# Customer columns, manager columns and the numeric `ff` columns side by side
train_pieces = [train_customer_dummy, train_manager_dummy, train[ff]]
test_pieces = [test_customer_dummy, test_manager_dummy, test[ff]]
train_na = pd.concat(train_pieces, axis="columns")
test_na = pd.concat(test_pieces, axis="columns")

In [35]:
# Append the cluster-id columns to the right of the feature frames
train_na, test_na = (
    pd.concat([features, clusters], axis="columns")
    for features, clusters in ((train_na, kmean_train), (test_na, kmean_test))
)

In [36]:
# Define the feature matrix and the class label
X, y = train_na, tr['매칭성공여부']

In [37]:
# Column names, dtypes and non-null counts of the final feature matrix
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23009 entries, 0 to 23008
Data columns (total 43 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   장기서비스여부      23009 non-null  object
 1   기존고객여부       23009 non-null  object
 2   결재형태         23009 non-null  object
 3   서비스주소        23009 non-null  object
 4   주거형태         23009 non-null  object
 5   endtime      23009 non-null  object
 6   starttime    23009 non-null  object
 7   betweentime  23009 non-null  object
 8   ser_add_dae  23009 non-null  object
 9   ser_add_so   23009 non-null  object
 10  접수월          23009 non-null  object
 11  접수일일         23009 non-null  object
 12  잔여회차         23009 non-null  object
 13  종료여부         23009 non-null  object
 14  고객가입일        23009 non-null  object
 15  고객서비스월       23009 non-null  object
 16  고객서비스일       23009 non-null  object
 17  고객가입월        23009 non-null  object
 18  고객가입일일       23009 non-null  object
 19  평수           23009 non-nu

In [38]:
# Save Data
outputs = (
    (X, 'X.csv'),
    (y, 'y.csv'),
    (test_na, 'test.csv'),
)
for table, csv_path in outputs:
    table.to_csv(csv_path, index=False)