## **회원 탈퇴 예측**
- 클러스터링 행동 분석은 사용방법에 따라 많은 가능성이 있는 기술이다.
- 행동패턴을 분석 할 수 있으면 어떤 고객이 탈퇴할지와 같은 예측도 가능
- 탈퇴 방지를 하기 위해 정책을 준비하는 것도 가능

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

In [3]:
customer = pd.read_csv('customer_join.csv')
uselog_months = pd.read_csv('use_log_months.csv')

In [4]:
len(uselog_months)

36842

In [5]:
year_months = list(uselog_months['연월'].unique())
uselog = pd.DataFrame()
for i in range(1, len(year_months)):
    tmp = uselog_months.loc[uselog_months['연월'] == year_months[i]]
    tmp.rename(columns = {'count' : 'count_0'}, inplace = True)
    tmp_before = uselog_months.loc[uselog_months['연월'] == year_months[i - 1]]
    del tmp_before['연월']
    tmp_before.rename(columns = {'count' : 'count_1'}, inplace = True)
    tmp = pd.merge(tmp, tmp_before, on = 'customer_id', how = 'left')
    uselog = pd.concat([uselog, tmp], ignore_index = True)

In [6]:
uselog

Unnamed: 0,연월,customer_id,count_0,count_1
0,201805,AS002855,5,4.0
1,201805,AS009373,4,3.0
2,201805,AS015233,7,
3,201805,AS015315,3,6.0
4,201805,AS015739,5,7.0
...,...,...,...,...
33846,201903,TS995853,8,11.0
33847,201903,TS998593,8,7.0
33848,201903,TS999079,3,2.0
33849,201903,TS999231,6,6.0


## **탈퇴 전월의 탈퇴 고객 데이터를 작성**
- 탈퇴한 월이 아닌 탈퇴 전월의 데이터를 작성한다.
- 탈퇴를 예측하는 목적은 탈퇴 방지를 위한 것.
- 해당 월에 탈퇴 신청하고 다음달 말일에 탈퇴처리함
- 탈퇴 전월로부터 탈퇴 신청확률을 예측

In [7]:
from dateutil.relativedelta import relativedelta

In [8]:
exit_customer = customer.loc[customer['is_deleted'] == 1]
exit_customer['exit_date'] = None
exit_customer['end_date'] = pd.to_datetime(exit_customer['end_date'])

In [9]:
for i in range(len(exit_customer)):
    exit_customer['exit_date'].iloc[i] = exit_customer['end_date'].iloc[i] - relativedelta(months = 1)

In [10]:
exit_customer['연월'] = exit_customer['exit_date'].dt.strftime('%Y%m')

In [11]:
exit_customer.head(3)

Unnamed: 0,customer_id,name,class,gender,start_date,end_date,campaign_id,is_deleted,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date,연월
708,TS511179,XXXXXX,C01,F,2016-05-01,2018-04-30,CA1,1,0_종일,10500,2_일반,3.0,3.0,3,3,0,2018-04-30,23,2018-03-30 00:00:00,201803
729,TS443736,XXXX,C02,M,2016-05-01,2018-04-30,CA1,1,1_주간,7500,2_일반,3.0,3.0,3,3,0,2018-04-30,23,2018-03-30 00:00:00,201803
730,HD542886,XX,C01,M,2016-05-01,2018-04-30,CA1,1,0_종일,10500,2_일반,1.0,1.0,1,1,0,2018-04-30,23,2018-03-30 00:00:00,201803


In [14]:
uselog['연월'] =  uselog['연월'].astype(str)

In [15]:
exit_uselog = pd.merge(uselog, exit_customer, on = ['customer_id', '연월'], how = 'left')

In [16]:
exit_uselog.head()

Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
0,201805,AS002855,5,4.0,,,,,NaT,,...,,,,,,,,,,
1,201805,AS009373,4,3.0,,,,,NaT,,...,,,,,,,,,,
2,201805,AS015233,7,,,,,,NaT,,...,,,,,,,,,,
3,201805,AS015315,3,6.0,,,,,NaT,,...,,,,,,,,,,
4,201805,AS015739,5,7.0,,,,,NaT,,...,,,,,,,,,,


In [17]:
len(exit_uselog)

33851

In [18]:
exit_uselog = exit_uselog.dropna(subset = ['name'])
print(len(exit_uselog))
print(len(exit_uselog['customer_id'].unique()))
exit_uselog.head()

1104
1104


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period,exit_date
19,201805,AS055680,3,3.0,XXXXX,C01,M,2018-03-01,2018-06-30,CA1,...,10500.0,2_일반,3.0,3.0,3.0,3.0,0.0,2018-06-30,3.0,2018-05-30 00:00:00
57,201805,AS169823,2,3.0,XX,C01,M,2017-11-01,2018-06-30,CA1,...,10500.0,2_일반,3.0,3.0,4.0,2.0,1.0,2018-06-30,7.0,2018-05-30 00:00:00
110,201805,AS305860,5,3.0,XXXX,C01,M,2017-06-01,2018-06-30,CA1,...,10500.0,2_일반,3.333333,3.0,5.0,2.0,0.0,2018-06-30,12.0,2018-05-30 00:00:00
128,201805,AS363699,5,3.0,XXXXX,C01,M,2018-02-01,2018-06-30,CA1,...,10500.0,2_일반,3.333333,3.0,5.0,2.0,0.0,2018-06-30,4.0,2018-05-30 00:00:00
147,201805,AS417696,1,4.0,XX,C03,F,2017-09-01,2018-06-30,CA1,...,6000.0,2_일반,2.0,1.0,4.0,1.0,0.0,2018-06-30,9.0,2018-05-30 00:00:00


## **지속회원의 데이터 작성**
- 탈퇴 회원의 데이터 수는 1104, 지속회원 데이터 수는 name의 결측치를 제거했음에도 27422개, 데이터 불균형이 있음.
- 샘플 수를 조정. 모든 기간의 회원 데이터를 사용하지 않고, 하나의 기간만 사용.

In [19]:
conti_customer = customer.loc[customer['is_deleted'] == 0]
conti_uselog = pd.merge(uselog, conti_customer, on = ['customer_id'], how = 'left')

In [20]:
print(len(conti_uselog))
conti_uselog = conti_uselog.dropna(subset = ['name'])
print(len(conti_uselog))

33851
27422


In [21]:
conti_uselog = conti_uselog.sample(frac = 1).reset_index(drop = True)
conti_uselog = conti_uselog.drop_duplicates(subset = 'customer_id')
print(len(conti_uselog))
conti_uselog.head()

2842


Unnamed: 0,연월,customer_id,count_0,count_1,name,class,gender,start_date,end_date,campaign_id,...,class_name,price,campaign_name,mean,median,max,min,routine_flg,calc_date,membership_period
0,201903,HD426886,10,8.0,XXXX,C03,M,2018-04-05,,CA3,...,2_야간,6000.0,1_입회비무료,8.416667,8.0,11.0,5.0,1.0,2019-04-30,12.0
1,201903,HI611999,8,8.0,XXXXX,C01,M,2018-11-02,,CA1,...,0_종일,10500.0,2_일반,7.4,8.0,9.0,6.0,1.0,2019-04-30,5.0
2,201811,GD639344,5,5.0,XXXXX,C02,F,2015-08-01,,CA1,...,1_주간,7500.0,2_일반,3.833333,4.5,5.0,2.0,1.0,2019-04-30,44.0
3,201810,PL505814,5,3.0,XXXX,C02,F,2016-01-01,,CA1,...,1_주간,7500.0,2_일반,4.666667,5.0,6.0,3.0,1.0,2019-04-30,39.0
4,201808,AS073168,6,6.0,XXXXX,C01,M,2017-11-01,,CA1,...,0_종일,10500.0,2_일반,6.083333,6.0,8.0,3.0,1.0,2019-04-30,17.0


In [22]:
predict_data = pd.concat([conti_uselog, exit_uselog], ignore_index = True)
print(len(predict_data))
predict_data.head()

3946


Unnamed: 0,calc_date,campaign_id,campaign_name,class,class_name,count_0,count_1,customer_id,end_date,exit_date,...,max,mean,median,membership_period,min,name,price,routine_flg,start_date,연월
0,2019-04-30,CA3,1_입회비무료,C03,2_야간,10,8.0,HD426886,,,...,11.0,8.416667,8.0,12.0,5.0,XXXX,6000.0,1.0,2018-04-05,201903
1,2019-04-30,CA1,2_일반,C01,0_종일,8,8.0,HI611999,,,...,9.0,7.4,8.0,5.0,6.0,XXXXX,10500.0,1.0,2018-11-02,201903
2,2019-04-30,CA1,2_일반,C02,1_주간,5,5.0,GD639344,,,...,5.0,3.833333,4.5,44.0,2.0,XXXXX,7500.0,1.0,2015-08-01,201811
3,2019-04-30,CA1,2_일반,C02,1_주간,5,3.0,PL505814,,,...,6.0,4.666667,5.0,39.0,3.0,XXXX,7500.0,1.0,2016-01-01,201810
4,2019-04-30,CA1,2_일반,C01,0_종일,6,6.0,AS073168,,,...,8.0,6.083333,6.0,17.0,3.0,XXXXX,10500.0,1.0,2017-11-01,201808


## **예측할 달의 재적기간 작성**

In [25]:
predict_data['period'] = 0
predict_data['now_date'] = pd.to_datetime(predict_data['연월'], format = '%Y%m')
predict_data['start_date'] = pd.to_datetime(predict_data['start_date'])
for i in range(len(predict_data)):
    delta = relativedelta(predict_data['now_date'][i], predict_data['start_date'][i])
    predict_data['period'][i] = int(delta.years * 12 + delta.months)

In [26]:
predict_data.head()

Unnamed: 0,calc_date,campaign_id,campaign_name,class,class_name,count_0,count_1,customer_id,end_date,exit_date,...,median,membership_period,min,name,price,routine_flg,start_date,연월,period,now_date
0,2019-04-30,CA3,1_입회비무료,C03,2_야간,10,8.0,HD426886,,,...,8.0,12.0,5.0,XXXX,6000.0,1.0,2018-04-05,201903,10,2019-03-01
1,2019-04-30,CA1,2_일반,C01,0_종일,8,8.0,HI611999,,,...,8.0,5.0,6.0,XXXXX,10500.0,1.0,2018-11-02,201903,3,2019-03-01
2,2019-04-30,CA1,2_일반,C02,1_주간,5,5.0,GD639344,,,...,4.5,44.0,2.0,XXXXX,7500.0,1.0,2015-08-01,201811,39,2018-11-01
3,2019-04-30,CA1,2_일반,C02,1_주간,5,3.0,PL505814,,,...,5.0,39.0,3.0,XXXX,7500.0,1.0,2016-01-01,201810,33,2018-10-01
4,2019-04-30,CA1,2_일반,C01,0_종일,6,6.0,AS073168,,,...,6.0,17.0,3.0,XXXXX,10500.0,1.0,2017-11-01,201808,9,2018-08-01


In [27]:
predict_data.isna().sum() / len(predict_data)

calc_date            0.000000
campaign_id          0.000000
campaign_name        0.000000
class                0.000000
class_name           0.000000
count_0              0.000000
count_1              0.060821
customer_id          0.000000
end_date             0.720223
exit_date            0.720223
gender               0.000000
is_deleted           0.000000
max                  0.000000
mean                 0.000000
median               0.000000
membership_period    0.000000
min                  0.000000
name                 0.000000
price                0.000000
routine_flg          0.000000
start_date           0.000000
연월                   0.000000
period               0.000000
now_date             0.000000
dtype: float64

In [28]:
predict_data = predict_data.dropna(subset = ['count_1'])
predict_data.isna().sum() / len(predict_data)

calc_date            0.000000
campaign_id          0.000000
campaign_name        0.000000
class                0.000000
class_name           0.000000
count_0              0.000000
count_1              0.000000
customer_id          0.000000
end_date             0.716136
exit_date            0.716136
gender               0.000000
is_deleted           0.000000
max                  0.000000
mean                 0.000000
median               0.000000
membership_period    0.000000
min                  0.000000
name                 0.000000
price                0.000000
routine_flg          0.000000
start_date           0.000000
연월                   0.000000
period               0.000000
now_date             0.000000
dtype: float64

## **카테고리 변수 처리 및 가공**

In [29]:
predict_data.columns

Index(['calc_date', 'campaign_id', 'campaign_name', 'class', 'class_name',
       'count_0', 'count_1', 'customer_id', 'end_date', 'exit_date', 'gender',
       'is_deleted', 'max', 'mean', 'median', 'membership_period', 'min',
       'name', 'price', 'routine_flg', 'start_date', '연월', 'period',
       'now_date'],
      dtype='object')

In [32]:
target_col = ['campaign_name', 'class_name', 'gender', 'count_1', 'routine_flg', 'period', 'is_deleted']

In [33]:
predict_data = predict_data[target_col]
predict_data.head()

Unnamed: 0,campaign_name,class_name,gender,count_1,routine_flg,period,is_deleted
0,1_입회비무료,2_야간,M,8.0,1.0,10,0.0
1,2_일반,0_종일,M,8.0,1.0,3,0.0
2,2_일반,1_주간,F,5.0,1.0,39,0.0
3,2_일반,1_주간,F,3.0,1.0,33,0.0
4,2_일반,0_종일,M,6.0,1.0,9,0.0


In [34]:
predict_data = pd.get_dummies(predict_data)
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_0_입회비반액할인,campaign_name_1_입회비무료,campaign_name_2_일반,class_name_0_종일,class_name_1_주간,class_name_2_야간,gender_F,gender_M
0,8.0,1.0,10,0.0,0,1,0,0,0,1,0,1
1,8.0,1.0,3,0.0,0,0,1,1,0,0,0,1
2,5.0,1.0,39,0.0,0,0,1,0,1,0,1,0
3,3.0,1.0,33,0.0,0,0,1,0,1,0,1,0
4,6.0,1.0,9,0.0,0,0,1,1,0,0,0,1


In [35]:
del predict_data['campaign_name_2_일반']
del predict_data['class_name_2_야간']
del predict_data['gender_M']
predict_data.head()

Unnamed: 0,count_1,routine_flg,period,is_deleted,campaign_name_0_입회비반액할인,campaign_name_1_입회비무료,class_name_0_종일,class_name_1_주간,gender_F
0,8.0,1.0,10,0.0,0,1,0,0,0
1,8.0,1.0,3,0.0,0,0,1,0,0
2,5.0,1.0,39,0.0,0,0,0,1,1
3,3.0,1.0,33,0.0,0,0,0,1,1
4,6.0,1.0,9,0.0,0,0,1,0,0


## **의사결정나무 알고리즘을 사용해서 탈퇴 예측모델**

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [37]:
exit = predict_data.loc[predict_data['is_deleted'] == 1]
conti = predict_data.loc[predict_data['is_deleted'] == 0].sample(len(exit))

In [39]:
X = pd.concat([exit, conti], ignore_index = True)
y = X['is_deleted']

In [42]:
del X['is_deleted']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [45]:
model = DecisionTreeClassifier(random_state = 0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [46]:
pred

array([1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
       0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1.,
       0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0.,
       1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 0., 0.

In [47]:
results_test = pd.DataFrame({'y_test' : y_test, 'y_pred' : pred})
results_test.head()

Unnamed: 0,y_test,y_pred
1091,0.0,1.0
1786,0.0,0.0
1439,0.0,0.0
745,1.0,1.0
820,1.0,0.0


## **예측 모델 평가**

In [48]:
correct = len(results_test.loc[results_test['y_test'] == results_test['y_pred']])
data_count = len(results_test)
score_test = correct / data_count
print(score_test)

0.9049881235154394


In [49]:
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9049881235154394
0.9774212715389186


In [50]:
model = DecisionTreeClassifier(max_depth = 5, random_state = 0)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(model.score(X_test, y_test))
print(model.score(X_train, y_train))

0.9192399049881235
0.9257278669043375


## **모델에 기여하고 있는 변수 확인**

In [51]:
importance = pd.DataFrame({'feature_name' : X.columns, 'feature_importances' : model.feature_importances_})
importance

Unnamed: 0,feature_name,feature_importances
0,count_1,0.359686
1,routine_flg,0.131147
2,period,0.50632
3,campaign_name_0_입회비반액할인,0.002531
4,campaign_name_1_입회비무료,0.0
5,class_name_0_종일,0.000317
6,class_name_1_주간,0.0
7,gender_F,0.0


## **회원의 탈퇴 예측**

In [52]:
count_1 = 3
routing_flg = 1
period = 10
campaign_name = '입회비무료'
class_name = '종일'
gender = 'M'

In [53]:
if campaign_name == '입회비반값할인':
    campaign_name_list = [1, 0]
elif campaign_name == '입회비무료':
    campaign_name_list = [0, 1]
elif campaign_name == '일반' :
    campaign_name_list = [0, 0]
if class_name == '종일':
    class_name_list = [1, 0]
elif class_name == '주간' :
    class_name_list = [0, 1]
elif class_name == '야간' : 
    class_name_list = [0, 0]
if gender == 'F':
    gender_list = [1]
elif gender == 'M' : 
    gender_list = [0]

input_data = [count_1, routing_flg, period]
input_data.extend(campaign_name_list)
input_data.extend(class_name_list)
input_data.extend(gender_list)

In [54]:
print(model.predict([input_data]))

[1.]


In [55]:
# AUC - ROC 계산을 위해 각 클래스에 확률이 필요
print(model.predict_proba([input_data]))

[[0. 1.]]
