# 프로모션 효율 예측
## Random Forest

고객 데이터와 거래 데이터를 함께 활용하여  
프로모션에 대한 고객의 반응 여부를 예측한다.

## 모듈 및 데이터 로드

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
member = pd.read_csv('member.csv')
transaction = pd.read_csv('transaction.csv')

In [3]:
member.head()

Unnamed: 0,id,recency,zip_code,is_referral,channel,conversion
0,906145,10,Surburban,0,Phone,0
1,184478,6,Rural,1,Web,0
2,394235,7,Surburban,1,Web,0
3,130152,9,Rural,1,Web,0
4,940352,2,Urban,0,Web,0


In [4]:
member.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           64000 non-null  int64 
 1   recency      64000 non-null  int64 
 2   zip_code     64000 non-null  object
 3   is_referral  64000 non-null  int64 
 4   channel      64000 non-null  object
 5   conversion   64000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.9+ MB


In [5]:
member.describe()

Unnamed: 0,id,recency,is_referral,conversion
count,64000.0,64000.0,64000.0,64000.0
mean,550694.137797,5.763734,0.50225,0.146781
std,259105.689773,3.507592,0.499999,0.35389
min,100001.0,1.0,0.0,0.0
25%,326772.0,2.0,0.0,0.0
50%,551300.0,6.0,1.0,0.0
75%,774914.5,9.0,1.0,0.0
max,999997.0,12.0,1.0,1.0


In [6]:
transaction.head()

Unnamed: 0,id,num_item,total_amount
0,906145,5,34000
1,906145,1,27000
2,906145,4,33000
3,184478,4,29000
4,394235,4,33000


In [7]:
transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196836 entries, 0 to 196835
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   id            196836 non-null  int64
 1   num_item      196836 non-null  int64
 2   total_amount  196836 non-null  int64
dtypes: int64(3)
memory usage: 4.5 MB


In [8]:
transaction.describe()

Unnamed: 0,id,num_item,total_amount
count,196836.0,196836.0,196836.0
mean,550557.552932,3.078365,21837.102969
std,259254.795613,1.478408,8218.005565
min,100001.0,1.0,8000.0
25%,326719.0,2.0,15000.0
50%,550918.0,3.0,22000.0
75%,774916.0,4.0,29000.0
max,999997.0,6.0,38000.0


###### 결측치가 하나도 없는 예쁜 데이터임을 확인
###### member.id = transaction.id를 기준으로 join하기 위해 준비해 보자

In [9]:
# 거래 데이터에 아이템별 평균 가격 추가
transaction['avg_price'] = transaction['total_amount'] / transaction['num_item']
transaction.head(2)

Unnamed: 0,id,num_item,total_amount,avg_price
0,906145,5,34000,6800.0
1,906145,1,27000,27000.0


In [10]:
tran_mean = transaction.groupby('id').mean()
tran_cnt = transaction['id'].value_counts()
tran = pd.concat([tran_mean, tran_cnt], axis=1)
tran

Unnamed: 0_level_0,num_item,total_amount,avg_price,count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100001,3.500000,26000.000000,7500.000000,2
100008,5.000000,26000.000000,5200.000000,1
100032,2.666667,20666.666667,9366.666667,3
100036,3.000000,25800.000000,13273.333333,5
100070,3.250000,21250.000000,8537.500000,4
...,...,...,...,...
999932,5.000000,32000.000000,6400.000000,1
999981,2.000000,22750.000000,12875.000000,4
999990,3.000000,28000.000000,10388.888889,3
999995,2.000000,27000.000000,13500.000000,1


In [11]:
member.set_index('id', inplace=True)
member

Unnamed: 0_level_0,recency,zip_code,is_referral,channel,conversion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
906145,10,Surburban,0,Phone,0
184478,6,Rural,1,Web,0
394235,7,Surburban,1,Web,0
130152,9,Rural,1,Web,0
940352,2,Urban,0,Web,0
...,...,...,...,...,...
838295,10,Urban,0,Web,0
547316,5,Urban,1,Phone,0
131575,6,Urban,1,Phone,0
603659,1,Surburban,1,Multichannel,0


In [12]:
df = pd.merge(member, tran, left_on='id', right_on='id')
df.head(2)

Unnamed: 0_level_0,recency,zip_code,is_referral,channel,conversion,num_item,total_amount,avg_price,count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
906145,10,Surburban,0,Phone,0,3.333333,31333.333333,14016.666667,3
184478,6,Rural,1,Web,0,4.0,29000.0,7250.0,1


## Missing Value 처리

In [13]:
# csv 파일을 불러오면서 결측치가 없음을 확인했지만 한번 더 짚고 넘어가자
df.isna().sum()

recency         0
zip_code        0
is_referral     0
channel         0
conversion      0
num_item        0
total_amount    0
avg_price       0
count           0
dtype: int64

## 카테고리 데이터 처리

In [14]:
df[df.select_dtypes('object').columns].nunique()

zip_code    3
channel     3
dtype: int64

In [15]:
df['zip_code'].unique()

array(['Surburban', 'Rural', 'Urban'], dtype=object)

In [16]:
df['channel'].unique()

array(['Phone', 'Web', 'Multichannel'], dtype=object)

###### 병합해줘야 할 카테고리는 관측되지 않는 모습  
###### 원핫인코딩으로 처리해 주자

In [17]:
df = pd.get_dummies(df, columns=['zip_code', 'channel'])
df.head()

Unnamed: 0_level_0,recency,is_referral,conversion,num_item,total_amount,avg_price,count,zip_code_Rural,zip_code_Surburban,zip_code_Urban,channel_Multichannel,channel_Phone,channel_Web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
906145,10,0,0,3.333333,31333.333333,14016.666667,3,False,True,False,False,True,False
184478,6,1,0,4.0,29000.0,7250.0,1,True,False,False,False,False,True
394235,7,1,0,4.0,20500.0,5125.0,2,False,True,False,False,False,True
130152,9,1,0,1.75,20750.0,14875.0,4,True,False,False,False,False,True
940352,2,0,0,3.0,31000.0,10333.333333,1,False,False,True,False,False,True


## Random Forest Classifier 모델링 및 평가

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X = df.drop(columns='conversion', axis=1)
y = df['conversion']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [22]:
def getscore(model):
    y_pred1 = model.predict(X_train)
    y_pred2 = model.predict(X_test)
    y_proba1 = model.predict_proba(X_train)[:, 1]
    y_proba2 = model.predict_proba(X_test)[:, 1]
    acc1, acc2 = accuracy_score(y_train, y_pred1), accuracy_score(y_test, y_pred2)
    roc1, roc2 = roc_auc_score(y_train, y_proba1), roc_auc_score(y_test, y_proba2)
    return f'acc: {acc1:.4f} {acc2:.4f} roc: {roc1:.4f} {roc2:.4f}'

In [23]:
for i in range(2, 20):
    model = RandomForestClassifier(max_depth=i).fit(X_train, y_train)
    print(f'{i} : {getscore(model)}')

2 : acc: 0.8531 0.8562 roc: 0.7594 0.7547
3 : acc: 0.8588 0.8618 roc: 0.7663 0.7585
4 : acc: 0.8588 0.8618 roc: 0.7762 0.7660
5 : acc: 0.8588 0.8618 roc: 0.7834 0.7694
6 : acc: 0.8591 0.8618 roc: 0.7842 0.7694
7 : acc: 0.8605 0.8627 roc: 0.7919 0.7736
8 : acc: 0.8645 0.8650 roc: 0.8016 0.7755
9 : acc: 0.8680 0.8664 roc: 0.8128 0.7773
10 : acc: 0.8735 0.8691 roc: 0.8288 0.7787
11 : acc: 0.8782 0.8707 roc: 0.8494 0.7797
12 : acc: 0.8823 0.8716 roc: 0.8788 0.7785
13 : acc: 0.8899 0.8732 roc: 0.9114 0.7784
14 : acc: 0.8964 0.8722 roc: 0.9407 0.7769
15 : acc: 0.9059 0.8733 roc: 0.9689 0.7717
16 : acc: 0.9160 0.8721 roc: 0.9865 0.7713
17 : acc: 0.9314 0.8715 roc: 0.9955 0.7674
18 : acc: 0.9448 0.8717 roc: 0.9986 0.7653
19 : acc: 0.9641 0.8701 roc: 0.9998 0.7611


###### max depth가 12~13인 구간에서 test set의 accuracy score에 큰 변화가 없고 roauc의 증가세도 꺾이는 모습
###### max_depth=12를 가지는 최종 모델 선정

In [24]:
fmodel = RandomForestClassifier(max_depth=12).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix, classification_report

In [26]:
confusion_matrix(y_test, y_pred)

array([[16148,   274],
       [ 2221,   557]], dtype=int64)

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93     16422
           1       0.67      0.20      0.31      2778

    accuracy                           0.87     19200
   macro avg       0.77      0.59      0.62     19200
weighted avg       0.85      0.87      0.84     19200



In [28]:
y.value_counts()

conversion
0    54606
1     9394
Name: count, dtype: int64

###### 종속변수의 클래스 불균형 때문에 y=1인 경우에 예측성능이 상대적으로 낮게 나타나는 것으로 보인다. 
###### 오버샘플링을 통해 완화할 수 있는 문제로 판단됨. 
###### 당장 모델 튜닝보다는 classifier, regressor를 하나씩 만들어 보는게 중요하니 우선은 넘어가자

## Random Forest Regressor 모델링 및 평가

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [40]:
def conv(x):
    return (1 if x >= 0.5 else 0)

In [58]:
def getscore2(model):
    y_pred1 = model.predict(X_train)
    y_pred1 = [1 if x >= 0.5 else 0 for x in y_pred1]
    y_pred2 = model.predict(X_test)
    y_pred2 = [1 if x >= 0.5 else 0 for x in y_pred2]
    acc1, acc2 = accuracy_score(y_train, y_pred1), accuracy_score(y_test, y_pred2)
    return f'acc: {acc1:.4f} {acc2:.4f}'

In [59]:
for i in range(2, 20):
    model = RandomForestRegressor(max_depth=i).fit(X_train, y_train)
    print(f'{i} : {getscore2(model)}')

2 : acc: 0.8588 0.8618
3 : acc: 0.8588 0.8618
4 : acc: 0.8639 0.8664
5 : acc: 0.8648 0.8662
6 : acc: 0.8706 0.8703
7 : acc: 0.8742 0.8723
8 : acc: 0.8766 0.8729
9 : acc: 0.8799 0.8746
10 : acc: 0.8828 0.8756
11 : acc: 0.8874 0.8758
12 : acc: 0.8919 0.8765
13 : acc: 0.8975 0.8757
14 : acc: 0.9032 0.8773
15 : acc: 0.9129 0.8762
16 : acc: 0.9238 0.8752
17 : acc: 0.9352 0.8743
18 : acc: 0.9497 0.8739
19 : acc: 0.9663 0.8728


###### accuracy score 기준으로 train set과 test set에서의 성능이 0.87 정도로 준수한 i=9를 선택해 보자

In [65]:
fmodel = RandomForestRegressor(max_depth=9).fit(X_train, y_train)
y_temp = fmodel.predict(X_test)
y_pred = [1 if x >= 0.5 else 0 for x in y_temp]

In [62]:
confusion_matrix(y_test, y_pred)

array([[16284,   138],
       [ 2265,   513]], dtype=int64)

In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93     16422
           1       0.79      0.18      0.30      2778

    accuracy                           0.87     19200
   macro avg       0.83      0.59      0.62     19200
weighted avg       0.86      0.87      0.84     19200



In [69]:
# threshold 조절도 가능하다. 
y_pred_adj = [1 if x >= 0.65 else 0 for x in y_temp]

In [70]:
print(classification_report(y_test, y_pred_adj))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93     16422
           1       0.88      0.14      0.24      2778

    accuracy                           0.87     19200
   macro avg       0.88      0.57      0.58     19200
weighted avg       0.87      0.87      0.83     19200



###### Threshold를 조정해 줌으로써 0과 1 두 클래스에서의 정밀도를 모두 챙긴 모습이다