# 예측 모형 실습: 쿠폰 지급을 위한 예측 모형

배송료 무료 쿠폰을 지급했을 때 구매 가능성이 가장 높은 유저를 예측한다.

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

In [2]:
# Load the raw dataset; the trailing expression displays each column's dtype.
df = pd.read_csv(filepath_or_buffer='/content/gooppang.csv')
df.dtypes

age              float64
basket           float64
checkout            bool
region            object
coupon              bool
gender            object
monthly_spend    float64
shipping_fee     float64
subscriber          bool
tenure           float64
dtype: object

In [3]:
# Keep 80% of the rows for training and hold out the rest for evaluation.
# The fixed random_state makes the split reproducible across runs.
train_df, test_df = train_test_split(
    df,
    train_size=0.8,
    random_state=0,
)

In [4]:
# Column selection and per-group preprocessing.
# Only the columns named below reach the classifier; the remaining columns
# (basket, shipping_fee and the checkout target) are not listed, so with
# ColumnTransformer's default remainder setting they are not forwarded.
ct = ColumnTransformer(
    transformers=[
        # Boolean flags are used as-is.
        ('bin', 'passthrough', ['coupon', 'subscriber']),
        # Categorical columns are one-hot encoded, dropping the first level.
        ('cat', OneHotEncoder(drop='first'), ['region', 'gender']),
        # Continuous columns are standardized.
        ('num', StandardScaler(), ['age', 'monthly_spend', 'tenure']),
    ],
)

# Example with logistic regression 

In [5]:
# Vanilla logistic regression: preprocessing followed by a LogisticRegression
# left at its default hyperparameters.
lr_pipe = Pipeline(
    steps=[
        ('trans', ct),
        ('clf', LogisticRegression()),
    ],
)
# The full frame is passed as X; the 'trans' step selects the feature columns.
lr_pipe.fit(train_df, train_df['checkout'])

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('bin', 'passthrough',
                                                  ['coupon', 'subscriber']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['region', 'gender']),
                                                 ('num', StandardScaler(),
                                                  ['age', 'monthly_spend',
                                                   'tenure'])])),
                ('clf', LogisticRegression())])

In [6]:
# Compare in-sample and held-out ROC AUC for the baseline pipeline.
# Column 1 of predict_proba is used as the score for the positive class.
train_scores = lr_pipe.predict_proba(train_df)[:, 1]
test_scores = lr_pipe.predict_proba(test_df)[:, 1]
train_roc_auc = roc_auc_score(train_df['checkout'], train_scores)
test_roc_auc = roc_auc_score(test_df['checkout'], test_scores)
print(f'ROC AUC: {train_roc_auc:.2f} (train), {test_roc_auc:.2f} (test)')

ROC AUC: 0.68 (train), 0.67 (test)


In [7]:
# Regularized logistic regression w/ CV
# Search the classifier's C over 10 log-spaced values from 1e-2 to 1e2,
# scoring each candidate by cross-validated ROC AUC on the training split.
lr_grid_params = {'clf__C': np.logspace(-2, 2, num=10)}
lr_gs = GridSearchCV(
    estimator=lr_pipe,
    param_grid=lr_grid_params,
    scoring='roc_auc',
    n_jobs=-1,  # use all available cores
).fit(train_df, train_df['checkout'])
lr_gs.cv_results_

{'mean_fit_time': array([0.24679217, 0.28523946, 0.28598151, 0.31955605, 0.32527761,
        0.33229818, 0.32082562, 0.3353507 , 0.33506613, 0.31481252]),
 'mean_score_time': array([0.03005567, 0.02704015, 0.02656927, 0.02650957, 0.02682438,
        0.02607336, 0.02746377, 0.03036914, 0.02669311, 0.02457561]),
 'mean_test_score': array([0.67169036, 0.67421546, 0.67539772, 0.67594987, 0.67597137,
        0.67594722, 0.67592943, 0.67593617, 0.67593863, 0.67593939]),
 'param_clf__C': masked_array(data=[0.01, 0.027825594022071243, 0.0774263682681127,
                    0.21544346900318834, 0.5994842503189409,
                    1.6681005372000592, 4.6415888336127775,
                    12.915496650148826, 35.93813663804626, 100.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__C': 0.01},
  {'clf__C': 0.027825594022071243},
  {'clf__C': 0.077426368268112

In [8]:
# Held-out ROC AUC of the best candidate found by the grid search.
best_lr = lr_gs.best_estimator_
lr_test_preds = best_lr.predict_proba(test_df)[:, 1]
roc_auc_score(test_df['checkout'], lr_test_preds)

0.6695381247188712

In [9]:
# Counterfactual scoring: set coupon on for every test user, predict the
# checkout probability, and rank users from most to least likely to buy.
test_with_coupon_df = test_df.assign(coupon=1)
p_checkout_with_coupon = lr_gs.best_estimator_.predict_proba(test_with_coupon_df)[:, 1]
# The ranking is attached to test_df (not test_with_coupon_df), so the
# displayed coupon column holds each user's original value while p_checkout
# is the prediction with the coupon forced on.
ranked_df = test_df.assign(p_checkout=p_checkout_with_coupon)
ranked_df.sort_values(by='p_checkout', ascending=False)

Unnamed: 0,age,basket,checkout,region,coupon,gender,monthly_spend,shipping_fee,subscriber,tenure,p_checkout
28396,31.0,87.8,True,E,False,f,304.49,0.00,True,49.0,0.570723
2482,68.0,99.4,False,C,True,f,280.55,0.00,True,41.0,0.554410
3913,75.0,81.2,False,C,True,f,262.97,0.00,True,39.0,0.534344
35826,31.0,39.5,False,B,False,f,285.68,20.00,True,48.0,0.494141
16346,38.0,76.7,False,A,False,f,232.75,0.00,True,23.0,0.478561
...,...,...,...,...,...,...,...,...,...,...,...
14870,38.0,184.3,False,D,False,m,82.54,71.99,False,3.0,0.016662
14708,35.0,168.0,False,D,False,m,81.93,53.50,False,2.0,0.016580
32924,34.0,165.4,False,D,True,m,71.33,62.02,False,3.0,0.016437
58441,34.0,182.7,False,D,False,m,73.47,39.41,False,2.0,0.016324


# Example with XGBoost Classifier

In [10]:
# Vanilla XGBoost: same preprocessing, XGBClassifier with default settings.
# Note: `ct` is the same transformer object used in lr_pipe.
xgb_pipe = Pipeline(
    steps=[
        ('trans', ct),
        ('clf', XGBClassifier()),
    ],
)
xgb_pipe.fit(train_df, train_df['checkout'])

# Compare in-sample and held-out ROC AUC (reuses the train/test metric names).
xgb_train_scores = xgb_pipe.predict_proba(train_df)[:, 1]
xgb_test_scores = xgb_pipe.predict_proba(test_df)[:, 1]
train_roc_auc = roc_auc_score(train_df['checkout'], xgb_train_scores)
test_roc_auc = roc_auc_score(test_df['checkout'], xgb_test_scores)
print(f'ROC AUC: {train_roc_auc:.2f} (train), {test_roc_auc:.2f} (test)')

ROC AUC: 0.69 (train), 0.67 (test)


In [11]:
# Tune the tree depth of the XGBoost classifier over 1..6, scoring each
# candidate by cross-validated ROC AUC on the training split.
xgb_grid_params = {'clf__max_depth': range(1, 7)}
xgb_gs = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=xgb_grid_params,
    scoring='roc_auc',
    n_jobs=-1,  # use all available cores
).fit(train_df, train_df['checkout'])
xgb_gs.cv_results_

{'mean_fit_time': array([1.74624119, 2.32282157, 3.01122279, 3.74382057, 4.55158944,
        5.16608725]),
 'mean_score_time': array([0.05154123, 0.05717463, 0.06533656, 0.07574782, 0.08850231,
        0.0960104 ]),
 'mean_test_score': array([0.67394145, 0.67480953, 0.67164048, 0.67052668, 0.66691607,
        0.66219534]),
 'param_clf__max_depth': masked_array(data=[1, 2, 3, 4, 5, 6],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__max_depth': 1},
  {'clf__max_depth': 2},
  {'clf__max_depth': 3},
  {'clf__max_depth': 4},
  {'clf__max_depth': 5},
  {'clf__max_depth': 6}],
 'rank_test_score': array([2, 1, 3, 4, 5, 6], dtype=int32),
 'split0_test_score': array([0.66386306, 0.66264234, 0.65884879, 0.65543097, 0.65393142,
        0.65002542]),
 'split1_test_score': array([0.68849764, 0.68835705, 0.68682087, 0.68657056, 0.68307766,
        0.67712775]),
 'split2_test_score': array([0.66749787, 0.6671824 , 0

In [12]:
# Held-out ROC AUC of the best XGBoost candidate found by the grid search.
best_xgb = xgb_gs.best_estimator_
xgb_test_preds = best_xgb.predict_proba(test_df)[:, 1]
roc_auc_score(test_df['checkout'], xgb_test_preds)

0.6668948681978104