# Qiita ~Uplift Modelingで介入効果を最大化する~¶

In [3]:
import numpy as np
import pandas as pd
import random
import sklearn
import statsmodels.api as sm
import sys
sys.path.append("/home/yuta_saito/notebook/qiita/uplift-modeling/uplift_tools/")
from metrics import *

from operator import itemgetter
from pandas import DataFrame, Series
from plotly.offline import iplot, plot, init_notebook_mode
from plotly.graph_objs import Histogram, Box, Scatter, Figure, Layout, Bar

from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR

## データの前処理

In [4]:
### データの読み込み
csv_file = "http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
data_df = pd.read_csv(csv_file)

### メールを送らなかった人たちのデータを削除
data_df = data_df.loc[data_df["segment"] != "No E-Mail"].reset_index(drop=True)
#data_df.head()

In [5]:
### カテゴリカル変数をダミー化
categorical_columns = ["zip_code", "channel"]
dummies = pd.get_dummies(data_df[categorical_columns],drop_first=True)
data_df = pd.concat([data_df.drop(categorical_columns, axis=1), dummies],axis=1)

In [6]:
### 説明変数を分けておく
X = data_df.drop(["segment","visit","conversion","spend","history_segment"],axis=1)

In [7]:
### 男性向けメールをtreatment, 女性向けメールをcontrolとする
treat = (data_df.segment == "Mens E-Mail").tolist()
cv = (data_df.visit == 1).tolist()

In [8]:
### train, test半分ずつに分ける
train_cv, test_cv, train_treat, test_treat, X_train, X_test = train_test_split(cv,treat,X,test_size=0.5,random_state=2)

In [9]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

### Separate Model Approach

In [10]:
## 学習用に用いるデータを生成
num = len(train_cv)
treat_cv = [train_cv[i] for i in range(num) if train_treat[i] == True]
control_cv = [train_cv[i] for i in range(num) if train_treat[i] == False]
treat_X = DataFrame([X_train.loc[i] for i in range(num) if train_treat[i] == True])
control_X = DataFrame([X_train.loc[i] for i in range(num) if train_treat[i] == False])

## 介入群(treat)と統制群(control)それぞれについてロジスティック回帰モデルで学習
lr_treat = LogisticRegression(C=0.1)
lr_treat.fit(treat_X, treat_cv)
lr_control = LogisticRegression(C=0.1)
lr_control.fit(control_X, control_cv)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
### uplift-scoreの算出
## 介入を受ける場合のサイト訪問確率
treat_proba = lr_treat.predict_proba(X_test)
treat_proba = treat_proba[:, 1]

## 介入を受けない場合のサイト訪問確率
control_proba = lr_control.predict_proba(X_test)
control_proba = control_proba[:, 1]

## 今回は、(介入を受ける場合のサイト訪問確率) / (介入を受けない場合のサイト訪問確率)をuplift_scoreとする。
uplift_score = treat_proba / control_proba

### 結果を描画

In [1]:
#uplift_bar(test_cv, test_treat, uplift_score)

In [12]:
df = uplift_frame_clf(outcome=test_cv, treat=test_treat, score=uplift_score)

In [13]:
#uplift_curve(df)

### Cross Variable Transmation

In [14]:
### データの読み込み
data_df = pd.read_csv(csv_file)

### メールを送らなかった人たちのデータを削除
data_df = data_df.loc[data_df["segment"] != "No E-Mail"].reset_index(drop=True)
#data_df.head()

In [15]:
### Zラベルの作成
Z_df = data_df[["segment", "visit"]].drop_duplicates()
Z_df["Z"] = np.array([1,0,0,1])
data_df = pd.merge(data_df,Z_df,on=["segment","visit"])

In [16]:
### カテゴリカル変数をダミー化
categorical_columns = ["zip_code", "channel"]
dummies = pd.get_dummies(data_df[categorical_columns],drop_first=True)
data_df = pd.concat([data_df.drop(categorical_columns, axis=1), dummies],axis=1)

In [17]:
### 説明変数を分けておく
X = data_df.drop(["segment","visit","conversion","spend","history_segment","Z"],axis=1)

In [18]:
### 男性向けメールをtreatment, 女性向けメールをcontrolとする
treat = (data_df.segment == "Mens E-Mail").tolist()
cv = (data_df.visit == 1).tolist()
z = (data_df.Z == 1).tolist()

In [19]:
### P(G=T) = P(G=C) = 1/2が成り立っているかを簡単に確認
data_df[data_df.segment == "Mens E-Mail"].shape[0] / data_df.shape[0]

0.49906310020143346

In [20]:
### train, test半分ずつに分ける
train_cv, test_cv, train_treat, test_treat, X_train, X_test, z_train, z_test = train_test_split(cv,treat,X,z,test_size=0.5,random_state=2)

In [21]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [22]:
### Class Variable Transformation
lr_z = LogisticRegression(C=10)
lr_z.fit(X_train,z_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
### z-scoreの算出(z-score = 2P(Z=1|X) - 1)
z_score = 2*lr_z.predict_proba(X_test)-1
z_score = (z_score[:,1]).tolist()

### 結果の描画

In [24]:
#uplift_bar(test_cv, test_treat, z_score)

In [25]:
df_z = uplift_frame_clf(outcome=test_cv, treat=test_treat, score=z_score)

In [26]:
#uplift_curve(df_z, "Z Score")

In [27]:
### z-scoreを算出したロジスティック回帰モデルの係数を表示
for feature, coef in zip(X_test.columns, lr_z.coef_[0]):
    print(f"{feature} / {round(coef, 4)}")

recency / -0.0057
history / 0.0
mens / 0.1186
womens / 0.0022
newbie / -0.0352
zip_code_Surburban / -0.0315
zip_code_Urban / -0.0064
channel_Phone / 0.0086
channel_Web / 0.0045
