# Qiita ~ 傾向スコアを用いて観察データからUpliftをモデリングする~

In [2]:
import numpy as np
import pandas as pd
import sklearn
import statsmodels.api as sm
import random
import sys
sys.path.append("/home/yuta_saito/notebook/qiita/uplift-modeling/uplift_tools/")
from metrics import *

from operator import itemgetter
from pandas import DataFrame, Series
from plotly.offline import iplot, plot, init_notebook_mode
from plotly.graph_objs import Histogram, Box, Scatter, Figure, Layout, Bar

from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.model_selection import KFold, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR

## Regression Uplift Modeling ~ Separate Model Approach ~

In [3]:
### データの読み込み
data_df = pd.read_csv('https://github.com/iwanami-datascience/vol3/raw/master/kato%26hoshino/q_data_x.csv')
#data_df.head()

In [4]:
### 説明変数と目的変数を指定
cols = ["child_dummy", "area_kanto", "area_tokai", "area_keihanshin", 
        "T", "F1", "F2", "F3", "M1", "M2"]

X = data_df[cols]
y = DataFrame(data_df.gamesecond)

In [5]:
### CMありをtreatment, CMなしをcontrolとする
treat = (data_df.cm_dummy == 1).tolist()

In [6]:
### train, test半分ずつに分ける
y_train, y_test, train_treat, test_treat, X_train, X_test = train_test_split(y.values,treat,X,test_size=0.5,random_state=2)

In [7]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [8]:
### Separate Model Approach
## 学習用に用いるデータを生成
num = len(y_train)
treat_y = np.reshape(np.array([y_train[i] for i in range(num) if train_treat[i] == True]), -1)
control_y = np.reshape(np.array([y_test[i] for i in range(num) if train_treat[i] == False]), -1)
treat_X = DataFrame([X_train.loc[i] for i in range(num) if train_treat[i] == True])
control_X = DataFrame([X_train.loc[i] for i in range(num) if train_treat[i] == False])

In [9]:
### ランダムフォレスト回帰を用いる
params_rf = {"max_depth":[10,50,100,200,500]}
gs_rf_treat = GridSearchCV(RandomForestRegressor(n_estimators=500),
                           param_grid=params_rf,cv=5)
gs_rf_control = GridSearchCV(RandomForestRegressor(n_estimators=500),
                             param_grid=params_rf,cv=5)

### 介入群と非介入群それぞれでパラメータチューニング及びスマホアプリ利用時間を予測するモデルを構築
gs_rf_treat.fit(treat_X, treat_y)
gs_rf_control.fit(control_X, control_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [10, 50, 100, 200, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
### uplift-scoreの算出
## CMを見た場合のアプリ利用時間
pred_treat = gs_rf_treat.predict(X_test)

## CMを見なかった場合のアプリ利用時間
pred_control = gs_rf_control.predict(X_test)

## 今回は、(CMを見た場合のアプリ利用時間) - (CMを見なかった場合のアプリ利用時間) をuplift_scoreとする。
uplift_score = pred_treat - pred_control

In [11]:
uplift_df = uplift_frame_reg(y_test.T.tolist()[0], test_treat, uplift_score)

In [12]:
#uplift_curve(uplift_df)

In [13]:
lr = LinearRegression()
lr.fit(X_test, uplift_score)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
for feature, coef in zip(X_train.columns, lr.coef_):
    print(f"{feature} / {round(coef, 4)}")

child_dummy / 977.2688
area_kanto / -1516.0004
area_tokai / -3030.0692
area_keihanshin / 551.5799
T / 12165.5144
F1 / 1234.037
F2 / 1782.0532
F3 / 970.1624
M1 / 2589.4751
M2 / 3362.6475


## Uplift Modeling ~Transformed Outcome Tree Model~

In [15]:
### データの読み込み
data_df = pd.read_csv('https://github.com/iwanami-datascience/vol3/raw/master/kato%26hoshino/q_data_x.csv')
#data_df.head()

In [16]:
###傾向スコアを求める
## 説明変数
cols_ = ["age", "sex", "TVwatch_day", "marry_dummy", "child_dummy", "inc", "pmoney", 
         "area_kanto", "area_tokai","area_keihanshin", 
         "job_dummy1", "job_dummy2", "job_dummy3", "job_dummy4", "job_dummy5", "job_dummy6",
         "fam_str_dummy1", "fam_str_dummy2", "fam_str_dummy3","fam_str_dummy4"]

X_ = data_df[cols_].copy()

## 切片の導入
X_.loc[:,"Intercept"] = 1

## CM視聴有無ダミー
z1 = data_df.cm_dummy

In [17]:
## StatsModelsのLogitにより傾向スコアを推定
glm = sm.Logit(z1, X_)
result = glm.fit()
ps = result.predict(X_)
#ps

Optimization terminated successfully.
         Current function value: 0.542152
         Iterations 6


In [18]:
### Propensity Scoreで補正された目的変数のカラムを作成
data_df.loc[:, "ps"] = ps
data_df.loc[:, "adj_gamesecond"] = 0
data_df.loc[data_df.cm_dummy == 1, "adj_gamesecond"] =  data_df.loc[data_df.cm_dummy == 1, "gamesecond"] / data_df.loc[data_df.cm_dummy == 1, "ps"]
data_df.loc[data_df.cm_dummy == 0, "adj_gamesecond"] =  -data_df.loc[data_df.cm_dummy == 0, "gamesecond"] / (1-data_df.loc[data_df.cm_dummy == 0, "ps"])
adj_y = data_df[["gamesecond", "adj_gamesecond"]]

In [19]:
### CMありをtreatment, CMなしをcontrolとする
treat = (data_df.cm_dummy == 1).tolist()

In [20]:
### train, test半分ずつに分ける
adj_y_train, adj_y_test, train_treat, test_treat, X_train, X_test = train_test_split(adj_y,treat,X,test_size=0.5,random_state=2)

In [21]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
adj_y_train = adj_y_train.reset_index(drop=True)
adj_y_test = adj_y_test.reset_index(drop=True)

In [22]:
### ランダムフォレスト回帰を用いる
params_rf = {"max_depth":[10,50,100,200,500]}
gs_rf = GridSearchCV(RandomForestRegressor(n_estimators=500),
                     param_grid=params_rf,cv=5)

### パラメータチューニング及び補正スマホアプリ利用時間を予測するモデルを構築
gs_rf.fit(X_train,adj_y_train.adj_gamesecond)

### 補正uplift_scoreを求める
adj_uplift_score = gs_rf.predict(X_test)

In [23]:
adj_uplift_df = uplift_frame_reg(adj_y_test.gamesecond.tolist(), test_treat, adj_uplift_score)

In [24]:
#uplift_curve(adj_uplift_df, "Adj-Uplift Score")

In [25]:
lr = LinearRegression()
lr.fit(X_train, gs_rf.predict(X_train))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
for feature, coef in zip(X_train.columns, lr.coef_):
    print(f"{feature} / {round(coef, 4)}")

child_dummy / -7608.9853
area_kanto / -2585.4107
area_tokai / -4459.0262
area_keihanshin / 11156.2133
T / 4450.7031
F1 / 412.1042
F2 / 3012.7645
F3 / -612.99
M1 / -5603.3779
M2 / 11165.0668
