# Qiita ~ 傾向スコアを用いて観察データからUpliftをモデリングする~

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import statsmodels.api as sm
import random
import sys
sys.path.append("/home/yuta_saito/notebook/Qiita/uplift-modeling/uplift_tools/")
from metrics import *

from operator import itemgetter
from pandas import DataFrame, Series
from plotly.offline import iplot, plot, init_notebook_mode
from plotly.graph_objs import Histogram, Box, Scatter, Figure, Layout, Bar

from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR

## Regression Uplift Modeling ~ Separate Model Approach ~

In [4]:
### データの読み込み
data_df = pd.read_csv('https://github.com/iwanami-datascience/vol3/raw/master/kato%26hoshino/q_data_x.csv')
#data_df.head()

In [5]:
### 説明変数と目的変数を指定
cols = ["child_dummy", "area_kanto", "area_tokai", "area_keihanshin", "T", "F1", "F2", "F3", "M1", "M2"]
X = data_df[cols]
y = DataFrame(data_df.gamesecond)

In [6]:
### CMありをtreatment, CMなしをcontrolとする
treat = list(data_df.cm_dummy == 1)

In [7]:
### train, test半分ずつに分ける
y_train, y_test, train_treat, test_treat, X_train, X_test = train_test_split(y,treat,X,test_size=0.5,random_state=2)

In [8]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [10]:
### Separate Model Approach
## 学習用に用いるデータを生成
num = len(y_train)
treat_y = DataFrame([y_train.loc[i] for i in range(num) if train_treat[i] == True])
control_y = DataFrame([y_test.loc[i] for i in range(num) if train_treat[i] == False])
treat_X = DataFrame([X_train.loc[i] for i in range(num) if train_treat[i] == True])
control_X = DataFrame([X_train.loc[i] for i in range(num) if train_treat[i] == False])

In [11]:
### ランダムフォレスト回帰を用いる
params_rf = {"max_depth":[10,50,100,200,500]}
gs_rf_treat = GridSearchCV(RandomForestRegressor(n_estimators=500),param_grid=params_rf,cv=5)
gs_rf_control = GridSearchCV(RandomForestRegressor(n_estimators=500),param_grid=params_rf,cv=5)

### 介入群と非介入群それぞれでパラメータチューニング及びスマホアプリ利用時間を予測するモデルを構築
gs_rf_treat.fit(treat_X, treat_y)
gs_rf_control.fit(control_X, control_y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [10, 50, 100, 200, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [12]:
### uplift-scoreの算出
## CMを見た場合のアプリ利用時間
pred_treat = gs_rf_treat.predict(X_test)

## CMを見なかった場合のアプリ利用時間
pred_control = gs_rf_control.predict(X_test)

## 今回は、(CMを見た場合のアプリ利用時間) - (CMを見なかった場合のアプリ利用時間) をuplift_scoreとする。
uplift_score = pred_treat - pred_control

In [13]:
uplift_df = uplift_frame_reg(y_test.gamesecond.tolist(), test_treat, uplift_score)

In [14]:
uplift_curve(uplift_df)

In [15]:
lr = LinearRegression()
lr.fit(X_test, uplift_score)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
for feature, coef in zip(X_train.columns.tolist(), list(lr.coef_)):
    print("{}".format(feature), " / ", "{}".format(round(coef,4)))

child_dummy  /  1066.7313
area_kanto  /  -1439.2777
area_tokai  /  -3042.6154
area_keihanshin  /  624.7007
T  /  12164.7189
F1  /  1292.577
F2  /  1779.1661
F3  /  1013.0342
M1  /  2687.7289
M2  /  3393.5936


## Uplift Modeling ~Transformed Outcome Tree Model~

In [17]:
###傾向スコアを求める
## 説明変数
cols_ = ["age", "sex", "TVwatch_day", "marry_dummy", "child_dummy", "inc", "pmoney", "area_kanto", "area_tokai","area_keihanshin", "job_dummy1", "job_dummy2", "job_dummy3", "job_dummy4", "job_dummy5", "job_dummy6","fam_str_dummy1", "fam_str_dummy2", "fam_str_dummy3","fam_str_dummy4"]
X_ = data_df[cols_]

## 切片の導入
X_["Intercept"] = 1

## CM視聴有無ダミー
z1 = data_df.cm_dummy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [19]:
## StatsModelsのLogitにより傾向スコアを推定
glm = sm.Logit(z1, X_)
result = glm.fit()
ps = result.predict(X_)
#ps

Optimization terminated successfully.
         Current function value: 0.542152
         Iterations 6


In [20]:
### Propensity Scoreで補正された目的変数のカラムを作成
data_df["ps"] = ps
data_df["adj_gamesecond"] = 0
data_df["adj_gamesecond"][data_df.cm_dummy == 1] =  data_df["gamesecond"][data_df.cm_dummy == 1] / data_df["ps"][data_df.cm_dummy == 1]
data_df["adj_gamesecond"][data_df.cm_dummy == 0] =  -data_df["gamesecond"][data_df.cm_dummy == 0] / (1-data_df["ps"][data_df.cm_dummy == 0])
adj_y = DataFrame(data_df[["gamesecond", "adj_gamesecond"]])



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [21]:
### CMありをtreatment, CMなしをcontrolとする
treat = list(data_df.cm_dummy == 1)

In [22]:
### train, test半分ずつに分ける
adj_y_train, adj_y_test, train_treat, test_treat, X_train, X_test = train_test_split(adj_y,treat,X,test_size=0.5,random_state=2)

In [23]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
adj_y_train = adj_y_train.reset_index(drop=True)
adj_y_test = adj_y_test.reset_index(drop=True)

In [25]:
### ランダムフォレスト回帰を用いる
params_rf = {"max_depth":[10,50,100,200,500]}
gs_rf = GridSearchCV(RandomForestRegressor(n_estimators=500),param_grid=params_rf,cv=5)

### パラメータチューニング及び補正スマホアプリ利用時間を予測するモデルを構築
gs_rf.fit(X_train,adj_y_train.adj_gamesecond)

### 補正uplift_scoreを求める
adj_uplift_score = gs_rf.predict(X_test)

In [26]:
adj_uplift_df = uplift_frame_reg(adj_y_test.gamesecond.tolist(), test_treat, adj_uplift_score)

In [28]:
uplift_curve(adj_uplift_df, "Adj-Uplift Score")

In [29]:
lr = LinearRegression()
lr.fit(X_train, gs_rf.predict(X_train))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [31]:
for feature, coef in zip(X_train.columns.tolist(), list(lr.coef_)):
    print("{}".format(feature), " / ", "{}".format(round(coef,4)))

child_dummy  /  -7479.1805
area_kanto  /  -2485.2828
area_tokai  /  -4402.7254
area_keihanshin  /  11009.4959
T  /  4267.2656
F1  /  348.1678
F2  /  2921.4506
F3  /  -639.2757
M1  /  -5799.453
M2  /  10888.532
