# Qiita ~ 傾向スコアを用いて観察データからUpliftをモデリングする（おまけ）~

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import statsmodels.api as sm
import random
import sys
sys.path.append("/home/yuta_saito/notebook/Qiita/uplift-modeling/uplift_tools/")
from metrics import *

from operator import itemgetter
from pandas import DataFrame, Series
from plotly.offline import iplot, plot, init_notebook_mode
from plotly.graph_objs import Histogram, Box, Scatter, Figure, Layout, Bar

from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR

## Cross Variable Transmation

In [3]:
### データの読み込み
data_df = pd.read_csv('https://github.com/iwanami-datascience/vol3/raw/master/kato%26hoshino/q_data_x.csv')
#data_df.head()

In [4]:
### 説明変数
cols = ["child_dummy", "area_kanto", "area_tokai", "area_keihanshin", "T", "F1", "F2", "F3", "M1", "M2"]
X = data_df[cols]

In [5]:
### Zラベルの作成
Z_df = data_df[["cm_dummy", "gamedummy"]].drop_duplicates()
Z_df["Z"] = np.array([1,0,0,1])
data_df = pd.merge(data_df,Z_df,on=["cm_dummy", "gamedummy"])

In [6]:
### CMありをtreatment, CMなしをcontrolとする
treat = list(data_df.cm_dummy == 1)
cv = list(data_df.gamedummy == 1)
z = list(data_df.Z == 1)

In [7]:
### train, test半分ずつに分ける
train_cv, test_cv, train_treat, test_treat, X_train, X_test, z_train, z_test = train_test_split(cv,treat,X,z,test_size=0.5,random_state=2)

In [8]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [9]:
### Z=1となる確率を予測するロジスティック回帰モデル構築
lr_z = LogisticRegression(C=0.01)
lr_z.fit(X_train,z_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
### z-scoreの算出(z-score = 2P(Z=1|X) - 1)
z_score = 2*lr_z.predict_proba(X_test)-1
z_score = list(z_score[:,1])

In [11]:
z_df = uplift_frame_clf(test_cv, test_treat, z_score)

In [12]:
uplift_curve(z_df, "Z Score")

In [13]:
### z-scoreを算出したロジスティック回帰モデルの係数を表示
for feature, coef in zip(X_test.columns.tolist(), list(lr_z.coef_[0])):
    print("{}".format(feature), " / ", "{}".format(round(coef,4)))

child_dummy  /  0.0239
area_kanto  /  -0.3468
area_tokai  /  0.2823
area_keihanshin  /  0.8965
T  /  0.0027
F1  /  0.1595
F2  /  -0.2699
F3  /  -0.0892
M1  /  0.1931
M2  /  0.1633


### Adjust using Propensity Score

In [14]:
### データの読み込み
data_df = pd.read_csv('https://github.com/iwanami-datascience/vol3/raw/master/kato%26hoshino/q_data_x.csv')
#data_df.head()

In [16]:
###傾向スコアを求める
## 説明変数
cols_ = ["age", "sex", "TVwatch_day", "marry_dummy", "child_dummy", "inc", "pmoney", "area_kanto", "area_tokai","area_keihanshin", "job_dummy1", "job_dummy2", "job_dummy3", "job_dummy4", "job_dummy5", "job_dummy6","fam_str_dummy1", "fam_str_dummy2", "fam_str_dummy3","fam_str_dummy4"]
X_ = data_df[cols_]

## 切片の導入
X_["Intercept"] = 1

## CM視聴有無ダミー
z1 = data_df.cm_dummy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [17]:
## StatsModelsのLogitにより傾向スコアを推定
glm = sm.Logit(z1, X_)
result = glm.fit()
ps = result.predict(X_)
#ps

Optimization terminated successfully.
         Current function value: 0.542152
         Iterations 6


In [18]:
### Zラベルの作成
data_df["ps"] = ps
data_df["adj_gamedummy"] = 0
data_df["adj_gamedummy"][data_df.cm_dummy == 1] =  data_df["gamedummy"][data_df.cm_dummy == 1] / data_df["ps"][data_df.cm_dummy == 1]
data_df["adj_gamedummy"][data_df.cm_dummy == 0] =  -data_df["gamedummy"][data_df.cm_dummy == 0] / (1-data_df["ps"][data_df.cm_dummy == 0])
adj_z = DataFrame(data_df[["gamedummy", "adj_gamedummy"]])



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [19]:
### CMありをtreatment, CMなしをcontrolとする
treat = list(data_df.cm_dummy == 1)
cv = list(data_df.gamedummy == 1)

In [20]:
### train, test半分ずつに分ける
train_cv, test_cv, train_treat, test_treat, X_train, X_test, adj_z_train, adj_z_test = train_test_split(cv,treat,X,adj_z,test_size=0.5,random_state=2)

In [21]:
### indexをリセット
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
adj_z_train = adj_z_train.reset_index(drop=True)
adj_z_test = adj_z_test.reset_index(drop=True)

In [22]:
### ランダムフォレスト回帰を用いてZ-Scoreを予測するモデルを構築
rf = RandomForestRegressor(n_estimators=1000, max_depth=500)
rf.fit(X_train,adj_z_train.adj_gamedummy)

### 補正されたZ-Scoreを予測
adj_z_score = rf.predict(X_test)

In [23]:
adj_z_df = uplift_frame_clf(test_cv, test_treat, adj_z_score)

In [24]:
uplift_curve(adj_z_df, "Adj-Z Score")

In [25]:
lr = LinearRegression()
lr.fit(X_train, rf.predict(X_train))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
for feature, coef in zip(X_train.columns.tolist(), list(lr.coef_)):
    print("{}".format(feature), " / ", "{}".format(round(coef,4)))

child_dummy  /  -0.0848
area_kanto  /  -0.0398
area_tokai  /  -0.028
area_keihanshin  /  0.0508
T  /  0.2287
F1  /  0.0106
F2  /  0.0529
F3  /  -0.0461
M1  /  0.0412
M2  /  0.1182
