In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from pyrsm import gains, gains_plot, lift, lift_plot, confusion, profit_max, ROME_max
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import copy

Using TensorFlow backend.


In [2]:
data = pd.read_csv('../intuit75k_new.csv')
categorical_columns=['zip_bins','sex','zip801','zip804']
othercol=['numords','last','dollars','sincepurch','bizflag','owntaxprod','version1','upgraded','training']
keep=categorical_columns+othercol+['label']
combind_data=data.loc[:,keep]
combind_data[categorical_columns] = combind_data[categorical_columns].apply(lambda x: LabelEncoder().fit_transform(x))

In [3]:
X_train=combind_data.loc[combind_data.training==1].drop(columns='label').drop(columns='training')
y_train=combind_data.loc[combind_data.training==1].label
X_test=combind_data.loc[combind_data.training==0].drop(columns='label').drop(columns='training')
y_test=combind_data.loc[combind_data.training==0].label

In [4]:
Xs = np.concatenate((X_train, X_test), axis=0)

In [5]:
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': range(100,400,50),
    'max_depth': range(2,10)
}


# Instantiate the regressor: gbm
gbm = xgb.XGBClassifier()

# Perform grid search: grid_auc
randomized_auc = RandomizedSearchCV(
    param_distributions=gbm_param_grid,estimator=gbm,scoring="roc_auc",n_iter=50,cv=5,verbose=1
)

# Fit grid_mse to the data
randomized_auc.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 31.5min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid='de

In [6]:
print("Best parameters found: ", randomized_auc.best_params_)
print("higest auc found: ", np.abs(randomized_auc.best_score_))

Best parameters found:  {'n_estimators': 100, 'max_depth': 3}
higest auc found:  0.7694380137321539


In [7]:
#grid search result
preds =randomized_auc.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test.values, preds[:, 1])
auc_rf = metrics.auc(fpr, tpr)
auc_rf

0.7663529101507227

In [8]:
preds=preds[:,1]

In [9]:
margin=60
cost=1.41
breakeven_rate=cost/margin
#real profit of res1,without pred/2 
testdata=combind_data.loc[combind_data.training==0]
testdata['xgboost']=preds
testdata['pred_click']=1
testdata.loc[testdata['xgboost']<breakeven_rate,'pred_click']=0
tp=testdata.loc[testdata.pred_click==1].loc[testdata.label==1]
revenue=len(tp)*margin
pred_true=sum(testdata.pred_click)
totalcost=pred_true*cost
profit=revenue-totalcost
profit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


38315.79

In [10]:
#scaled profit of res2 on the whole dataset
total=763334
testdata2=combind_data.loc[combind_data.training==0]
testdata2['randomforest']=preds
testdata2['pred_click']=1
testdata2.loc[testdata2['randomforest']/2<breakeven_rate,'pred_click']=0
tp=testdata2.loc[testdata2.pred_click==1].loc[testdata2.label==1]
pred_true=sum(testdata2.pred_click)
pred_true_rate=pred_true/len(testdata2)
send_number=total*pred_true_rate
adj_response_rate=len(tp)/pred_true/2
exp_buyers=adj_response_rate*send_number
totalcost=send_number*cost
exp_profit=exp_buyers*margin-totalcost

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
exp_profit

452923.72001066676

In [16]:
#expected profit on the test set of res2
send_number=len(testdata2)*pred_true_rate
adj_response_rate=len(tp)/pred_true/2
exp_buyers=adj_response_rate*send_number
totalcost=send_number*cost
exp_profit_test=exp_buyers*margin-totalcost
exp_profit_test

13350.36

In [None]:
# model auc and projected profit of res2:

In [17]:
df=[["xgboost",round(auc_rf,4),round(exp_profit_test,4)],["random forest",0.7643,13016.34],["nn3",0.7656,13014.96],["MLP keras",0.7518,12456.66]]
model_performance=pd.DataFrame(df,columns=['Model',"Auc","Expected profit of res2 on test"])

In [18]:
model_performance

Unnamed: 0,Model,Auc,Expected profit of res2 on test
0,xgboost,0.7664,13350.36
1,random forest,0.7643,13016.34
2,nn3,0.7656,13014.96
3,MLP keras,0.7518,12456.66


In [20]:
# using the xgboost model to make prediction:
X_test_new=copy.copy(X_test)
preds_res2 =randomized_auc.predict_proba(X_test_new)
X_test_new['xgb']=preds_res2[:,1]
X_test_new['mailto_wave2']=True
X_test_new.loc[X_test_new['xgb']/2<breakeven_rate,'mailto_wave2']=False

In [217]:
X_test_new['id']=data.loc[data.training==0].id

In [218]:
X_test_new['label']=y_test

In [219]:
X_test_new.loc[X_test_new['label']==1,'mailto_wave2']=False

In [220]:
list=X_test_new.loc[:,['id','mailto_wave2']]

In [222]:
list.to_csv("Qiuyi_Xi_Zhengyu_Jake_TheNameless.csv")