In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pyrsm import gains, gains_plot, lift, lift_plot, confusion, profit_max, ROME_max
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
import copy
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
#data = pd.read_csv('intuit75k_withstate.csv')
data = pd.read_csv('../intuit75k_new.csv')
data['label']=1
data.loc[data.res1 == 'No','label'] = 0
#onehot=['zip_state','sex']
onehot=['sex']
tostandard=['numords','last','dollars','sincepurch']
othercol=['bizflag','owntaxprod','version1','upgraded','training','zip_bins']
keep=onehot+tostandard+othercol+['label']

In [3]:
# transform data: using one-hot encoding
standardizer=preprocessing.StandardScaler()
onehot_encoder=preprocessing.OneHotEncoder(sparse=False)
subdata=data.loc[:,keep]
data1=copy.copy(subdata)
onehotdata=onehot_encoder.fit_transform(data1.loc[:,onehot])
with_onehot = pd.DataFrame(onehotdata)
with_onehot.columns=onehot_encoder.get_feature_names()
without_onehot=data1.drop(columns=onehot)
combind_data= pd.concat([without_onehot,with_onehot],axis=1)

In [4]:
# split data into train and test


X_train=combind_data.loc[combind_data.training==1].drop(columns='label').drop(columns='training')
y_train=combind_data.loc[combind_data.training==1].label
X_test=combind_data.loc[combind_data.training==0].drop(columns='label').drop(columns='training')
y_test=combind_data.loc[combind_data.training==0].label

# standarlization
standardizer.fit(X_train[tostandard])
X_train[tostandard] = standardizer.transform(X_train[tostandard])
X_test[tostandard] = standardizer.transform(X_test[tostandard])


#X=combind_data.drop(columns='label')
#y=combind_data.label

In [5]:
Xs = np.concatenate((X_train, X_test), axis=0)

In [19]:
## Train MLP model 

In [6]:
# define the keras model. Starting from 1 hidden layer
model = Sequential()
model.add(Dense(50,activation='relu',input_dim=X_train.shape[1]))
model.add(Dense(1, activation='sigmoid'))

In [7]:
early_stopping_monitor = EarlyStopping(patience=4)

In [8]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC()])
model1_training=model.fit(X_train, y_train, epochs=100,callbacks=[early_stopping_monitor], verbose=False,validation_split=0.2)

In [9]:
model1_training.history['val_loss']

[0.1735242588179452,
 0.17042317731607529,
 0.16944055260930743,
 0.16874778256529854,
 0.16890197452477046,
 0.1743095489115942,
 0.1687492711544037,
 0.16815092720304217,
 0.1696509228320349,
 0.17107860312007722,
 0.16879682648749578,
 0.1679681167772838,
 0.16827168679237367,
 0.1688724715482621,
 0.16852366757960546,
 0.1680398109299796]

In [None]:
# Then I enlarged the capacity of the model by adding another layer

In [10]:
model2=Sequential()
model2.add(Dense(50,activation='relu',input_dim=X_train.shape[1]))
model2.add(Dense(15,activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

In [11]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC()])
model2_training = model2.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)
model2_training.history['val_loss']

[0.17449952316284179,
 0.1701170527935028,
 0.16988816650708516,
 0.16983868619373865,
 0.16941280966713315,
 0.17464681793394543,
 0.16893626424812136,
 0.16985435447806405,
 0.16836157646065666,
 0.1683837203071231,
 0.16832149578843797,
 0.16871232827504476,
 0.1726594325417564,
 0.17001824081511724,
 0.17100101248423258]

In [12]:
model3=Sequential()
model3.add(Dense(60,activation='relu',input_dim=X_train.shape[1]))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC()])
model3_training = model2.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)
model3_training.history['val_loss']

[0.16936201013837543,
 0.17001591492834545,
 0.170359029173851,
 0.1696455550080254,
 0.170324678057716]

In [None]:
# we can see that the loss of model 2 is worse than the model1

In [None]:
# So we shrink the capacity of the nn by decreasing the nodes on the first layer

In [13]:
model4=Sequential()
model4.add(Dense(30,activation='relu',input_dim=X_train.shape[1]))
model4.add(Dense(1, activation='sigmoid'))
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC()])

model4_training = model3.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)
model4_training.history['val_loss']

[0.17418206684929985,
 0.17097678401356653,
 0.17175154992512295,
 0.17147149160362427,
 0.1691337476571401,
 0.1684588403474717,
 0.16858727360907055,
 0.16835953164100648,
 0.16953019509996686,
 0.16749064132145472,
 0.169178090992428,
 0.16753637280918304,
 0.16851238024802434,
 0.16935969447521937]

In [None]:
# the loss is not obviously less than model1. so we keep model1 and stop enlarge the capacity of the model.

In [None]:
# Next, we try using softmax and accuracy to see if the model can generate better profit.

In [14]:
y_train_cate=to_categorical(y_train)

In [15]:
early_stopping_monitor=EarlyStopping(patience=4)

In [17]:
model5=Sequential()
model5.add(Dense(50,activation='relu',input_dim=X_train.shape[1]))
model5.add(Dense(2, activation='softmax'))
model5.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model5_training = model5.fit(X_train, y_train_cate, epochs=100, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)
model5_training.history['val_loss']

[0.173213343393235,
 0.1705479483945029,
 0.16983507405008588,
 0.1704693902617409,
 0.16886121462640308,
 0.16914565891878947,
 0.17083334933008468,
 0.1684260798295339,
 0.17306324926444464,
 0.1684007877508799,
 0.16916894172486804,
 0.16859731895583016,
 0.16900427661623274,
 0.1691545715786162]

In [19]:
model6=Sequential()
model6.add(Dense(30,activation='relu',input_dim=X_train.shape[1]))
model6.add(Dense(2, activation='softmax'))
model6.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model6_training = model6.fit(X_train, y_train_cate, epochs=100, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)
model6_training.history['val_loss']

[0.17502979739507038,
 0.17071771400883085,
 0.16909530858766464,
 0.16872137498287929,
 0.17419714268616268,
 0.16878529275031318,
 0.1761442494051797,
 0.16845422232718696,
 0.1681656505720956,
 0.16799587097054436,
 0.16941966664791108,
 0.16932243266559782,
 0.17653834667092277,
 0.16829873877479917]

In [None]:
# The loss of model 6 is larger than model5, model5 is larger than model1, so using model1 as the final model here

In [None]:
# To calculate the profit, AUC of model2.

In [21]:
margin=60
cost=1.41
breakeven_rate=cost/margin
breakeven_rate

0.0235

In [26]:
preds=model.predict_proba(X_test)
testdata=combind_data.loc[combind_data.training==0]
testdata['deeplearning_result']=preds
testdata['pred_click']=1
testdata.loc[testdata['deeplearning_result']<breakeven_rate,'pred_click']=0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [27]:
tp=testdata.loc[testdata.pred_click==1].loc[testdata.label==1]
revenue=len(tp)*margin
pred_true=sum(testdata.pred_click)
totalcost=pred_true*cost
profit=revenue-totalcost
profit

37623.66

In [28]:
fpr, tpr, thresholds = metrics.roc_curve(y_test.values, deeplearning_result)
auc_rf = metrics.auc(fpr, tpr)
auc_rf

0.7518452799091356

In [30]:
#scaled profit
total=763334
testdata2=combind_data.loc[combind_data.training==0]
testdata2['randomforest']=preds
testdata2['pred_click']=1
testdata2.loc[testdata2['randomforest']/2<breakeven_rate,'pred_click']=0
tp=testdata2.loc[testdata2.pred_click==1].loc[testdata2.label==1]
pred_true=sum(testdata2.pred_click)
pred_true_rate=pred_true/len(testdata2)
send_number=total*pred_true_rate
adj_response_rate=len(tp)/pred_true/2
exp_buyers=adj_response_rate*send_number
totalcost=send_number*cost
exp_profit=exp_buyers*margin-totalcost
exp_profit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


422604.09353066667

In [31]:
#expected profit on the test set of res2
send_number=len(testdata2)*pred_true_rate
adj_response_rate=len(tp)/pred_true/2
exp_buyers=adj_response_rate*send_number
totalcost=send_number*cost
exp_profit_test=exp_buyers*margin-totalcost
exp_profit_test

12456.66