In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from sklearn import tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [57]:
df = pd.read_csv('Root_Insurance_data.csv')

In [58]:
print(df.columns)
print(df.dtypes)
df.head()

Index(['Currently Insured', 'Number of Vehicles', 'Number of Drivers',
       'Marital Status', 'bid', 'rank', 'click', 'policies_sold'],
      dtype='object')
Currently Insured     object
Number of Vehicles     int64
Number of Drivers      int64
Marital Status        object
bid                    int64
rank                   int64
click                   bool
policies_sold          int64
dtype: object


Unnamed: 0,Currently Insured,Number of Vehicles,Number of Drivers,Marital Status,bid,rank,click,policies_sold
0,unknown,2,1,M,10,2,False,0
1,Y,3,1,M,10,5,False,0
2,Y,3,2,M,10,5,False,0
3,N,3,2,S,10,4,False,0
4,unknown,2,2,S,10,2,False,0


In [59]:
dummies1=pd.get_dummies(df['Currently Insured'], prefix='Currently Insured' )
dummies2=pd.get_dummies(df['Marital Status'], prefix='Marital Status' )
df['click'] = np.where(df['click']==False,0,1)

In [60]:
dfnew=pd.concat([dummies1,dummies2,df],axis=1)
dfnew.drop(['Currently Insured','Marital Status'], axis=1, inplace=True)

In [61]:
X = dfnew[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S']]
y = dfnew[['click', 'policies_sold']]

In [62]:
## Now split the data
## Have 20% for testing
## Set 614 as the random state
## and stratify the split
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                test_size=.2,
                                                shuffle=True,
                                                random_state=614,
                                                stratify=y)

In [63]:
logistic = LogisticRegression()
logistic.fit(X_train,y_train['click'])

#dectree = tree.DecisionTreeClassifier()             
#dectree.fit(X_train, y_train)          

#forest = RandomForestClassifier()
#forest.fit(X_train, y_train)

#knn = KNeighborsClassifier(n_neighbors = 2)
#knn.fit(X_train,y_train)

#svm = SVC(kernel = 'linear')
#svm.fit(X_train,y_train)

#voting_clf = VotingClassifier(
                #[('lr',LogisticRegression()),
                #('rf',RandomForestClassifier()),
                #('svm',SVC()),
                #('knn',KNeighborsClassifier())],
                #voting = "hard")
#voting_clf.fit(X_train,y_train)


LogisticRegression()

In [64]:
logistic.predict(X_test)
print('Accuracy of Logistic regression classifier on test set: {}%'.format(logistic.score(X_test, y_test['click'])*100))
#print('Accuracy of Decision Tree  classifier on test set: {}%'.format(dectree.score(X_test, y_test)*100))
#print('Accuracy of Random Forest classifier on test set {}%'.format(forest.score(X_test, y_test)*100))
#print('Accuracy of KNN classifier on test set {}%'.format(knn.score(X_test, y_test)*100))
#print('Accuracy of SVM classifier on test set {}%'.format(svm.score(X_test, y_test)*100))
#print('Accuracy of Ensemble voting classifier on test set {}%'.format(voting_clf.score(X_test, y_test)*100))

#the accuracy is high, so the model is reasonable enough.

Accuracy of Logistic regression classifier on test set: 81.2%


In [65]:
#There are 1817 pepole clicking among the 10,000 records, 
#so I think we should show the ads to those people who have a probability of 0.3 to click in our ads.
#When the sample size is large enough, we can consider the frequency of one event's occurence to be it's probability, 
#which means that a person have a probability of 18.17% to click in our ads.

#The threshold could be set as other reasonable values and I choose 0.3 here arbitrarily.
y_pred=pd.DataFrame((logistic.predict_proba(X)[:,1]>=0.3).astype(int),columns = ['y_pred'])

In [84]:
click_prob = logistic.predict_proba(X_train)[:,1]

In [67]:
y_pred

Unnamed: 0,y_pred
0,0
1,0
2,0
3,0
4,1
...,...
9995,0
9996,0
9997,0
9998,0


In [68]:
dfpred=pd.concat([dfnew,y_pred],axis=1)

In [69]:
data=dfpred.drop(index=dfpred.loc[(dfpred['y_pred']==0)].index)

In [70]:
data

#We gain a new dataset here. Records left nwo are people who have 30% probability to click in our ads, 
#and we will use this dataset to determine our bid strategy.
#Though some people do not click in actually, we still leave them in our new dataset, 
#because the reliable model we get "think" these people have 30% probability to click in, not 100%,
#and they still could be our potential customers.

Unnamed: 0,Currently Insured_N,Currently Insured_Y,Currently Insured_unknown,Marital Status_M,Marital Status_S,Number of Vehicles,Number of Drivers,bid,rank,click,policies_sold,y_pred
4,0,0,1,0,1,2,2,10,2,0,0,1
9,0,0,1,1,0,1,2,10,1,0,0,1
14,0,0,1,0,1,2,2,10,2,1,1,1
16,0,0,1,1,0,1,2,10,1,1,1,1
28,0,0,1,1,0,1,2,10,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9972,0,0,1,1,0,1,2,10,2,1,0,1
9975,0,0,1,1,0,1,1,10,1,1,1,1
9984,0,0,1,1,0,2,2,10,3,0,0,1
9988,0,0,1,0,1,1,1,10,1,1,1,1


In [71]:
# logistic regression model, output: policies_sold
logistic_s = LogisticRegression()
logistic_s.fit(X_train,y_train['policies_sold'])

#dectree = tree.DecisionTreeClassifier()             
#dectree.fit(X_train, y_train)          

#forest = RandomForestClassifier()
#forest.fit(X_train, y_train)

#knn = KNeighborsClassifier(n_neighbors = 2)
#knn.fit(X_train,y_train)

#svm = SVC(kernel = 'linear')
#svm.fit(X_train,y_train)

#voting_clf = VotingClassifier(
                #[('lr',LogisticRegression()),
                #('rf',RandomForestClassifier()),
                #('svm',SVC()),
                #('knn',KNeighborsClassifier())],
                #voting = "hard")
#voting_clf.fit(X_train,y_train)

LogisticRegression()

In [72]:
# accuracy for the policies_sold model
logistic_s.predict(X_test)
print('Accuracy of Logistic regression classifier on test set: {}%'.format(logistic_s.score(X_test, y_test['policies_sold'])*100))
#print('Accuracy of Decision Tree  classifier on test set: {}%'.format(dectree.score(X_test, y_test)*100))
#print('Accuracy of Random Forest classifier on test set {}%'.format(forest.score(X_test, y_test)*100))
#print('Accuracy of KNN classifier on test set {}%'.format(knn.score(X_test, y_test)*100))
#print('Accuracy of SVM classifier on test set {}%'.format(svm.score(X_test, y_test)*100))
#print('Accuracy of Ensemble voting classifier on test set {}%'.format(voting_clf.score(X_test, y_test)*100))


Accuracy of Logistic regression classifier on test set: 92.15%


In [85]:
sold_prob = logistic_s.predict_proba(X_train)[:,1]

In [88]:
#define new column
sold_new = np.divide(sold_prob,click_prob)
y_new = pd.DataFrame(sold_new)

In [92]:
# linear regression modeling for new column
from sklearn.linear_model import LinearRegression
reg = LinearRegression(copy_X = True)
reg.fit(X_train, y_new)

LinearRegression()

In [101]:
# prediction of sold_prob/click_prob for test set
test_pre = reg.predict(X_test)
# 2d array -> 1d array
test_pre = test_pre.flatten()

In [129]:
# sold_prob/click_prob calculated from two logistic model
test_cal = np.divide(logistic_s.predict_proba(X_test)[:,1],logistic.predict_proba(X_test)[:,1])

In [151]:
#MSE
diff =test_pre-test_cal
np.power(np.linalg.norm(diff),2)/len(diff)

0.00016367656330716596