In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [181]:
inputfile = 'E:/学习/Boot Camp/Root Insurance/Root_Insurance_data.csv'
df = pd.read_csv(inputfile)

In [182]:
print(df.columns)
print(df.dtypes)
df.head()

Index(['Currently Insured', 'Number of Vehicles', 'Number of Drivers',
       'Marital Status', 'bid', 'rank', 'click', 'policies_sold'],
      dtype='object')
Currently Insured     object
Number of Vehicles     int64
Number of Drivers      int64
Marital Status        object
bid                    int64
rank                   int64
click                   bool
policies_sold          int64
dtype: object


Unnamed: 0,Currently Insured,Number of Vehicles,Number of Drivers,Marital Status,bid,rank,click,policies_sold
0,unknown,2,1,M,10,2,False,0
1,Y,3,1,M,10,5,False,0
2,Y,3,2,M,10,5,False,0
3,N,3,2,S,10,4,False,0
4,unknown,2,2,S,10,2,False,0


In [183]:
dummies1=pd.get_dummies(df['Currently Insured'], prefix='Currently Insured' )
dummies2=pd.get_dummies(df['Marital Status'], prefix='Marital Status' )
df['click'] = np.where(df['click']==False,0,1)

In [184]:
dfnew=pd.concat([dummies1,dummies2,df],axis=1)
dfnew.drop(['Currently Insured','Marital Status'], axis=1, inplace=True)

In [185]:
X = dfnew[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S','rank']]
y = dfnew['click']

In [186]:
## Now split the data
## Have 20% for testing
## Set 614 as the random state
## and stratify the split
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                test_size=.2,
                                                shuffle=True,
                                                random_state=614,
                                                stratify=y)

In [192]:
log_reg = LogisticRegression()

In [193]:
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [273]:
log_reg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(log_reg.score(X_test, y_test)))

#the accuracy is high, so the model is reasonable enough.

Accuracy of logistic regression classifier on test set: 0.82


In [279]:
#There are 1817 pepole clicking among the 10,000 records, 
#so I think we should show the ads to those people who have a probability of 0.3 to click in our ads.
#When the sample size is large enough, we can consider the frequency of one event's occurence to be it's probability, 
#which means that a person have a probability of 18.17% to click in our ads.

#The threshold could be set as other reasonable values and I choose 0.3 here arbitrarily.
y_pred=pd.DataFrame((log_reg.predict_proba(X)[:,1]>=0.3).astype(int),columns = ['y_pred'])

In [280]:
y_pred

Unnamed: 0,y_pred
0,0
1,0
2,0
3,0
4,1
...,...
9995,0
9996,0
9997,0
9998,0


In [281]:
dfpred=pd.concat([dfnew,y_pred],axis=1)

In [282]:
data=dfpred.drop(index=dfpred.loc[(dfpred['y_pred']==0)].index)

In [283]:
data

#We gain a new dataset here. Records left nwo are people who have 30% probability to click in our ads, 
#and we will use this dataset to determine our bid strategy.
#Though some people do not click in actually, we still leave them in our new dataset, 
#because the reliable model we get "think" these people have 30% probability to click in, not 100%,
#and they still could be our potential customers.

Unnamed: 0,Currently Insured_N,Currently Insured_Y,Currently Insured_unknown,Marital Status_M,Marital Status_S,Number of Vehicles,Number of Drivers,bid,rank,click,policies_sold,y_pred
4,0,0,1,0,1,2,2,10,2,0,0,1
9,0,0,1,1,0,1,2,10,1,0,0,1
14,0,0,1,0,1,2,2,10,2,1,1,1
16,0,0,1,1,0,1,2,10,1,1,1,1
22,0,1,0,0,1,1,2,10,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9975,0,0,1,1,0,1,1,10,1,1,1,1
9981,0,1,0,1,0,1,2,10,1,1,1,1
9985,1,0,0,0,1,1,1,10,2,0,0,1
9988,0,0,1,0,1,1,1,10,1,1,1,1
