In [227]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from sklearn import tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [228]:
df = pd.read_csv('Root_Insurance_data.csv')

In [229]:
dummies1=pd.get_dummies(df['Currently Insured'], prefix='Currently Insured' )
dummies2=pd.get_dummies(df['Marital Status'], prefix='Marital Status' )
df['click'] = np.where(df['click']==False,0,1)

In [230]:
dfnew=pd.concat([dummies1,dummies2,df],axis=1)
dfnew.drop(['Currently Insured','Marital Status'], axis=1, inplace=True)

In [231]:
X = dfnew[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S', 'rank']]
y = dfnew[['click', 'policies_sold']]

In [232]:
## Now split the data
## Have 20% for testing
## Set 614 as the random state
## and stratify the split
X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                test_size=.2,
                                                shuffle=True,
                                                random_state=614,
                                                stratify=y)
X_train_befo = X_train[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S']]
X_test_befo = X_test[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S']]

In [233]:
logistic = LogisticRegression()
logistic.fit(X_train_befo,y_train['click'])

array([[-0.44509893, -0.42614813,  0.8703522 , -0.54722821,  0.13504459,
        -0.12554517,  0.1246503 ]])

In [234]:
logistic.predict(X_test_befo)
print('Accuracy of Logistic regression classifier on test set: {}%'.format(logistic.score(X_test_befo, y_test['click'])*100))
#the accuracy is high, so the model is reasonable enough.

Accuracy of Logistic regression classifier on test set: 81.2%


In [235]:
click_prob = logistic.predict_proba(X_train_befo)[:,1]

In [236]:
# logistic regression model, output: policies_sold
logistic_s = LogisticRegression()
logistic_s.fit(X_train_befo,y_train['policies_sold'])

LogisticRegression()

In [237]:
# accuracy for the policies_sold model
logistic_s.predict(X_test_befo)
print('Accuracy of Logistic regression classifier on test set: {}%'.format(logistic_s.score(X_test_befo, y_test['policies_sold'])*100))

Accuracy of Logistic regression classifier on test set: 92.15%


In [238]:
sold_prob = logistic_s.predict_proba(X_train_befo)[:,1]

In [239]:
#define new column
sold_new = np.divide(sold_prob,click_prob)
y_new = pd.DataFrame(sold_new)

In [240]:
# linear regression modeling for new column
from sklearn.linear_model import LinearRegression
reg = LinearRegression(copy_X = True)
reg.fit(X_train_befo, y_new)

LinearRegression()

In [242]:
# prediction of sold_prob/click_prob for test set
test_pre = reg.predict(X_test_befo)
# 2d array -> 1d array
test_pre = test_pre.flatten()
#sold_prob/click_prob calculated from two logistic model
test_cal = np.divide(logistic_s.predict_proba(X_test_befo)[:,1],logistic.predict_proba(X_test_befo)[:,1])
#MSE
diff =test_pre-test_cal
np.power(np.linalg.norm(diff),2)/len(diff)

0.00016367656330716596

In [243]:
logistic_pre_click = LogisticRegression()
logistic_pre_click.fit(X_train,y_train['click'])


LogisticRegression()

In [244]:
print('Accuracy of Logistic regression classifier on test set: {}%'.format(logistic_pre_click.score(X_test, y_test['click'])*100))

Accuracy of Logistic regression classifier on test set: 82.39999999999999%


In [245]:
logistic_pre_sold = LogisticRegression()
logistic_pre_sold.fit(X_train,y_train['policies_sold'])

print('Accuracy of Logistic regression classifier on test set: {}%'.format(logistic_pre_sold.score(X_test, y_test['policies_sold'])*100))

Accuracy of Logistic regression classifier on test set: 92.15%


In [270]:
X1 = dfnew[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S','rank']]

# add more columns to predict marketing spend and #(sold_policies)
# click_less_th: threshhold for customers with less click-throuh-rate
# buy_many_th: threshhold for customers with high sold_ratio given that they clicked.
# buy_less_th: threshhold for customers with low sold_ratio given that they clicked.
def Data_for_pre(Data, click_less_th, buy_many_th, buy_less_th):
    Data_copy = Data.copy()
    Data_copy_temp = Data_copy[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S']]
    spend = pd.DataFrame(np.zeros(10000),columns = ['spend_diff'])
    click_less=pd.DataFrame((logistic.predict_proba(Data_copy_temp)[:,1]<=click_less_th).astype(int),columns = ['click_less'])
    buy_many = pd.DataFrame((reg.predict(Data_copy_temp)>buy_many_th).astype(int),columns = ['buy_many'])
    buy_less = pd.DataFrame((reg.predict(Data_copy_temp)<=buy_less_th).astype(int),columns = ['buy_less'])
    Data_new=pd.concat([Data_copy, spend, buy_many, buy_less, click_less],axis=1)
    return Data_new

X_new = Data_for_pre(X1, 0.2, 0.5, 0.35)


In [248]:
# make the rank of customers who have high click_through-rate & low sold_rate given that they clicked lower by 1 if it possible. 
def rank_lower(X):
    X_copy = X.copy()
    X_copy.loc[(X_copy['click_less']==0)&(X_copy['buy_less']==1.0)&(X_copy['rank']<5.0),'rank'] += 1
    X_copy.loc[(X_copy['click_less']==0)&(X_copy['buy_less']==1.0)&(X_copy['rank']<5.0),'spend_diff'] -= 1
    return X_copy


In [249]:
# make the rank of customers who have low click_through-rate & high sold_rate given that they clicked higher by 1 . 
def rank_higher(X):
    X_copy = X.copy()
    X_copy.loc[(X_copy['click_less']==1.0)&(X_copy['buy_many']==1.0)&(X_copy['rank']>1.0),'rank'] -= 1
    X_copy.loc[(X_copy['click_less']==1.0)&(X_copy['buy_less']==1.0)&(X_copy['rank']>1.0),'spend_diff'] += 1
    return X_copy

In [250]:
# do both(rank_higher/rank_lower
def rank_change_both(X):
    X_copy = X.copy()
    X_copy = rank_lower(X_copy)
    X_copy = rank_higher(X_copy)
    return X_copy

In [251]:
# increase the rank of all customers by 1 . 
def rank_higher_all(X):
    X_copy = X.copy()
    X_copy.loc[(X_copy['rank']>1.0),'rank'] -= 1
    X_copy.loc[(X_copy['rank']>1.0),'spend_diff'] += 1
    return X_copy


In [259]:

#predict #(policies_sold) and marketing spend
def pre_sold_spend(X):
    X_copy = X.copy()
    A=logistic_pre_sold.predict_proba(X_copy[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S', 'rank']])[:,1] #sold
    B=logistic_pre_click.predict_proba(X_copy[['Currently Insured_N','Currently Insured_Y','Currently Insured_unknown','Number of Vehicles','Number of Drivers','Marital Status_M','Marital Status_S', 'rank']])[:,1] #click
    C = X4['spend_diff'].to_numpy()
    spend_exp = 0
    for i in range(len(B)):
        spend_exp += B[i]*(10+C[i])
    
    sold_exp = np.sum(A)
    
    return sold_exp, spend_exp

In [268]:
X_lower_1 = rank_lower(X_new)
X_higher_1 = rank_higher(X_new)
X_both_1 = rank_change_both(X_new)
X_both_2 = rank_change_both(X_both_1)
print(pre_sold_spend(X_new)) #E(sold), E(spend)
print(pre_sold_spend(X_lower_1))
print(pre_sold_spend(X_higher_1))
print(pre_sold_spend(X_both_1))
print(pre_sold_spend(X_both_2))

(781.3392934925713, 18844.948852558682)
(769.4618177037148, 18507.547947450952)
(817.1237326055251, 19564.35064384309)
(805.2462568166685, 19226.949738735326)
(837.2192033965487, 19793.136503979207)


In [269]:
X_all_high = rank_higher_all(X_new)
print(pre_sold_spend(X_all_high))

(1150.923492887306, 28884.079636320675)


In [260]:
X_new_test = Data_for_pre(X_test, 0.2, 0.5, 0.35)
X_new_test = X_new_test.dropna()

In [261]:
print(X_new_test)

      Currently Insured_N  Currently Insured_Y  Currently Insured_unknown  \
9                     0.0                  0.0                        1.0   
15                    0.0                  0.0                        1.0   
19                    1.0                  0.0                        0.0   
21                    1.0                  0.0                        0.0   
34                    0.0                  0.0                        1.0   
...                   ...                  ...                        ...   
1975                  1.0                  0.0                        0.0   
1980                  1.0                  0.0                        0.0   
1990                  0.0                  1.0                        0.0   
1992                  1.0                  0.0                        0.0   
1999                  0.0                  0.0                        1.0   

      Number of Vehicles  Number of Drivers  Marital Status_M  \
9         

In [265]:
X_lower_1 = rank_lower(X_new_test)
X_higher_1 = rank_higher(X_new_test)
X_both_1 = rank_change_both(X_new_test)

print(pre_sold_spend(X_new_test))
print(pre_sold_spend(X_lower_1))
print(pre_sold_spend(X_higher_1))
print(pre_sold_spend(X_both_1))


print(y_test['click'].value_counts())
print(y_test['policies_sold'].value_counts())

(33.44362001137009, 824.8652656973576)
(32.91381300758479, 812.4884814420776)
(34.079523477175194, 843.4471866767976)
(33.54971647338991, 831.070402421518)
0    1624
1     376
Name: click, dtype: int64
0    1843
1     157
Name: policies_sold, dtype: int64
