In [None]:
## Packages

# The Kaggle R image (kaggle/rstats, see https://github.com/kaggle/docker-rstats)
# ships with CRAN and many additional packages already installed.

# tidyverse is a metapackage that attaches the core data-wrangling and plotting packages.
library("tidyverse")

## Running code

# Notebook: click into a cell and press Shift+Enter (or use the blue arrow on its left).
# Script: highlight the code to run and click the blue arrow at the bottom of the window.

## Reading in files

# Datasets attached to this kernel are available under the "../input/" directory.
# The call below lists what has been attached.

list.files("../input")

## Saving data

# Any files or images you save are written to the "output" directory. To see it,
# commit and run the kernel (Commit & Run button) and open the compiled version.

In [None]:
# Objective: predict Customer Lifetime Value (CLV) for an auto insurance company.
# CLV is the total revenue the company will derive from its entire relationship with a customer.
# Because we don't know how long each customer relationship will last, we make a good estimate
# and state CLV as a periodic value — that is, we usually say "this customer's 12-month
# (or 24-month, etc.) CLV is $x".

In [None]:
# Attach tidyverse (its read_csv() is used below) and load the raw data set into `data`.
library(tidyverse)
# Alternative source of the same data set, kept for reference:
# data <- read_csv("../input/ibm-watson-marketing-customer-value-data/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv")
data <- read_csv(file = "../input/auto-insurance-clv/data.csv")

In [None]:
# Attach the modelling (caret), plotting (ggplot2) and data-manipulation (dplyr) packages.
library("caret")
library("ggplot2")
library("dplyr")

# First look at the data: number of rows/columns, then each column's type and leading values.
dim(data)
glimpse(data)

In [None]:
# Summary statistics for every column
summary(data)

# Total number of missing values across the whole data set
sum(is.na(data))

# Show the original column names, then replace them (by position) with
# dot-separated names that are easier to use in formulas.
colnames(data)
colnames(data) <- c(
  "Customer", "State", "CLV", "Response", "Coverage", "Education",
  "Effective.To.Date", "EmploymentStatus", "Gender", "Income",
  "Location.Code", "Marital.Status", "Monthly.Premium.Auto",
  "Months.Since.Last.Claim", "Month.Since.Policy.Inception",
  "Open.Complaints", "Policies", "Policy.Type", "Policy",
  "Renew.Offer.Type", "Sales.Channel", "Total.Claim.Amount",
  "Vehicle.Class", "Vehicle.Size"
)
names(data)

In [None]:
# Discard features that intuitively carry no signal for CLV: the customer
# identifier and the policy date. Columns are removed by name rather than by
# position (previously data[,-c(1,7)]) so that re-running this cell is a no-op
# instead of silently dropping two further columns.
data <- data[, !(colnames(data) %in% c("Customer", "Effective.To.Date"))]
dim(data)

# Keep only the numeric predictors (the target, CLV, is excluded).
new_data <- subset(data, select = -c(CLV))
new_data <- new_data[sapply(new_data, is.numeric)]

# Standardised (z-score) copy of the numeric predictors. Pearson correlation is
# unaffected by centring and scaling, so the matrix below is computed on the
# unscaled values.
new_scaled <- scale(new_data, center = TRUE, scale = TRUE)

# Correlation matrix of the numeric predictors
corr <- cor(new_data)
print(corr)

# findCorrelation() returns the *column indices* (into `corr`) of predictors it
# suggests removing at the given cutoff, not a matrix, so the matching names
# have to be looked up in colnames(corr); colnames() of the index vector itself
# is always NULL. The result is stored under a descriptive name rather than
# `names`, which would shadow base::names().
highlyCorrelated <- findCorrelation(corr, cutoff = 0.5)
print(highlyCorrelated)
highlyCorrelatedNames <- colnames(corr)[highlyCorrelated]
highlyCorrelatedNames

# Visualise the correlation matrix (lower triangle, ordered by first principal component).
library(corrplot)
corrplot(corr, order = "FPC", method = "circle", type = "lower", tl.cex = 0.7, tl.col = rgb(0, 0, 0))

In [None]:
# Variable transformation: convert every categorical (character) column to a
# factor. The columns are converted one by one, in the order listed.
categorical_columns <- c(
  "State", "Sales.Channel", "Response", "Coverage", "Education",
  "EmploymentStatus", "Gender", "Location.Code", "Marital.Status",
  "Policy.Type", "Policy", "Renew.Offer.Type", "Vehicle.Class", "Vehicle.Size"
)
for (column in categorical_columns) {
  data[[column]] <- as.factor(data[[column]])
}

In [None]:
# Standardise (z-score) the numeric predictors, which are on very different
# scales. The columns are selected by name instead of by position so the cell
# does not depend on how many columns were dropped earlier.
# The scaled matrix is kept in its own object and only its first rows are
# shown; previously the result was neither assigned nor truncated, so the
# full matrix was printed and then discarded. `data` itself is left unscaled:
# the train() calls in this notebook centre and scale via `preProcess`.
numeric_columns <- c(
  "Income", "Monthly.Premium.Auto", "Months.Since.Last.Claim",
  "Month.Since.Policy.Inception", "Open.Complaints", "Policies",
  "Total.Claim.Amount"
)
scaled_numeric <- scale(data[, numeric_columns], center = TRUE, scale = TRUE)
head(scaled_numeric, 10)

# Create dummy (indicator) variables for the factor predictors. fullRank = TRUE
# drops one level per factor to avoid the dummy-variable trap. CLV is the
# response in the formula, so it is not part of the encoded output.
dmy <- dummyVars(CLV ~ ., data = data, fullRank = TRUE)
new <- data.frame(predict(dmy, newdata = data))
head(new, 10)
glimpse(data)

In [None]:
# Assemble the modelling data set: all encoded predictors followed by the
# target CLV as the last column.
# `new` (the dummyVars output) already contains the numeric predictors
# unchanged, so only CLV has to be appended. The earlier version re-attached
# the numeric columns through a negative positional index and then removed the
# resulting duplicates by column name; this yields the same columns in the same
# order without depending on column positions.
new_data <- cbind(new, CLV = data$CLV)

# Sanity check: every column name must be unique.
stopifnot(anyDuplicated(colnames(new_data)) == 0)

dim(new_data)
colnames(new_data)

# NOTE: no train/test split is performed; the models below are fitted on the
# full data set and assessed through caret's resampling.

In [None]:
# Fix the RNG seed so that the resampling done inside train() is repeatable.
set.seed(50000)

# Multiple linear regression of CLV on every predictor in new_data.
# Predictors are centred and scaled before fitting and R-squared is the
# reported metric. No trControl is passed, so caret's default resampling
# settings apply.
model <- train(
  CLV ~ .,
  data = new_data,
  method = "lm",
  preProcess = c("center", "scale"),
  metric = "Rsquared"
)
print(model)
summary(model)



In [None]:
# Check variable importance.
# Importance scores are used here in addition to p-values, since p-values
# alone may be misleading.

# Seed the RNG before the cross-validated fit, as the other training cells do;
# without it the fold assignment (and therefore the resampled metrics) changes
# on every run.
set.seed(50000)

# 10-fold cross-validation. `control` is reused by the later training cell.
control <- trainControl(method = "cv", number = 10)
model <- train(
  CLV ~ .,
  data = new_data,
  method = "lm",
  metric = "Rsquared",
  trControl = control,
  preProcess = c("center", "scale")
)
summary(model)

# Unscaled importance scores for each predictor, and their plot.
imp <- varImp(model, scale = FALSE)
plot(imp)

# TODO: run further checks (multicollinearity, homoscedasticity, normality, MAPE)
# and discard features or factor levels accordingly.

In [None]:
# Keep the predictors judged significant from the model summary (p-values) and
# the variable-importance plot.
# The columns are selected by name. The previous version picked them by
# position (10, 12, ..., 50, 51) and then overwrote all column names; positions
# depend on the order of the dummy columns, and the blanket rename would have
# silently mislabelled columns if that order ever differed.
selected_predictors <- c(
  "Education.High.School.or.Below", "EmploymentStatus.Employed",
  "Marital.Status.Single", "Monthly.Premium.Auto", "Open.Complaints",
  "Policies", "Policy.Corporate.L2", "Policy.Corporate.L3",
  "Renew.Offer.Type.Offer2", "Renew.Offer.Type.Offer3",
  "Renew.Offer.Type.Offer4", "Sales.Channel.Call.Center",
  "Vehicle.Class.Sports.Car", "Vehicle.Class.SUV", "Vehicle.Size.Small"
)

# Fail loudly if any expected column is absent instead of fitting on the wrong data.
missing_predictors <- setdiff(c(selected_predictors, "CLV"), colnames(new_data))
if (length(missing_predictors) > 0) {
  stop("Columns not found in new_data: ", paste(missing_predictors, collapse = ", "))
}

# Selected predictors first, target CLV as the last column.
newData <- new_data[, c(selected_predictors, "CLV")]
dim(newData)
head(newData)

# Refit the multiple regression on the reduced feature set, using the
# cross-validation settings held in `control` (created in the
# variable-importance cell).
set.seed(50000)
model <- train(
  CLV ~ .,
  data = newData,
  method = "lm",
  trControl = control,
  metric = "Rsquared",
  preProcess = c("center", "scale")
)
print(model)

In [None]:
# Summary of the reduced model. The summary is computed once and reused for
# the individual statistics below (AIC/BIC are not computed here).
model_summary <- summary(model)
model_summary

# Coefficient table (estimates, standard errors, t statistics, p-values).
# The full element name is used rather than relying on partial matching of `$coeff`.
model_summary$coefficients

# Goodness of fit. Adjusted R-squared is the figure to focus on, since it
# penalises the number of predictors. (The element was previously misspelled
# as `adj.r.suared`, which silently returned NULL.)
model_summary$r.squared
model_summary$adj.r.squared

In [None]:
# Model diagnostics and scoring on the data the model was fitted to.

# Residuals of the fitted model and its predictions for newData
residuals <- resid(model)
predValues <- predict(model, newdata = newData)

# Residuals against the observed CLV, with a horizontal reference line at zero
plot(newData$CLV, residuals)
abline(h = 0)

# Predicted against observed CLV
plot(newData$CLV, predValues)

In [None]:
# Score the model on newData and compute caret's default regression metrics.
pred <- predict(model, newdata = newData)

# defaultSummary() expects a data frame with columns named `obs` (observed
# values) and `pred` (predictions). The previous call,
# data.frame(newData[,16,pred), had an unbalanced bracket and did not parse,
# and it would not have produced these column names.
modValues <- data.frame(obs = newData$CLV, pred = pred)
defaultSummary(modValues)
#confusionMatrix(table(data$CLV,pred))