In [4]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# Load the marketing-campaign data set; sep = "\t" because the file is
# tab-delimited despite its .csv extension.
customer.df <- read.csv("../input/customer-personality-analysis/marketing_campaign.csv", sep = "\t")

In [6]:
# Per-column summary statistics (including NA counts) for a first look at the data.
summary(customer.df)

In [7]:
# Customer age: number of years between the birth year and the reference year 2021.
customer.df[["Age"]] <- 2021 - customer.df[["Year_Birth"]]

In [8]:
# Parse the enrolment date. NOTE(review): the Kaggle file stores Dt_Customer as
# day-month-year text (e.g. "04-09-2012") -- confirm against the raw data.
# A bare as.Date() tries "%Y-%m-%d" first and would read such a value as a date
# in year 4, so the day-month-year layout is tried first; ISO-formatted input
# is still accepted through the second format.
customer.df$Dt_CustomerCovert1 <- as.Date(customer.df$Dt_Customer,
                                          tryFormats = c("%d-%m-%Y", "%Y-%m-%d"))
# Time elapsed between enrolment and the fixed reference date 2021-07-31.
customer.df$Dt_CustomerCovert2 <- as.Date("2021-07-31") - customer.df$Dt_CustomerCovert1
# Same interval as a plain number of days, usable as a numeric predictor.
customer.df$NumberofDayEnrolled <- as.numeric(customer.df$Dt_CustomerCovert2, units = "days")

In [9]:
# Remove columns that are not used in the analysis. Each entry is an inclusive
# range given as (first column name, last column name); the positions noted are
# where those columns sit in the data frame at this point of the notebook.
##  1 - ID                 :  2 - Year_Birth
##  8 - Dt_Customer        :  9 - Recency
## 16 - NumDealsPurchases  : 25 - AcceptedCmp2
## 27 - Z_CostContact      : 29 - Response
## 31 - Dt_CustomerCovert1 : 32 - Dt_CustomerCovert2
drop.ranges <- list(c("ID", "Year_Birth"),
                    c("Dt_Customer", "Recency"),
                    c("NumDealsPurchases", "AcceptedCmp2"),
                    c("Z_CostContact", "Response"),
                    c("Dt_CustomerCovert1", "Dt_CustomerCovert2"))

# Resolve the ranges by name instead of hard-coded positions, so re-running the
# cell (or a changed column layout) fails loudly instead of silently dropping
# whatever columns happen to occupy those positions.
drop.idx <- unlist(lapply(drop.ranges, function(rng) {
  pos <- match(rng, names(customer.df))
  if (anyNA(pos)) {
    stop("Expected column(s) not found in customer.df: ",
         paste(rng[is.na(pos)], collapse = ", "),
         call. = FALSE)
  }
  pos[1]:pos[2]
}))
customer.df <- customer.df[-drop.idx]

In [10]:
# Large scipen penalty so numbers are printed in fixed rather than scientific notation.
options(scipen = 200)
# Distribution of Income in 20 suggested bins, with the bin counts drawn above the bars.
hist(customer.df[["Income"]],
     breaks = 20,
     labels = TRUE,
     col = "darkblue",
     main = "Histogram of Income",
     xlab = "Income")

In [11]:
# Keep fixed (non-scientific) notation for printed numbers.
options(scipen = 100)
# Distribution of MntMeatProducts in 20 suggested bins, with counts above the bars.
hist(customer.df[["MntMeatProducts"]],
     breaks = 20,
     labels = TRUE,
     col = "lightblue",
     main = "Histogram of MntMeatProducts",
     xlab = "MntMeatProducts")

In [12]:
# Keep fixed (non-scientific) notation for printed numbers.
options(scipen = 100)
# Distribution of MntSweetProducts in 20 suggested bins, with counts above the bars.
hist(customer.df[["MntSweetProducts"]],
     breaks = 20,
     labels = TRUE,
     col = "orange",
     main = "Histogram of MntSweetProducts",
     xlab = "MntSweetProducts")

In [13]:
# Keep fixed (non-scientific) notation for printed numbers.
options(scipen = 100)
# Distribution of MntGoldProds in 20 suggested bins, with counts above the bars.
hist(customer.df[["MntGoldProds"]],
     breaks = 20,
     labels = TRUE,
     col = "lightgoldenrod1",
     main = "Histogram of MntGoldProds",
     xlab = "MntGoldProds")

In [14]:
# Flag rows that exceed any of the outlier cut-offs (presumably read off the
# histograms of these four variables -- confirm).
is.outlier <- customer.df$Income > 150000 |
  customer.df$MntMeatProducts > 1000 |
  customer.df$MntSweetProducts > 200 |
  customer.df$MntGoldProds > 260

# A missing Income makes the flag NA unless another cut-off is exceeded.
# Indexing rows with NA would replace those customers by all-NA rows, so keep
# every row that is not positively flagged; rows with missing Income stay
# intact here and are left to the dedicated NA-handling step.
customer.df <- customer.df[!(is.outlier %in% TRUE), ]

In [15]:
# Keep only the customers whose Income is known.
customer.df <- customer.df[!is.na(customer.df[["Income"]]), ]

In [16]:
# Treat the categorical variables as factors.
factor.cols <- c("Education", "Marital_Status", "Complain")
customer.df[factor.cols] <- lapply(customer.df[factor.cols], as.factor)

In [17]:
# Frequency table of marital status, shown from most to least common.
MarritalStatfreq <- data.frame(table(customer.df$Marital_Status))
freq.rank <- order(MarritalStatfreq$Freq, decreasing = TRUE)
MarritalStatfreq[freq.rank, ]

In [18]:
# Show only the statuses that account for more than 1% of all customers.
status.share <- MarritalStatfreq$Freq / nrow(customer.df)
MarritalStatfreq[status.share > .01, ]

In [19]:
# Keep the five listed marital statuses and pool every other value into "Other".
main.statuses <- c("Divorced", "Married", "Single","Together","Widow")
status.chr <- as.character(customer.df$Marital_Status)
status.chr[!(status.chr %in% main.statuses)] <- "Other"
customer.df$Marital_Status <- as.factor(status.chr)

# Re-tabulate to confirm the new level counts, most common first.
MarritalStatfreq <- data.frame(table(customer.df$Marital_Status))
freq.rank <- order(MarritalStatfreq$Freq, decreasing = TRUE)
MarritalStatfreq[freq.rank, ]

In [20]:
library(fastDummies)
# Replace the three factors by 0/1 indicator columns. The first level of each
# factor is omitted (it becomes the reference category) and the original
# factor columns are removed.
customer.df <- dummy_cols(
  customer.df,
  select_columns = c("Education", "Marital_Status","Complain"),
  remove_selected_columns = TRUE,
  remove_first_dummy = TRUE
)

In [21]:
# Drop columns 5 to 9 (presumably the other Mnt* spending columns, leaving
# MntWines as the only spending variable -- confirm against names(customer.df)).
customersubset1.df <- customer.df[, -(5:9)]

# Reproducible 70/30 split into training and validation rows, drawn by row name.
set.seed(14)
all.rows   <- rownames(customersubset1.df)
train.rows <- sample(all.rows, nrow(customersubset1.df) * 0.7)
valid.rows <- setdiff(all.rows, train.rows)
train.data <- customersubset1.df[train.rows, ]
valid.data <- customersubset1.df[valid.rows, ]

In [22]:
# Print coefficients in fixed notation.
options(scipen = 999)
# Linear regression of MntWines on every other column of the training data.
customer.full.lm <- lm(MntWines ~ ., data = train.data)
sum.full <- summary(customer.full.lm)
sum.full

In [23]:
library(forecast)
# Score the validation set with the full model and compute the residuals.
valid.full.lm.pred <- predict(customer.full.lm, valid.data)
valid.resid <- valid.data$MntWines - valid.full.lm.pred

# First ten predictions next to the actual values, printed with one significant digit.
options(scipen = 999, digits = 1)
first.ten <- 1:10
data.frame(Predicted = valid.full.lm.pred[first.ten],
           Actual    = valid.data$MntWines[first.ten],
           Residual  = valid.resid[first.ten])

# Validation error measures, printed with six significant digits.
options(digits = 6)
accuracy(valid.full.lm.pred, valid.data$MntWines)

# Training-fit statistics of the full model.
sum.full$r.squared
sum.full$adj.r.squared
AIC(customer.full.lm)
BIC(customer.full.lm)

In [24]:
# Reduced model: all predictors except Age, the marital-status dummies and
# the complaint dummy.
options(scipen = 999)
customer.reduced.lm <- lm(
  MntWines ~ . -
    Age -
    Marital_Status_Married - Marital_Status_Other - Marital_Status_Single -
    Marital_Status_Together - Marital_Status_Widow -
    Complain_1,
  data = train.data
)
sum.reduced <- summary(customer.reduced.lm)
sum.reduced

In [25]:
# NOTE(review): this cell repeats the full-model validation that already
# appears earlier in the notebook and recomputes the same objects.
library(forecast)
# Score the validation set with the full model and compute the residuals.
valid.full.lm.pred <- predict(customer.full.lm, valid.data)
valid.resid <- valid.data$MntWines - valid.full.lm.pred

# First ten predictions next to the actual values, printed with one significant digit.
options(scipen = 999, digits = 1)
first.ten <- 1:10
data.frame(Predicted = valid.full.lm.pred[first.ten],
           Actual    = valid.data$MntWines[first.ten],
           Residual  = valid.resid[first.ten])

# Validation error measures, printed with six significant digits.
options(digits = 6)
accuracy(valid.full.lm.pred, valid.data$MntWines)

# Training-fit statistics of the full model.
sum.full$r.squared
sum.full$adj.r.squared
AIC(customer.full.lm)
BIC(customer.full.lm)

In [26]:
# NOTE(review): this cell repeats the reduced-model fit that already appears
# earlier in the notebook.
# Reduced model: all predictors except Age, the marital-status dummies and
# the complaint dummy.
options(scipen = 999)
customer.reduced.lm <- lm(
  MntWines ~ . -
    Age -
    Marital_Status_Married - Marital_Status_Other - Marital_Status_Single -
    Marital_Status_Together - Marital_Status_Widow -
    Complain_1,
  data = train.data
)
sum.reduced <- summary(customer.reduced.lm)
sum.reduced

In [27]:
library(forecast)
# Score the validation set with the reduced model and compute the residuals.
valid.reduced.lm.pred <- predict(customer.reduced.lm, valid.data)
valid.resid <- valid.data$MntWines - valid.reduced.lm.pred

# First ten predictions next to the actual values, printed with one significant digit.
options(scipen = 999, digits = 1)
first.ten <- 1:10
data.frame(Predicted = valid.reduced.lm.pred[first.ten],
           Actual    = valid.data$MntWines[first.ten],
           Residual  = valid.resid[first.ten])

# Validation error measures, printed with six significant digits.
options(digits = 6)
accuracy(valid.reduced.lm.pred, valid.data$MntWines)

# Training-fit statistics of the reduced model.
sum.reduced$r.squared
sum.reduced$adj.r.squared
AIC(customer.reduced.lm)
BIC(customer.reduced.lm)

In [28]:
# Largest and smallest models bounding the stepwise searches below:
# all predictors versus intercept only.
customer.lm      <- lm( MntWines ~ . , data = train.data)
customer.lm.null <- lm( MntWines ~ 1 , data = train.data)

In [29]:
# Forward selection: start from the intercept-only model and add one term at a
# time while AIC improves. step() reads the search range from the `lower` and
# `upper` components of `scope`, so both are named explicitly; an unnamed first
# element is not picked up as the lower bound.
customer.lm.fwd <- step(customer.lm.null,
                        scope = list(lower = customer.lm.null, upper = customer.lm),
                        direction = "forward")
sum.forward <- summary(customer.lm.fwd)
sum.forward

In [30]:
library(forecast)
options(digits = 6)
# Validation performance of the forward-selected model.
valid.fwd.pred <- predict(customer.lm.fwd, valid.data)
accuracy(valid.fwd.pred, valid.data$MntWines)  # performance of variable selection

# Training-fit statistics of the forward-selected model.
sum.forward$r.squared
sum.forward$adj.r.squared
AIC(customer.lm.fwd)
BIC(customer.lm.fwd)

In [31]:
# Backward elimination: start from the model with all predictors and drop one
# term at a time while AIC improves.
customer.lm.back <- step(customer.lm, direction = "backward")
sum.backward <- summary(customer.lm.back)
sum.backward

In [32]:
library(forecast)
options(digits = 6)
# Validation performance of the backward-eliminated model.
valid.back.pred <- predict(customer.lm.back, valid.data)
accuracy(valid.back.pred, valid.data$MntWines)  # performance of variable selection

# Training-fit statistics of the backward-eliminated model.
sum.backward$r.squared
sum.backward$adj.r.squared
AIC(customer.lm.back)
BIC(customer.lm.back)

In [33]:
# Stepwise selection in both directions, starting from the intercept-only
# model. step() reads the search range from the `lower` and `upper` components
# of `scope`, so both are named explicitly; an unnamed first element is not
# picked up as the lower bound.
customer.lm.step <- step(customer.lm.null,
                         scope = list(lower = customer.lm.null, upper = customer.lm),
                         direction = "both")
sum.stepwise <- summary(customer.lm.step)
sum.stepwise

In [34]:
library(forecast)
options(digits = 6)
# Validation performance of the stepwise-selected model.
valid.step.pred <- predict(customer.lm.step, valid.data)
accuracy(valid.step.pred, valid.data$MntWines)  # performance of variable selection

# Training-fit statistics of the stepwise-selected model.
sum.stepwise$r.squared
sum.stepwise$adj.r.squared
AIC(customer.lm.step)
BIC(customer.lm.step)

In [35]:
# Working copies of the two partitions; their predictor columns are rescaled
# in the next step while train.data / valid.data stay untouched.
customer.train.norm <- train.data
customer.valid.norm <- valid.data

In [36]:
# Min-max scale every column except column 4 (presumably MntWines, the
# outcome -- confirm). Minimum and range always come from the training
# partition, so the validation data is scaled with training statistics only.
cols <- colnames(train.data[, -4])
for (col.name in cols) {
  train.lo   <- min(train.data[[col.name]])
  train.span <- as.numeric(max(train.data[[col.name]]) - train.lo)
  customer.valid.norm[[col.name]] <-
    as.numeric(customer.valid.norm[[col.name]] - train.lo) / train.span
  customer.train.norm[[col.name]] <-
    as.numeric(customer.train.norm[[col.name]] - train.lo) / train.span
}
summary(customer.train.norm)
summary(customer.valid.norm)

In [37]:
library(FNN)
# k-nearest-neighbour regression with a single neighbour: scaled predictors
# (everything except column 4) as inputs, unscaled MntWines as the response.
knn.train.x <- customer.train.norm[, -4]
knn.valid.x <- customer.valid.norm[, -4]
customer.knn <- knn.reg(train = knn.train.x,
                        test  = knn.valid.x,
                        y     = customer.train.norm$MntWines,
                        k     = 1)

# Predictions next to the actual validation values; first ten rows shown.
customer.knn.results <- data.frame(cbind(pred   = customer.knn$pred,
                                         actual = customer.valid.norm$MntWines))
head(customer.knn.results, 10)

In [38]:
library(caret)
# Root-mean-squared error of the k = 1 predictions on the validation set.
RMSE(customer.knn$pred, customer.valid.norm$MntWines)

In [39]:
# Mean error: average of (actual - predicted); a positive value means the
# model under-predicts on average.
meanerror <- function(actual, pred) {
  mean(actual - pred)
}
meanerror(actual = customer.valid.norm$MntWines, pred = customer.knn$pred)

In [40]:
# Validation RMSE of k-NN regression for k = 1, ..., 30.
RMSE.df <- data.frame(k = seq(1, 30, 1), RMSE.k = rep(0, 30))
for (i in 1:30) {
  # Same predictors and response as the single-neighbour fit, with k = i neighbours.
  knn.reg.pred <- knn.reg(train = customer.train.norm[, -4],
                          test  = customer.valid.norm[, -4],
                          y     = customer.train.norm$MntWines,
                          k     = i)
  RMSE.df[i, 2] <- RMSE(customer.valid.norm$MntWines, knn.reg.pred$pred)
}
RMSE.df

In [41]:
library(FNN)
# k-NN regression with k = 7 (presumably the k with the lowest validation RMSE
# in RMSE.df -- confirm against that table).
best.train.x <- customer.train.norm[, -4]
best.valid.x <- customer.valid.norm[, -4]
customer.knn.best <- knn.reg(train = best.train.x,
                             test  = best.valid.x,
                             y     = customer.train.norm$MntWines,
                             k     = 7)

# Predictions next to the actual validation values; first ten rows shown.
customer.knn.best.results <- data.frame(cbind(pred   = customer.knn.best$pred,
                                              actual = customer.valid.norm$MntWines))
head(customer.knn.best.results, 10)

In [42]:
library(caret)
# Root-mean-squared error of the k = 7 predictions on the validation set.
RMSE(customer.knn.best$pred, customer.valid.norm$MntWines)

In [43]:
# Mean error: average of (actual - predicted); a positive value means the
# model under-predicts on average.
meanerror <- function(actual, pred) {
  mean(actual - pred)
}
meanerror(actual = customer.valid.norm$MntWines, pred = customer.knn.best$pred)

# Regression Trees and Random Forests

In [44]:
library(rpart)
library(rpart.plot)

# Regression tree ("anova" method) for MntWines on all other training columns,
# grown with rpart's default control settings.
customer.default.tree <- rpart(MntWines ~ .,
                               method = "anova",
                               data = train.data)

In [45]:
options(scipen = 999)
# Colour terminal nodes green and split nodes orange.
default.is.leaf <- customer.default.tree$frame$var == "<leaf>"
prp(customer.default.tree,   # tree model
    type = 1,                # label all the nodes
    extra = 1,               # show # of observations in each node
    varlen = -10,            # truncate variable names to 10 characters
    digits = -3,
    box.col = ifelse(default.is.leaf, 'green', 'orange'))

In [46]:
# Validation predictions of the default tree and their root-mean-squared error.
default.tree.pred <- predict(customer.default.tree, valid.data)
RMSE(default.tree.pred, valid.data$MntWines)

In [47]:
# Mean error: average of (actual - predicted); a positive value means the
# model under-predicts on average.
meanerror <- function(actual, pred) {
  mean(actual - pred)
}
meanerror(actual = valid.data$MntWines, pred = default.tree.pred)

In [48]:
# Fully grown regression tree: cp = 0 removes the complexity penalty and
# minsplit = 2 allows a split attempt on any node with at least two observations.
customer.full.tree <- rpart(MntWines ~ .,        # quantitative outcome ~ predictors
                            method = "anova",    # regression tree
                            data = train.data,   # training data
                            minsplit = 2,        # minimum observations required to attempt split
                            cp = 0)              # complexity parameter

In [None]:
# Plot the fully grown tree; terminal nodes green, split nodes orange.
full.is.leaf <- customer.full.tree$frame$var == "<leaf>"
prp(customer.full.tree,   # tree model
    type = 1,             # label all the nodes
    extra = 1,            # show # of observations in each node
    varlen = -10,         # truncate variable names to 10 characters
    box.col = ifelse(full.is.leaf, 'green', 'orange'))

In [None]:
# Validation predictions of the fully grown tree and their root-mean-squared error.
full.tree.pred <- predict(customer.full.tree, valid.data)
RMSE(full.tree.pred, valid.data$MntWines)

In [None]:
# Mean error: average of (actual - predicted); a positive value means the
# model under-predicts on average.
meanerror <- function(actual, pred) {
  mean(actual - pred)
}
meanerror(actual = valid.data$MntWines, pred = full.tree.pred)

In [None]:
# Fully grown tree with 10-fold cross-validation; the cross-validated error of
# every subtree is recorded in the cp table. Fold assignment is random, so the
# table depends on the current state of the random number generator.
cv.customer.full.tree <- rpart(MntWines ~ .,
                               data = train.data,
                               xval = 10,            # 10-fold cross validation
                               minsplit = 2,
                               cp = 0)
options(scipen = 999, digits = 8)
printcp(cv.customer.full.tree)

In [None]:
# Lowest cross-validated error in the cp table (column 4, "xerror").
minerror <- min(cv.customer.full.tree$cptable[, "xerror"])
minerror

In [None]:
# Standard error (cp table column 5) of the row with the lowest cross-validated
# error (column 4). which.min() always selects exactly one row -- the first,
# i.e. the smallest tree, when the minimum is tied -- so the result is a single
# number even with ties; matching rows with `==` would return one value per
# tied row.
cv.table <- cv.customer.full.tree$cptable
minerrorstd <- cv.table[which.min(cv.table[, 4]), 5]
minerrorstd

In [None]:
# Show the complete cp-table row(s) whose cross-validated error equals the minimum.
cv.customer.full.tree$cptable[cv.customer.full.tree$cptable[,4] == minerror, ]

In [None]:
# Candidate trees: cp-table rows whose cross-validated error (column 4) is below
# the minimum error plus its standard error. drop = FALSE keeps the result a
# matrix even when a single row qualifies, so it can still be indexed by
# [row, column] afterwards.
simplertrees <- cv.customer.full.tree$cptable[
  cv.customer.full.tree$cptable[, 4] < minerror + minerrorstd, , drop = FALSE]
simplertrees

In [None]:
# Complexity parameter taken from the first row of the candidate table, then
# used to prune the cross-validated tree.
bestcp <- simplertrees[1, 1]
customer.pruned <- prune(cv.customer.full.tree, cp = bestcp)

# Plot the pruned tree; terminal nodes green, split nodes orange.
pruned.is.leaf <- customer.pruned$frame$var == "<leaf>"
prp(customer.pruned,
    type = 1,
    extra = 1,
    varlen = -10,
    digits = -3,
    box.col = ifelse(pruned.is.leaf, 'green', 'orange'))

# Number of terminal nodes in the pruned tree.
sum(pruned.is.leaf)

In [None]:
# Validation predictions of the pruned tree and their root-mean-squared error.
best.tree.pred <- predict(customer.pruned, valid.data)
RMSE(best.tree.pred, valid.data$MntWines)

In [None]:
# Mean error: average of (actual - predicted); a positive value means the
# model under-predicts on average.
meanerror <- function(actual, pred) {
  mean(actual - pred)
}
meanerror(actual = valid.data$MntWines, pred = best.tree.pred)

In [None]:
library(randomForest)
# Random forests draw bootstrap samples and random predictor subsets, so the
# generator is seeded (same seed as the other stochastic fits in this
# notebook) to give the same forest every time this cell is run.
set.seed(14)
# Regression forest for MntWines: 500 trees, 4 candidate predictors per split,
# terminal nodes of at least 5 observations; importance = TRUE additionally
# computes the permutation-based variable importance.
customer.rf <- randomForest(MntWines ~ . ,
                            data = train.data,
                            ntree = 500,
                            mtry = 4,
                            nodesize = 5,
                            importance = TRUE)

In [None]:
# Variable-importance plot; type = 1 selects the permutation-based measure
# (mean decrease in accuracy), which requires importance = TRUE at fit time.
varImpPlot(customer.rf, type = 1)

In [None]:
# Validation predictions of the random forest and their root-mean-squared error.
customer.rf.pred <- predict(customer.rf, valid.data)
RMSE(customer.rf.pred, valid.data$MntWines)

In [None]:
# Mean error: average of (actual - predicted); a positive value means the
# model under-predicts on average.
meanerror <- function(actual, pred) {
  mean(actual - pred)
}
meanerror(actual = valid.data$MntWines, pred = customer.rf.pred)

# Neural Nets

In [None]:
# Copies of both partitions for the neural nets. Here every column -- the
# outcome MntWines included -- is min-max scaled, always with the minimum and
# range of the training partition.
customer.train.norm.nn <- train.data
customer.valid.norm.nn <- valid.data

cols <- colnames(train.data)
for (col.name in cols) {
  train.lo   <- min(train.data[[col.name]])
  train.span <- as.numeric(max(train.data[[col.name]]) - train.lo)
  customer.valid.norm.nn[[col.name]] <-
    as.numeric(customer.valid.norm.nn[[col.name]] - train.lo) / train.span
  customer.train.norm.nn[[col.name]] <-
    as.numeric(customer.train.norm.nn[[col.name]] - train.lo) / train.span
}
summary(customer.train.norm.nn)
summary(customer.valid.norm.nn)

In [None]:
library(neuralnet)
# Network with one hidden layer of 3 nodes, fitted on the scaled training data.
# linear.output = FALSE applies the activation function to the output node too.
set.seed(14)
customer.nn.3 <- neuralnet(MntWines ~ .,
                           hidden = 3,
                           linear.output = FALSE,
                           data = customer.train.norm.nn)
plot(customer.nn.3, rep = "best")

# Predictions for the validation set, still on the scaled outcome.
predict.nn.3 <- predict(customer.nn.3, customer.valid.norm.nn)
head(predict.nn.3)

In [None]:
# Undo the min-max scaling of the predictions using the training range of
# MntWines, then compare with the actual validation values.
minWines <- min(train.data$MntWines)
maxWines <- max(train.data$MntWines)
wines.span <- maxWines - minWines
actpred <- data.frame(actual    = valid.data$MntWines,
                      predicted = minWines + predict.nn.3 * wines.span)
head(actpred)
RMSE(actpred$predicted, actpred$actual)

In [None]:
# Network with two hidden layers of 3 nodes each, fitted on the scaled training data.
# linear.output = FALSE applies the activation function to the output node too.
set.seed(14)
customer.nn.3.3 <- neuralnet(MntWines ~ .,
                             hidden = c(3,3),
                             linear.output = FALSE,
                             data = customer.train.norm.nn)
plot(customer.nn.3.3, rep = "best")

# Predictions for the validation set, still on the scaled outcome.
predict.nn.3.3 <- predict(customer.nn.3.3, customer.valid.norm.nn)
head(predict.nn.3.3)

In [None]:
# Undo the min-max scaling of the predictions using the training range of
# MntWines, then compare with the actual validation values.
minWines <- min(train.data$MntWines)
maxWines <- max(train.data$MntWines)
wines.span <- maxWines - minWines
actpred <- data.frame(actual    = valid.data$MntWines,
                      predicted = minWines + predict.nn.3.3 * wines.span)
head(actpred)
RMSE(actpred$predicted, actpred$actual)

In [None]:
# Network with one hidden layer of 10 nodes, fitted on the scaled training data.
# linear.output = FALSE applies the activation function to the output node too.
set.seed(14)
customer.nn.10 <- neuralnet(MntWines ~ .,
                            hidden = 10,
                            linear.output = FALSE,
                            data = customer.train.norm.nn)
plot(customer.nn.10, rep = "best")

# Predictions for the validation set, still on the scaled outcome.
predict.nn.10 <- predict(customer.nn.10, customer.valid.norm.nn)
head(predict.nn.10)

In [None]:
# Undo the min-max scaling of the predictions using the training range of
# MntWines, then compare with the actual validation values.
minWines <- min(train.data$MntWines)
maxWines <- max(train.data$MntWines)
wines.span <- maxWines - minWines
actpred <- data.frame(actual    = valid.data$MntWines,
                      predicted = minWines + predict.nn.10 * wines.span)
head(actpred)
RMSE(actpred$predicted, actpred$actual)

In [None]:
# Network with a single hidden node, fitted on the scaled training data.
# linear.output = FALSE applies the activation function to the output node too.
set.seed(14)
customer.nn.1 <- neuralnet(MntWines ~ .,
                           hidden = 1,
                           linear.output = FALSE,
                           data = customer.train.norm.nn)
plot(customer.nn.1, rep = "best")

# Predictions for the validation set, still on the scaled outcome.
predict.nn.1 <- predict(customer.nn.1, customer.valid.norm.nn)
head(predict.nn.1)

In [None]:
# Undo the min-max scaling of the predictions using the training range of
# MntWines, then compare with the actual validation values.
minWines <- min(train.data$MntWines)
maxWines <- max(train.data$MntWines)
wines.span <- maxWines - minWines
actpred <- data.frame(actual    = valid.data$MntWines,
                      predicted = minWines + predict.nn.1 * wines.span)
head(actpred)
RMSE(actpred$predicted, actpred$actual)