In [52]:
library(caret)
library(Amelia)
library(cattonum)
library(tidyverse)
library(leaps)
library(MASS)
library(pROC)
library(e1071)
library(rpart.plot)

In [2]:
# Load the raw churn data sets; empty fields are read as NA.
test_file <- read.csv(file = "orange_churn_test.csv", na.strings = "")
data_train <- read.csv(file = "orange_churn_train.csv", na.strings = "")

In [3]:
# Plot a map of missing vs. observed values
# missmap(data_train, main = "Missing values vs observed")

## Processing Pipeline

In [4]:
#' Clean and numerically encode a raw data frame.
#'
#' Steps, in order:
#'   1. drop columns whose share of missing values reaches `max_na_fraction`;
#'   2. replace missing numeric values by the column mean;
#'   3. replace missing values in every other column by the label "Missing";
#'   4. frequency-encode categorical columns with `catto_freq()`;
#'   5. drop zero-variance columns (the first column is always kept).
#'
#' @param dataframe A data frame.
#' @param max_na_fraction Columns with a fraction of NA values greater than
#'   or equal to this threshold are removed. The default (0.9) reproduces the
#'   previous hard-coded `< 9000` rule on a 10,000-row data set.
#' @return The processed data frame.
processing_pipeline <- function(dataframe, max_na_fraction = 0.9) {
    stopifnot(
        is.data.frame(dataframe),
        is.numeric(max_na_fraction),
        length(max_na_fraction) == 1
    )

    # Remove mostly-empty columns. A fraction is used instead of an absolute
    # count so that the rule holds whatever the number of rows.
    na_fraction <- colMeans(is.na(dataframe))
    na_fraction[is.nan(na_fraction)] <- 0 # zero-row input: nothing is missing
    p_data <- dataframe[, na_fraction < max_na_fraction, drop = FALSE]

    # Impute column by column. is.numeric() is used rather than typeof() so
    # that factors (stored as integers) are not mistaken for numbers.
    for (col in names(p_data)) {
        values <- p_data[[col]]
        if (!anyNA(values)) {
            next
        }
        if (is.numeric(values)) {
            values[is.na(values)] <- mean(values, na.rm = TRUE)
        } else if (is.factor(values)) {
            # "Missing" is not an existing level, so go through character
            values <- as.character(values)
            values[is.na(values)] <- "Missing"
            values <- factor(values)
        } else {
            values[is.na(values)] <- "Missing"
        }
        p_data[[col]] <- values
    }

    # Frequency encoding of the categorical columns
    p_data <- catto_freq(p_data, verbose = TRUE)

    # Remove columns with no variance; the first column is kept regardless.
    # A variance that cannot be computed (NA) does not remove the column.
    variances <- vapply(
        p_data[-1],
        function(x) stats::var(x, na.rm = TRUE),
        numeric(1)
    )
    keep <- c(TRUE, is.na(variances) | variances != 0)
    p_data <- p_data[keep]

    return(p_data)
}

In [5]:
# Run the cleaning pipeline on the training set, then move the target out of
# the predictor frame into a separate yes/no factor.
# NOTE(review): churn goes through processing_pipeline() together with the
# predictors; this assumes it is a complete 0/1 numeric column - confirm.
p_data_train <- processing_pipeline(data_train)
labels <- factor(ifelse(p_data_train$churn == 1, "yes", "no"))
p_data_train$churn <- NULL

## Removing highly correlated features

In [6]:
# Correlation matrix of the processed predictors, then the column indices
# that findCorrelation() suggests removing at a 0.9 cutoff.
# Note: remove_cor() below uses its own hard-coded index list, not this `hc`.
cor <- stats::cor(p_data_train)
hc <- caret::findCorrelation(cor, cutoff = 0.9)
# paste(hc,collapse=", ")


In [7]:
#' Drop highly correlated columns from a data set.
#'
#' @param dataset A data frame (or matrix) of predictors.
#' @param hc Integer positions of the columns to remove. The default is the
#'   index list previously hard-coded in this function; pass the result of
#'   `findCorrelation()` to recompute it for another data set.
#' @return `dataset` without the columns listed in `hc`. The result keeps its
#'   two-dimensional shape even when a single column remains.
remove_cor <- function(dataset,
                       hc = c(
                           2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 15, 16, 17, 18,
                           20, 21, 24, 25, 27, 28, 30, 32, 33, 34, 35, 36, 37,
                           40, 41, 42, 43, 45, 46, 47, 49, 50, 51, 52, 53, 54,
                           57, 58, 59, 60, 61, 62, 66, 68, 70, 71, 72, 74, 75,
                           76, 77, 78, 80, 82, 83, 85, 86, 87, 89, 90, 91, 93,
                           94, 96, 97, 98, 100, 101, 104, 105, 106, 107, 109,
                           110, 111, 112, 114, 117, 119, 120, 121, 123, 125,
                           126, 127, 129, 131, 134, 135, 137, 139, 140, 141,
                           142, 143, 144, 146, 147, 148, 150, 151, 153, 155,
                           156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
                           166, 168, 169, 170, 171, 172, 173, 182, 184, 185,
                           191, 196, 201, 207
                       )) {
    # Select the columns to keep instead of negating `hc`: `x[, -integer(0)]`
    # would return zero columns when there is nothing to remove.
    keep <- setdiff(seq_len(ncol(dataset)), hc)
    reduced_data_train <- dataset[, keep, drop = FALSE]
    return(reduced_data_train)
}


In [8]:
# Drop the highly correlated columns from the processed training predictors.
reduced_data_train <- remove_cor(p_data_train)


## Models

In [33]:
# Re-attach the target to the reduced predictor frame.
reduced_data_train$churn <- labels

# Seed the RNG so the random 50/50 partition (and therefore every result
# below) can be reproduced from one run to the next.
set.seed(3333)
splitIndex <- createDataPartition(
    reduced_data_train$churn,
    p = 0.5,
    list = FALSE,
    times = 1
)
trainDF <- reduced_data_train[splitIndex, ]
testDF <- reduced_data_train[-splitIndex, ]

# Column names used by the model calls that take x / y instead of a formula.
outcomeName <- "churn"
predictorsNames <- names(trainDF)[names(trainDF) != outcomeName]


### 1. SVM

In [34]:
# Linear-kernel C-classification SVM using every column of trainDF except
# churn as a predictor. Tunable options are listed at
# https://www.rdocumentation.org/packages/e1071/versions/1.7-4/topics/svm
SVMclassifier <- svm(
    churn ~ .,
    data = trainDF,
    type = "C-classification",
    kernel = "linear"
)

In [35]:
# SVM class predictions on the held-out half and on the training half.
pred <- predict(SVMclassifier, newdata = testDF)
train_pred <- predict(SVMclassifier, newdata = trainDF)

In [36]:
# Confusion tables: rows are the observed churn class, columns the prediction.
cm <- table(testDF$churn, pred)
cm2 <- table(trainDF$churn, train_pred)

In [55]:
# Class counts of the target in the training split.
table(trainDF$churn)


  no  yes 
4632  368 

### 2. Decision Trees

In [40]:
# CART tuned over 10 candidate cp values with 10-fold cross-validation
# repeated 3 times; splits use the information criterion.
# NOTE(review): no metric is given, so caret selects on Accuracy; with this
# class imbalance the selected tree has a Kappa close to 0.
objControl <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 3
)
set.seed(3333)
dtree_fit <- train(
    churn ~ .,
    data = trainDF,
    method = "rpart",
    parms = list(split = "information"),
    trControl = objControl,
    tuneLength = 10
)

In [46]:
# Print the resampling results and the selected cp value.
dtree_fit

CART 

5000 samples
  78 predictor
   2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times) 
Summary of sample sizes: 4501, 4500, 4500, 4500, 4500, 4500, ... 
Resampling results across tuning parameters:

  cp            Accuracy   Kappa     
  0.0000000000  0.8956008  0.08256762
  0.0007548309  0.9004007  0.08894846
  0.0015096618  0.9059339  0.09053515
  0.0022644928  0.9134666  0.09813855
  0.0030193237  0.9174686  0.08961233
  0.0037741546  0.9204676  0.07570664
  0.0045289855  0.9222004  0.06439860
  0.0052838164  0.9251999  0.06075149
  0.0060386473  0.9252001  0.05066131
  0.0067934783  0.9254002  0.02466325

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.006793478.

In [56]:
# Decision-tree class predictions on the held-out half and the training half
# (overwrites the SVM predictions stored under the same names).
pred <- predict(dtree_fit, newdata = testDF)
train_pred <- predict(dtree_fit, newdata = trainDF)

In [61]:
# Confusion tables for the decision tree: observed class in rows, predicted
# class in columns.
cm <- table(testDF$churn, pred)
cm2 <- table(trainDF$churn, train_pred)

### -. GBM

In [10]:
# Gradient boosting tuned by 3-fold CV on the ROC metric; class probabilities
# are required by twoClassSummary. Predictors are centred and scaled first.
objControl <- trainControl(
    method = "cv",
    number = 3,
    returnResamp = "none",
    summaryFunction = twoClassSummary,
    classProbs = TRUE
)
gbm <- train(
    x = trainDF[, predictorsNames],
    y = trainDF[, outcomeName],
    method = "gbm",
    trControl = objControl,
    metric = "ROC",
    preProcess = c("center", "scale")
)

Iter   TrainDeviance   ValidDeviance   StepSize   Improve
     1        0.5243             nan     0.1000    0.0004
     2        0.5225             nan     0.1000    0.0001
     3        0.5197             nan     0.1000    0.0001
     4        0.5186             nan     0.1000    0.0002
     5        0.5156             nan     0.1000    0.0013
     6        0.5150             nan     0.1000   -0.0000
     7        0.5127             nan     0.1000    0.0009
     8        0.5107             nan     0.1000   -0.0001
     9        0.5097             nan     0.1000   -0.0006
    10        0.5085             nan     0.1000    0.0001
    20        0.4989             nan     0.1000    0.0001
    40        0.4835             nan     0.1000   -0.0003
    60        0.4736             nan     0.1000   -0.0003
    80        0.4633             nan     0.1000   -0.0002
   100        0.4562             nan     0.1000   -0.0002
   120        0.4514             nan     0.1000   -0.0003
   140        

In [11]:
# Hard class predictions of the GBM on the held-out set and their confusion
# matrix against the observed labels.
# NOTE(review): no `positive` class is given, so the statistics are reported
# with "no" as the positive class; pass positive = "yes" to read sensitivity
# as the churner detection rate.
predictions_raw <- predict(
    gbm,
    newdata = testDF[, predictorsNames],
    type = "raw"
)
confusionMatrix(data = predictions_raw, reference = testDF$churn)

Confusion Matrix and Statistics

          Reference
Prediction   no  yes
       no  4614  357
       yes   18   11
                                          
               Accuracy : 0.925           
                 95% CI : (0.9173, 0.9322)
    No Information Rate : 0.9264          
    P-Value [Acc > NIR] : 0.6601          
                                          
                  Kappa : 0.0451          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.99611         
            Specificity : 0.02989         
         Pos Pred Value : 0.92818         
         Neg Pred Value : 0.37931         
             Prevalence : 0.92640         
         Detection Rate : 0.92280         
   Detection Prevalence : 0.99420         
      Balanced Accuracy : 0.51300         
                                          
       'Positive' Class : no              
                        

In [12]:
# Class probabilities of the GBM on the held-out set.
predictions_prob <- predict(
    gbm,
    newdata = testDF[, predictorsNames],
    type = "prob"
)

# AUC for the "yes" class. The probability column is selected by name rather
# than by position, and the control/case levels and the direction are stated
# explicitly so that roc() cannot auto-flip the comparison and report an
# optimistic value for a worse-than-chance model.
auc <- roc(
    response = testDF[[outcomeName]],
    predictor = predictions_prob[["yes"]],
    levels = c("no", "yes"),
    direction = "<"
)
print(auc$auc)

Setting levels: control = 0, case = 1

Setting direction: controls < cases



Area under the curve: 0.6983


### -. glm and Stepwise regression model

In [62]:
# Full logistic regression: churn against every other column of trainDF.
# NOTE(review): cust_id is among the predictors; presumably an identifier
# that should be excluded - confirm.
full.model <- glm(formula = churn ~ ., family = binomial, data = trainDF)
summary(full.model)


Call:
glm(formula = churn ~ ., family = binomial, data = trainDF)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9539  -0.4131  -0.3070  -0.2118   3.3444  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.338e+01  3.036e+02  -0.044 0.964837    
cust_id      4.526e-07  9.921e-06   0.046 0.963618    
Var7        -4.341e-05  6.659e-05  -0.652 0.514477    
Var11        2.438e-04  1.292e-04   1.887 0.059174 .  
Var13       -7.227e-05  1.112e-04  -0.650 0.515773    
Var21       -7.240e-04  4.870e-04  -1.486 0.137151    
Var24       -4.366e-05  1.674e-04  -0.261 0.794248    
Var25       -3.094e-04  4.069e-04  -0.760 0.446977    
Var28        1.024e-04  1.299e-04   0.788 0.430516    
Var33        2.907e-06  1.123e-04   0.026 0.979348    
Var35       -5.208e-05  4.022e-05  -1.295 0.195362    
Var44        4.597e-05  6.450e-05   0.713 0.476043    
Var45       -7.685e-05  1.203e-04  -0.639 0.522828    
Var51        1.466e-05  2.754e-05 

In [63]:
# Predicted churn probabilities on the held-out set
probabilities <- predict(full.model, newdata = testDF, type = "response")
# A binomial glm fitted on a factor models the probability of the second
# level ("yes"). The predicted labels must use the same "yes"/"no" coding as
# the observed classes, otherwise the comparison below is always FALSE.
predicted.classes <- ifelse(probabilities > 0.5, "yes", "no")
# Prediction accuracy
observed.classes <- testDF$churn
mean(predicted.classes == observed.classes)

In [None]:
# Stepwise selection by AIC starting from the full model, without printing
# the search trace.
step.model <- stepAIC(full.model, trace = FALSE)
summary(step.model)

In [None]:
# Predicted churn probabilities of the stepwise model on the held-out set
probabilities <- predict(step.model, newdata = testDF, type = "response")
# The fitted probability refers to the second factor level ("yes"); label the
# predictions with the same "yes"/"no" coding as the observed classes so that
# the accuracy below is meaningful.
predicted.classes <- ifelse(probabilities > 0.5, "yes", "no")
# Prediction accuracy
observed.classes <- testDF$churn
mean(predicted.classes == observed.classes)


In [None]:
# Confusion table of the last logistic model: observed vs. predicted classes.
cm <- table(observed.classes, predicted.classes)