In [77]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions
library(data.table)
library(dplyr)
library(mice)
library(randomForest)
## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# You can access files from datasets you've added to this kernel in the "../input/" directory.
# You can see the files added to this kernel by running the code below. 

list.files(path = "../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [78]:
data_path = "../input/sampled-train/"
train = data.frame(fread(paste0(data_path, "train.csv"))) %>%
    mutate(HasDetections = factor(HasDetections))

In [79]:
dim(train)

In [80]:
colnames(train)

In [81]:
boolnames<- names(train)[grepl( "Is|Has" , names(train))]
boolnames<-c(boolnames,c("SMode","Firewall"))
boolnames

In [82]:
ordinal_cols = c("AVProductsInstalled", "AVProductsEnabled")

In [83]:
numeric_cols = c( "Census_SystemVolumeTotalCapacity", "Census_TotalPhysicalRAM"
                 , "Census_InternalPrimaryDiagonalDisplaySizeInInches"
                 , "Census_InternalBatteryNumberOfCharges")

In [84]:
idnames<- names(train)[grepl( "Identifier" , names(train))]
idnames

In [85]:
character_cols = train %>%
    select_if(function(x) !is.numeric(x)) %>%
    select(-MachineIdentifier) %>%
    select(-HasDetections)%>%
    colnames() 
character_cols<-c(character_cols,c("OsSuite","OsBuild"))
character_cols

In [86]:
factor_vars<-c(character_cols,ordinal_cols,boolnames,idnames)
factor_vars

train[factor_vars] <- lapply(train[factor_vars], as.factor)

In [87]:
str(train)

'data.frame':	75000 obs. of  45 variables:
 $ MachineIdentifier                                : Factor w/ 75000 levels "0000ba435580256a4a2c9e3b53fcb39a",..: 62073 22894 48192 49461 62159 33160 3124 70392 34809 25086 ...
 $ ProductName                                      : Factor w/ 2 levels "mse","win8defender": 2 2 2 2 2 2 2 2 2 2 ...
 $ EngineVersion                                    : Factor w/ 41 levels "1.1.11701.0",..: 39 9 39 38 39 39 35 38 39 14 ...
 $ AVProductsInstalled                              : Factor w/ 6 levels "1","2","3","4",..: 1 3 1 1 1 1 1 2 1 2 ...
 $ AVProductsEnabled                                : Factor w/ 5 levels "0","1","2","3",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ HasTpm                                           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
 $ OrganizationIdentifier                           : Factor w/ 37 levels "1","2","3","4",..: NA 19 NA 19 19 13 19 NA 19 NA ...
 $ Platform                                         : Factor w/ 4 

In [88]:
cols.keep <- c("Census_PrimaryDiskTotalCapacity",
               "Census_SystemVolumeTotalCapacity",
               "MachineIdentifier",
               "HasDetections")
for(colName in colnames(train[, factor_vars])) {
  if(!(colName %in% cols.keep)) {
    train[[colName]] <- factor(ifelse(!is.na(train[[colName]]),
                                         as.factor(train[[colName]]),
                                         "Missing"))
  }
}

In [89]:
str(train)

'data.frame':	75000 obs. of  45 variables:
 $ MachineIdentifier                                : Factor w/ 75000 levels "0000ba435580256a4a2c9e3b53fcb39a",..: 62073 22894 48192 49461 62159 33160 3124 70392 34809 25086 ...
 $ ProductName                                      : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
 $ EngineVersion                                    : Factor w/ 41 levels "1","2","3","4",..: 39 9 39 38 39 39 35 38 39 14 ...
 $ AVProductsInstalled                              : Factor w/ 7 levels "1","2","3","4",..: 1 3 1 1 1 1 1 2 1 2 ...
 $ AVProductsEnabled                                : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ HasTpm                                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
 $ OrganizationIdentifier                           : Factor w/ 38 levels "1","10","11",..: 38 11 38 11 11 5 11 38 11 38 ...
 $ Platform                                         : Factor w/ 4 levels "1","2"

In [90]:
#train <- mice(train,m=10,defaultMethod=c("pmm"),print=F)
train$Census_InternalBatteryNumberOfCharges<- ifelse(is.na(train$Census_InternalBatteryNumberOfCharges),mean(train$Census_InternalBatteryNumberOfCharges,na.rm=TRUE),train$Census_InternalBatteryNumberOfCharges)
train$Census_TotalPhysicalRAM<- ifelse(is.na(train$Census_TotalPhysicalRAM),mean(train$Census_TotalPhysicalRAM,na.rm=TRUE),train$Census_TotalPhysicalRAM)
train$Census_SystemVolumeTotalCapacity<- ifelse(is.na(train$Census_SystemVolumeTotalCapacity),mean(train$Census_SystemVolumeTotalCapacity,na.rm=TRUE),train$Census_SystemVolumeTotalCapacity)
train$Census_InternalPrimaryDiagonalDisplaySizeInInches<- ifelse(is.na(train$Census_InternalPrimaryDiagonalDisplaySizeInInches),mean(train$Census_InternalPrimaryDiagonalDisplaySizeInInches,na.rm=TRUE),train$Census_InternalPrimaryDiagonalDisplaySizeInInches)
cbind(sort(colSums(is.na(train))/nrow(train),decreasing=TRUE))

0,1
MachineIdentifier,0
ProductName,0
EngineVersion,0
AVProductsInstalled,0
AVProductsEnabled,0
HasTpm,0
OrganizationIdentifier,0
Platform,0
Processor,0
OsVer,0


In [91]:
str(train)

'data.frame':	75000 obs. of  45 variables:
 $ MachineIdentifier                                : Factor w/ 75000 levels "0000ba435580256a4a2c9e3b53fcb39a",..: 62073 22894 48192 49461 62159 33160 3124 70392 34809 25086 ...
 $ ProductName                                      : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
 $ EngineVersion                                    : Factor w/ 41 levels "1","2","3","4",..: 39 9 39 38 39 39 35 38 39 14 ...
 $ AVProductsInstalled                              : Factor w/ 7 levels "1","2","3","4",..: 1 3 1 1 1 1 1 2 1 2 ...
 $ AVProductsEnabled                                : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ HasTpm                                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
 $ OrganizationIdentifier                           : Factor w/ 38 levels "1","10","11",..: 38 11 38 11 11 5 11 38 11 38 ...
 $ Platform                                         : Factor w/ 4 levels "1","2"

In [94]:
train_input=train[,!(names(train) %in% c("MachineIdentifier","Census_ProcessorModelIdentifier","OsBuildLab","Census_OSUILocaleIdentifier","Census_InternalBatteryType","Census_ChassisTypeName","Census_OSEdition","IeVerIdentifier","OsBuild","EngineVersion","OrganizationIdentifier","Census_MDC2FormFactor","SmartScreen","Census_OSSkuName"))]
str(train_input)

'data.frame':	75000 obs. of  31 variables:
 $ ProductName                                      : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
 $ AVProductsInstalled                              : Factor w/ 7 levels "1","2","3","4",..: 1 3 1 1 1 1 1 2 1 2 ...
 $ AVProductsEnabled                                : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ HasTpm                                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
 $ Platform                                         : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
 $ Processor                                        : Factor w/ 3 levels "1","2","3": 3 2 2 2 2 2 2 2 2 2 ...
 $ OsVer                                            : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ OsSuite                                          : Factor w/ 6 levels "1","2","3","4",..: 6 6 6 2 2 2 6 6 2 6 ...
 $ SkuEdition                                       :

In [96]:
mod4<- randomForest(HasDetections ~ ., data =train_input , importance =TRUE,ntree=800)
summary(mod4)

ERROR: Error in randomForest(HasDetections ~ ., data = train_input, importance = TRUE, : could not find function "randomForest"
