I began my program by loading in the necessary packages, reading in the "Adult" dataset, and fixing the column headers.

In [1]:
library(caret)
library(ranger)
library(e1071)
# Load the raw Adult data. The file is read without a header row, so the
# fifteen column names are attached by position afterwards.
df <- read.csv("adult.csv", header = FALSE)
names(df)[1:15] <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "target"
)

Loading required package: lattice
Loading required package: ggplot2


Replaced all missing values (coded as " ?" in the raw data) with NA.

In [2]:
# Cells holding the literal " ?" (note the leading space) become NA;
# the second line reports how many cells are now missing.
missing_mask <- df == " ?"
df[missing_mask] <- NA
sum(is.na(df))

Replaced the categorical missing values with the mode of the corresponding variables

In [3]:
# Impute missing values in every non-numeric column with that column's mode.
#
# x: a data frame. Numeric columns are returned untouched.
# Returns: x with NA entries of each non-numeric column replaced by the most
#   frequent observed value of the same column. Ties go to the value that
#   appears first. A column with no observed values is left as is.
cat_miss <- function(x) {
  for (i in seq_len(ncol(x))) {
    if (!is.numeric(x[, i])) {
      gaps <- is.na(x[, i])
      observed <- x[, i][!gaps]
      if (!any(gaps) || length(observed) == 0L) {
        next
      }
      # Count occurrences within this column only; NA is excluded so it can
      # never be selected as the replacement value.
      candidates <- unique(observed)
      counts <- tabulate(match(observed, candidates), nbins = length(candidates))
      x[, i][gaps] <- candidates[which.max(counts)]
    }
  }
  x
}
# Apply the categorical imputation, then count the NA cells that remain.
df2 <- cat_miss(df)
sum(is.na(df2))

Replace the numeric missing values with the median using caret

In [4]:
# Fit caret's median imputer on df2 and apply it to the same data; the last
# line counts the NA cells left afterwards.
num_miss <- preProcess(df2, method = "medianImpute")
df3 <- predict(num_miss, newdata = df2)
sum(is.na(df3))

The variable "native_country" has 42 categories, so I grouped them into 5 broader categories, and did the same for the other categorical variables with more than 10 categories (education and occupation).

In [5]:
# Collapse the high-cardinality categorical columns of df3 into broader
# groups. Work on character vectors so new labels can be assigned freely;
# the raw labels keep the leading space they carry in the data.
df3$native_country <- as.character(df3$native_country)
df3$education <- as.character(df3$education)
df3$occupation <- as.character(df3$occupation)

country_groups <- list(
  "North America" = c(" United-States", " Canada", " Mexico"),
  "Europe" = c(
    " England", " Germany", " Italy", " Poland", " Portugal", " France",
    " Scotland", " Greece", " Ireland", " Holand-Netherlands",
    " Yugoslavia", " Hungary"
  ),
  "Asia" = c(
    " India", " Cambodia", " Thailand", " Laos", " Taiwan", " China",
    " Japan", " Vietnam", " Hong", " Philippines", " Iran"
  ),
  "South America" = c(" Columbia", " Ecuador"),
  "Other" = c(
    " Cuba", " Jamaica", " South", " Puerto-Rico", " Honduras", " Haiti",
    " Dominican-Republic", " El-Salvador", " Guatemala", " Peru",
    " Outlying-US(Guam-USVI-etc)", " Trinadad&Tobago", " Nicaragua"
  )
)
for (group_label in names(country_groups)) {
  members <- df3$native_country %in% country_groups[[group_label]]
  df3$native_country[members] <- group_label
}

education_groups <- list(
  "High School" = c(" HS-grad", " 12th", " 11th", " 10th", " 9th"),
  "Some College" = c(" Some-college", " Assoc-acdm", " Assoc-voc"),
  "Middle School" = c(" 5th-6th", " 7th-8th"),
  "Elementary School" = c(" Preschool", " 1st-4th"),
  "College Grad" = c(" Bachelors", " Masters", " Doctorate", " Prof-school")
)
for (group_label in names(education_groups)) {
  members <- df3$education %in% education_groups[[group_label]]
  df3$education[members] <- group_label
}

occupation_groups <- list(
  "Physical Labor" = c(" Transport-moving", " Farming-fishing", " Machine-op-inspct"),
  "Services" = c(" Handlers-cleaners", " Priv-house-serv", " Tech-support"),
  "Corporate" = c(" Exec-managerial", " Sales", " Adm-clerical"),
  "Defense" = c(" Armed-Forces", " Protective-serv"),
  "Other" = c(" Other-service", " Prof-specialty", " Craft-repair")
)
for (group_label in names(occupation_groups)) {
  members <- df3$occupation %in% occupation_groups[[group_label]]
  df3$occupation[members] <- group_label
}

# Back to factors, then display the resulting level sets.
df3$native_country <- as.factor(df3$native_country)
df3$education <- as.factor(df3$education)
df3$occupation <- as.factor(df3$occupation)

levels(df3$native_country)
levels(df3$education)
levels(df3$occupation)

Encoded categorical variables using one hot encoding (dummy encoding)

In [6]:
# One-hot encode every predictor of df3 with caret::dummyVars, turn the
# resulting matrix into a data frame, and put the target column back.
dummies_model <- dummyVars(target ~ ., data = df3)
trainData_mat <- predict(dummies_model, newdata = df3)
trainData <- data.frame(trainData_mat)
trainData$target <- df3$target
str(trainData)

"variable 'target' is not a factor"

'data.frame':	32561 obs. of  51 variables:
 $ age                                  : num  39 50 38 53 28 37 49 52 31 42 ...
 $ workclass...                         : num  0 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Federal.gov               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Local.gov                 : num  0 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Never.worked              : num  0 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Private                   : num  0 0 1 1 1 1 1 0 1 1 ...
 $ workclass..Self.emp.inc              : num  0 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Self.emp.not.inc          : num  0 1 0 0 0 0 0 1 0 0 ...
 $ workclass..State.gov                 : num  1 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Without.pay               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ fnlwgt                               : num  77516 83311 215646 234721 338409 ...
 $ education.College.Grad               : num  1 1 0 0 1 1 0 0 1 1 ...
 $ education.Elementary.School          : num  0 0 0 0 0 0 0 0 0 0 ...
 $ educatio

Scaled and centered the data 

In [7]:
# Fit a center-and-scale transform on the encoded data and apply it to the
# same data frame.
standardized <- preProcess(trainData, method = c("center", "scale"))
train2 <- predict(standardized, newdata = trainData)
str(train2)

"These variables have zero variances: workclass..."

'data.frame':	32561 obs. of  51 variables:
 $ age                                  : num  0.0307 0.8371 -0.0426 1.057 -0.7758 ...
 $ workclass...                         : num  0 0 0 0 0 0 0 0 0 0 ...
 $ workclass..Federal.gov               : num  -0.174 -0.174 -0.174 -0.174 -0.174 ...
 $ workclass..Local.gov                 : num  -0.262 -0.262 -0.262 -0.262 -0.262 ...
 $ workclass..Never.worked              : num  -0.0147 -0.0147 -0.0147 -0.0147 -0.0147 ...
 $ workclass..Private                   : num  -1.517 -1.517 0.659 0.659 0.659 ...
 $ workclass..Self.emp.inc              : num  -0.188 -0.188 -0.188 -0.188 -0.188 ...
 $ workclass..Self.emp.not.inc          : num  -0.291 3.437 -0.291 -0.291 -0.291 ...
 $ workclass..State.gov                 : num  3.064 -0.326 -0.326 -0.326 -0.326 ...
 $ workclass..Without.pay               : num  -0.0207 -0.0207 -0.0207 -0.0207 -0.0207 ...
 $ fnlwgt                               : num  -1.064 -1.009 0.245 0.426 1.408 ...
 $ education.College.Gr

Split the data into training (70%) and testing (30%) sets with the seed set to 2018. Built a decision tree and reported the accuracy and balanced accuracy.

In [8]:
# Seeded 70/30 split via createDataPartition on the target, then a caret
# rpart decision tree fitted on the 70% part and scored on the held-out rows.
set.seed(2018)
splitIndex <- createDataPartition(train2$target, times = 1, p = 0.70, list = FALSE)
train_data <- train2[splitIndex, ]
test <- train2[-splitIndex, ]

model <- train(target ~ ., data = train_data, method = "rpart")
pred <- predict(model, newdata = test)

# " >50K" (with its leading space) is treated as the positive class.
cm <- confusionMatrix(data = pred, reference = test$target, positive = " >50K")
cm$overall["Accuracy"]
cm$byClass["Balanced Accuracy"]

Built a random forest using the ranger package.  Reported the accuracy and balanced accuracy

In [9]:
# Random forest (ranger, default settings) on the same training rows,
# evaluated on the same held-out rows as the decision tree.
model2 <- ranger(formula = target ~ ., data = train_data)
pred2 <- predict(model2, data = test)$predictions
cm2 <- confusionMatrix(data = pred2, reference = test$target, positive = " >50K")
cm2$overall["Accuracy"]
cm2$byClass["Balanced Accuracy"]

Redid #3 with missing values being replaced by the mean.  Rebuilt the models and reported the models' performances (the accuracy and balanced accuracy).

In [10]:
# Impute missing values in every numeric column with that column's mean.
#
# x: a data frame. Non-numeric columns are returned untouched.
# Returns: x with NA entries of each numeric column replaced by the mean of
#   the column's observed values (computed with na.rm = TRUE).
miss_num <- function(x) {
  # seq_len() keeps the loop empty for a zero-column input, where 1:ncol(x)
  # would iterate over c(1, 0) and fail on a non-existent column.
  for (i in seq_len(ncol(x))) {
    if (is.numeric(x[, i])) {
      gaps <- is.na(x[, i])
      x[, i][gaps] <- mean(x[, i], na.rm = TRUE)
    }
  }
  x
}
# Mean-impute the numeric columns of df2, then collapse the high-cardinality
# categorical columns of the result (df4) into broader groups.
df4 <- miss_num(df2)
sum(is.na(df4))

# Work on character vectors so new labels can be assigned freely; the raw
# labels keep the leading space they carry in the data.
df4$native_country <- as.character(df4$native_country)
df4$education <- as.character(df4$education)
df4$occupation <- as.character(df4$occupation)

country_groups <- list(
  "North America" = c(" United-States", " Canada", " Mexico"),
  "Europe" = c(
    " England", " Germany", " Italy", " Poland", " Portugal", " France",
    " Scotland", " Greece", " Ireland", " Holand-Netherlands",
    " Yugoslavia", " Hungary"
  ),
  "Asia" = c(
    " India", " Cambodia", " Thailand", " Laos", " Taiwan", " China",
    " Japan", " Vietnam", " Hong", " Philippines", " Iran"
  ),
  "South America" = c(" Columbia", " Ecuador"),
  "Other" = c(
    " Cuba", " Jamaica", " South", " Puerto-Rico", " Honduras", " Haiti",
    " Dominican-Republic", " El-Salvador", " Guatemala", " Peru",
    " Outlying-US(Guam-USVI-etc)", " Trinadad&Tobago", " Nicaragua"
  )
)
for (group_label in names(country_groups)) {
  members <- df4$native_country %in% country_groups[[group_label]]
  df4$native_country[members] <- group_label
}

# Every education recode must be written to df4 itself. Writing the
# "High School" group to any other data frame would leave " HS-grad",
# " 12th", " 11th", " 10th" and " 9th" as separate levels in df4.
education_groups <- list(
  "High School" = c(" HS-grad", " 12th", " 11th", " 10th", " 9th"),
  "Some College" = c(" Some-college", " Assoc-acdm", " Assoc-voc"),
  "Middle School" = c(" 5th-6th", " 7th-8th"),
  "Elementary School" = c(" Preschool", " 1st-4th"),
  "College Grad" = c(" Bachelors", " Masters", " Doctorate", " Prof-school")
)
for (group_label in names(education_groups)) {
  members <- df4$education %in% education_groups[[group_label]]
  df4$education[members] <- group_label
}

occupation_groups <- list(
  "Physical Labor" = c(" Transport-moving", " Farming-fishing", " Machine-op-inspct"),
  "Services" = c(" Handlers-cleaners", " Priv-house-serv", " Tech-support"),
  "Corporate" = c(" Exec-managerial", " Sales", " Adm-clerical"),
  "Defense" = c(" Armed-Forces", " Protective-serv"),
  "Other" = c(" Other-service", " Prof-specialty", " Craft-repair")
)
for (group_label in names(occupation_groups)) {
  members <- df4$occupation %in% occupation_groups[[group_label]]
  df4$occupation[members] <- group_label
}

df4$native_country <- as.factor(df4$native_country)
df4$education <- as.factor(df4$education)
df4$occupation <- as.factor(df4$occupation)

# Mean-imputation variant: one-hot encode df4, center and scale, split
# 70/30 with seed 2018, then fit and score a decision tree and a random forest.
dummies_model <- dummyVars(target ~ ., data = df4)
trainData_mat2 <- predict(dummies_model, newdata = df4)
trainData2 <- data.frame(trainData_mat2)
trainData2$target <- df4$target

standardized2 <- preProcess(trainData2, method = c("center", "scale"))
train3 <- predict(standardized2, newdata = trainData2)

set.seed(2018)
splitIndex2 <- createDataPartition(train3$target, times = 1, p = 0.70, list = FALSE)
train_data2 <- train3[splitIndex2, ]
test2 <- train3[-splitIndex2, ]

# Decision tree (caret + rpart).
model3 <- train(target ~ ., data = train_data2, method = "rpart")
pred3 <- predict(model3, newdata = test2)
cm3 <- confusionMatrix(data = pred3, reference = test2$target, positive = " >50K")
cm3$overall["Accuracy"]
cm3$byClass["Balanced Accuracy"]

# Random forest (ranger).
model4 <- ranger(formula = target ~ ., data = train_data2)
pred4 <- predict(model4, data = test2)$predictions
cm4 <- confusionMatrix(data = pred4, reference = test2$target, positive = " >50K")
cm4$overall["Accuracy"]
cm4$byClass["Balanced Accuracy"]

"These variables have zero variances: workclass..."

Redid #3 with missing values being replaced by the "knn" imputation method.  Rebuilt the models and reported the models' performances (the accuracy and balanced accuracy).

In [11]:
# k-nearest-neighbour imputation variant: fit caret's knnImpute on df2,
# apply it to get df5, then collapse the high-cardinality categoricals.
# NOTE(review): per the caret documentation, knnImpute also centers and
# scales the numeric columns — confirm this is intended before the later
# center/scale step.
num_miss <- preProcess(df2, method = "knnImpute")
df5 <- predict(num_miss, newdata = df2)
sum(is.na(df5))

# Work on character vectors so new labels can be assigned freely; the raw
# labels keep the leading space they carry in the data.
df5$native_country <- as.character(df5$native_country)
df5$education <- as.character(df5$education)
df5$occupation <- as.character(df5$occupation)

country_groups <- list(
  "North America" = c(" United-States", " Canada", " Mexico"),
  "Europe" = c(
    " England", " Germany", " Italy", " Poland", " Portugal", " France",
    " Scotland", " Greece", " Ireland", " Holand-Netherlands",
    " Yugoslavia", " Hungary"
  ),
  "Asia" = c(
    " India", " Cambodia", " Thailand", " Laos", " Taiwan", " China",
    " Japan", " Vietnam", " Hong", " Philippines", " Iran"
  ),
  "South America" = c(" Columbia", " Ecuador"),
  "Other" = c(
    " Cuba", " Jamaica", " South", " Puerto-Rico", " Honduras", " Haiti",
    " Dominican-Republic", " El-Salvador", " Guatemala", " Peru",
    " Outlying-US(Guam-USVI-etc)", " Trinadad&Tobago", " Nicaragua"
  )
)
for (group_label in names(country_groups)) {
  members <- df5$native_country %in% country_groups[[group_label]]
  df5$native_country[members] <- group_label
}

education_groups <- list(
  "High School" = c(" HS-grad", " 12th", " 11th", " 10th", " 9th"),
  "Some College" = c(" Some-college", " Assoc-acdm", " Assoc-voc"),
  "Middle School" = c(" 5th-6th", " 7th-8th"),
  "Elementary School" = c(" Preschool", " 1st-4th"),
  "College Grad" = c(" Bachelors", " Masters", " Doctorate", " Prof-school")
)
for (group_label in names(education_groups)) {
  members <- df5$education %in% education_groups[[group_label]]
  df5$education[members] <- group_label
}

occupation_groups <- list(
  "Physical Labor" = c(" Transport-moving", " Farming-fishing", " Machine-op-inspct"),
  "Services" = c(" Handlers-cleaners", " Priv-house-serv", " Tech-support"),
  "Corporate" = c(" Exec-managerial", " Sales", " Adm-clerical"),
  "Defense" = c(" Armed-Forces", " Protective-serv"),
  "Other" = c(" Other-service", " Prof-specialty", " Craft-repair")
)
for (group_label in names(occupation_groups)) {
  members <- df5$occupation %in% occupation_groups[[group_label]]
  df5$occupation[members] <- group_label
}

df5$native_country <- as.factor(df5$native_country)
df5$education <- as.factor(df5$education)
df5$occupation <- as.factor(df5$occupation)

# knn-imputation variant: one-hot encode df5, center and scale, split 70/30
# with seed 2018, then fit and score a decision tree and a random forest.
dummies_model <- dummyVars(target ~ ., data = df5)
trainData_mat3 <- predict(dummies_model, newdata = df5)
trainData3 <- data.frame(trainData_mat3)
trainData3$target <- df5$target

standardized3 <- preProcess(trainData3, method = c("center", "scale"))
train4 <- predict(standardized3, newdata = trainData3)

set.seed(2018)
splitIndex3 <- createDataPartition(train4$target, times = 1, p = 0.70, list = FALSE)
train_data3 <- train4[splitIndex3, ]
test3 <- train4[-splitIndex3, ]

# Decision tree (caret + rpart).
# NOTE: model4/pred4/cm4 are also assigned by the mean-imputation random
# forest step, so running this replaces those objects.
model4 <- train(target ~ ., data = train_data3, method = "rpart")
pred4 <- predict(model4, newdata = test3)
cm4 <- confusionMatrix(data = pred4, reference = test3$target, positive = " >50K")
cm4$overall["Accuracy"]
cm4$byClass["Balanced Accuracy"]

# Random forest (ranger).
model5 <- ranger(formula = target ~ ., data = train_data3)
pred5 <- predict(model5, data = test3)$predictions
cm5 <- confusionMatrix(data = pred5, reference = test3$target, positive = " >50K")
cm5$overall["Accuracy"]
cm5$byClass["Balanced Accuracy"]

"These variables have zero variances: workclass..."

Redid #6 with only scaling and centering the non-encoded variables. Rebuilt the model and reported the models' performances (the accuracy and balanced accuracy)

In [12]:
# Scale-before-encode variant: the center/scale transform is fitted on df3
# before dummy encoding, and the dummy columns are created afterwards.
standardized4 <- preProcess(df3, method = c("center", "scale"))
train5 <- predict(standardized4, newdata = df3)

dummies_model <- dummyVars(target ~ ., data = train5)
trainData_mat4 <- predict(dummies_model, newdata = train5)
trainData4 <- data.frame(trainData_mat4)
trainData4$target <- train5$target

set.seed(2018)
splitIndex4 <- createDataPartition(trainData4$target, times = 1, p = 0.70, list = FALSE)
train_data4 <- trainData4[splitIndex4, ]
test4 <- trainData4[-splitIndex4, ]

# Decision tree (caret + rpart).
model5 <- train(target ~ ., data = train_data4, method = "rpart")
pred5 <- predict(model5, newdata = test4)
cm5 <- confusionMatrix(data = pred5, reference = test4$target, positive = " >50K")
cm5$overall["Accuracy"]
cm5$byClass["Balanced Accuracy"]

# Random forest (ranger).
model6 <- ranger(formula = target ~ ., data = train_data4)
pred6 <- predict(model6, data = test4)$predictions
cm6 <- confusionMatrix(data = pred6, reference = test4$target, positive = " >50K")
cm6$overall["Accuracy"]
cm6$byClass["Balanced Accuracy"]

"variable 'target' is not a factor"

Redid #5 with a different encoding method. Rebuilt the model and reported the models' performances (the accuracy and balanced accuracy).

In [13]:
# Label-encode the categorical predictors of df3: each category is replaced
# by the integer code of its factor level.
# is.numeric() only tests the type and returns a single TRUE/FALSE, which
# would overwrite each column with one constant value; the conversion has to
# go through as.numeric(as.factor(...)).
categorical_cols <- c(
  "workclass", "education", "marital_status", "occupation",
  "relationship", "race", "sex", "native_country"
)
for (col_name in categorical_cols) {
  df3[[col_name]] <- as.numeric(as.factor(df3[[col_name]]))
}

# Alternative-encoding variant: center and scale df3 directly (no dummy
# columns), split 70/30 with seed 2018, then fit and score both models.
standardized5 <- preProcess(df3, method = c("center", "scale"))
train6 <- predict(standardized5, newdata = df3)

set.seed(2018)
splitIndex5 <- createDataPartition(train6$target, times = 1, p = 0.70, list = FALSE)
train_data5 <- train6[splitIndex5, ]
test5 <- train6[-splitIndex5, ]

# Decision tree (caret + rpart).
model7 <- train(target ~ ., data = train_data5, method = "rpart")
pred7 <- predict(model7, newdata = test5)
cm7 <- confusionMatrix(data = pred7, reference = test5$target, positive = " >50K")
cm7$overall["Accuracy"]
cm7$byClass["Balanced Accuracy"]

# Random forest (ranger).
model8 <- ranger(formula = target ~ ., data = train_data5)
pred8 <- predict(model8, data = test5)$predictions
cm8 <- confusionMatrix(data = pred8, reference = test5$target, positive = " >50K")
cm8$overall["Accuracy"]
cm8$byClass["Balanced Accuracy"]

Rebuilt the models with #4 skipped.

In [14]:
# Variant built straight from df2: one-hot encode, center and scale, split
# 70/30 with seed 2018, then fit and score a decision tree and a random forest.
dummies_model <- dummyVars(target ~ ., data = df2)
trainData_mat6 <- predict(dummies_model, newdata = df2)
trainData6 <- data.frame(trainData_mat6)
trainData6$target <- df2$target

standardized6 <- preProcess(trainData6, method = c("center", "scale"))
train7 <- predict(standardized6, newdata = trainData6)

set.seed(2018)
splitIndex6 <- createDataPartition(train7$target, times = 1, p = 0.70, list = FALSE)
train_data6 <- train7[splitIndex6, ]
test6 <- train7[-splitIndex6, ]

# Decision tree (caret + rpart).
model9 <- train(target ~ ., data = train_data6, method = "rpart")
pred9 <- predict(model9, newdata = test6)
cm9 <- confusionMatrix(data = pred9, reference = test6$target, positive = " >50K")
cm9$overall["Accuracy"]
cm9$byClass["Balanced Accuracy"]

# Random forest (ranger).
model10 <- ranger(formula = target ~ ., data = train_data6)
pred10 <- predict(model10, data = test6)$predictions
cm10 <- confusionMatrix(data = pred10, reference = test6$target, positive = " >50K")
cm10$overall["Accuracy"]
cm10$byClass["Balanced Accuracy"]

"These variables have zero variances: workclass..., occupation..., native_country..."