# k-NN classification with credit data

In [1]:
library(class)

In [2]:
# Load the credit data set from the CSV file in the working directory.
gcdata <- read.csv("german_credit_data1.csv")
# Add a readable factor version of the numeric Credit.Risks code.
# `levels` is given explicitly so the mapping 1 -> "High", 2 -> "low" does not
# depend on which codes happen to occur in the file, and `labels` is spelled
# out in full instead of relying on partial matching of `label`.
# NOTE(review): in the original UCI German credit data 1 = good and 2 = bad
# credit -- confirm that code 1 is really meant to be labelled "High" here.
gcdata$Credit.Risks.Label <- factor(
    gcdata$Credit.Risks,
    levels = c(1, 2),
    labels = c("High", "low")
)

In [3]:
# Preview the first six rows of the raw data (six is head()'s default).
head(gcdata, n = 6L)

X.,Age,Sex,Job,Housing,Saving.accounts,Checking.account,Credit.amount,Duration,Purpose,Credit.Risks,Credit.Risks.Label
0,67,male,2,own,,little,1169,6,radio/TV,1,High
1,22,female,2,own,little,moderate,5951,48,radio/TV,2,low
2,49,male,1,own,little,,2096,12,education,1,High
3,45,male,2,free,little,little,7882,42,furniture/equipment,1,High
4,53,male,2,free,little,little,4870,24,car,2,low
5,35,male,1,free,,,9055,36,education,1,High


In [4]:
# Working copy of the data; the raw values in gcdata are left as loaded.
gcdata.df <- as.data.frame(x = gcdata)
# Min-max scale a numeric vector to the range [0, 1].
#
# x: numeric vector. NA entries are ignored when finding the range and stay NA
#    in the result.
# Returns a numeric vector of the same length as x. A constant vector maps to
# zeros (instead of NaN from 0 / 0); an empty or all-NA vector is returned
# unchanged apart from being coerced to numeric.
normalize <- function(x) {
    if (length(x) == 0 || all(is.na(x))) {
        return(as.numeric(x))
    }
    lo <- min(x, na.rm = TRUE)
    span <- max(x, na.rm = TRUE) - lo
    if (span == 0) {
        # All non-missing values are equal: avoid dividing by zero.
        return(as.numeric(x - lo))
    }
    (x - lo) / span
}
# Turn columns 2..10 (Age through Purpose) into numbers and min-max scale each
# of them with normalize().
# NOTE(review): categorical columns are replaced by their alphabetical level
# codes, which imposes an arbitrary ordering on nominal values such as Purpose;
# consider one-hot encoding if that matters for the distance computation.
for (i in 2:10) {
    column <- gcdata.df[[i]]
    if (!is.numeric(column)) {
        # Since R 4.0 read.csv() returns character columns rather than factors,
        # and as.numeric("male") is NA. Going through a factor gives every
        # category its level code, exactly as factor columns already did.
        column <- as.factor(column)
    }
    column <- as.numeric(column)
    # Missing values get code 0, i.e. a value below every real category/level.
    column[is.na(column)] <- 0
    gcdata.df[[i]] <- normalize(column)
}
head(gcdata.df)

X.,Age,Sex,Job,Housing,Saving.accounts,Checking.account,Credit.amount,Duration,Purpose,Credit.Risks,Credit.Risks.Label
0,0.85714286,1,0.6666667,0.5,0.0,0.3333333,0.05056674,0.02941176,0.7142857,1,High
1,0.05357143,0,0.6666667,0.5,0.25,0.6666667,0.31368989,0.64705882,0.7142857,2,low
2,0.53571429,1,0.3333333,0.5,0.25,0.0,0.10157368,0.11764706,0.4285714,1,High
3,0.46428571,1,0.6666667,0.0,0.25,0.3333333,0.41994057,0.55882353,0.5714286,1,High
4,0.60714286,1,0.6666667,0.0,0.25,0.3333333,0.25420931,0.29411765,0.1428571,2,low
5,0.28571429,1,0.3333333,0.0,0.0,0.0,0.48448333,0.47058824,0.4285714,1,High


In [5]:
# Print per-column summary statistics of the working data frame.
summary(object = gcdata.df)

       X.             Age              Sex            Job        
 Min.   :  0.0   Min.   :0.0000   Min.   :0.00   Min.   :0.0000  
 1st Qu.:249.8   1st Qu.:0.1429   1st Qu.:0.00   1st Qu.:0.6667  
 Median :499.5   Median :0.2500   Median :1.00   Median :0.6667  
 Mean   :499.5   Mean   :0.2955   Mean   :0.69   Mean   :0.6347  
 3rd Qu.:749.2   3rd Qu.:0.4107   3rd Qu.:1.00   3rd Qu.:0.6667  
 Max.   :999.0   Max.   :1.0000   Max.   :1.00   Max.   :1.0000  
    Housing       Saving.accounts  Checking.account Credit.amount    
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
 1st Qu.:0.5000   1st Qu.:0.2500   1st Qu.:0.0000   1st Qu.:0.06138  
 Median :0.5000   Median :0.2500   Median :0.3333   Median :0.11387  
 Mean   :0.5355   Mean   :0.2975   Mean   :0.3337   Mean   :0.16624  
 3rd Qu.:0.5000   3rd Qu.:0.2500   3rd Qu.:0.6667   3rd Qu.:0.20481  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
    Duration         Purpose        Credit.Risks

In [6]:
# Estimate the k-NN misclassification rate for k = 1..max.k on several random
# train/test splits of gcdata.df.
#
# Result: misclassificationRate[k, s] is the share of held-out rows that knn()
# labels wrongly when a fraction sampling.rate[s] of the rows is used for
# training and k neighbours vote.

# Fixed seed so the random splits (and knn()'s random tie-breaking) give the
# same numbers on every run.
set.seed(1)

count <- nrow(gcdata.df)
sampling.rate <- c(0.9, 0.8, 0.7, 0.6, 0.5)
max.k <- 20
feature.cols <- c("Age", "Sex", "Job", "Housing", "Saving.accounts",
                  "Checking.account", "Credit.amount", "Duration", "Purpose")
misclassificationRate <- matrix(nrow = max.k, ncol = length(sampling.rate))

for (i in seq_along(sampling.rate)) {
    # round() guards against floating-point products such as 699.9999 that
    # sample() would otherwise truncate to one row fewer than intended.
    n.train <- round(sampling.rate[i] * count)
    training <- sample(seq_len(count), n.train, replace = FALSE)
    testing <- setdiff(seq_len(count), training)

    train <- gcdata.df[training, feature.cols]
    test <- gcdata.df[testing, feature.cols]
    cl <- gcdata.df$Credit.Risks.Label[training]
    true.labels <- gcdata.df$Credit.Risks.Label[testing]

    # Number of held-out rows, counted directly instead of being derived from
    # count * (1 - rate), which is only approximately right in floating point.
    test.labels <- length(testing)

    # Compute and store the misclassification rate for every k.
    for (j in seq_len(max.k)) {
        predicted.labels <- knn(train, test, cl, k = j)
        incorrect.labels <- sum(predicted.labels != true.labels)
        misclassification.rate <- incorrect.labels / test.labels
        misclassificationRate[j, i] <- misclassification.rate
    }
}

In [7]:
# Wrap the results matrix in a data frame whose columns are labelled with the
# training share they belong to, then display it. Row r holds the rates
# obtained with k = r neighbours.
misclassificationRate.df <- setNames(
    as.data.frame(misclassificationRate),
    c(
        "Sampling Rate - 90%",
        "Sampling Rate - 80%",
        "Sampling Rate - 70%",
        "Sampling Rate - 60%",
        "Sampling Rate - 50%"
    )
)
misclassificationRate.df

Sampling Rate - 90%,Sampling Rate - 80%,Sampling Rate - 70%,Sampling Rate - 60%,Sampling Rate - 50%
0.33,0.335,0.3733333,0.36,0.322
0.31,0.315,0.3366667,0.3675,0.318
0.27,0.33,0.34,0.335,0.31
0.29,0.345,0.33,0.345,0.324
0.26,0.3,0.34,0.32,0.298
0.3,0.34,0.3233333,0.3275,0.304
0.28,0.27,0.3133333,0.325,0.306
0.27,0.305,0.3033333,0.3,0.29
0.28,0.275,0.3066667,0.3025,0.3
0.3,0.265,0.32,0.2875,0.308


In [8]:
# For each sampling-rate column, the row index (= k) with the lowest
# misclassification rate; which.min() picks the first one on ties.
minByCol <- apply(misclassificationRate, 2, function(rates) which.min(rates))
minByCol

### 90% sampling rate - the minimum misclassification rate occurs at k = 13
### 80% sampling rate - the minimum misclassification rate occurs at k = 5
### 70% sampling rate - the minimum misclassification rate occurs at k = 9
### 60% sampling rate - the minimum misclassification rate occurs at k = 8
### 50% sampling rate - the minimum misclassification rate occurs at k = 12