# Missing value analysis

In [5]:
dfa <- read.csv('Dataset/click_rates.csv')

In [6]:
dfa

Headline,Click,Rate
Headline A,Click,14
Headline A,No-click,986
Headline B,Click,8
Headline B,No-click,992
Headline C,Click,12
Headline C,No-click,988


In [7]:
is.na(dfa)

Headline,Click,Rate
False,False,False
False,False,False
False,False,False
False,False,False
False,False,False
False,False,False


In [8]:
summary(is.na(dfa))

  Headline         Click            Rate        
 Mode :logical   Mode :logical   Mode :logical  
 FALSE:6         FALSE:6         FALSE:6        

In [9]:
dfb <- read.csv('Dataset/dfw_airline.csv')

In [10]:
dfb

Carrier,ATC,Weather,Security,Inbound
64263.16,84856.5,11235.42,343.15,118427.8


In [11]:
is.na(dfb)

Carrier,ATC,Weather,Security,Inbound
False,False,False,False,False


In [12]:
summary(is.na(dfb))

  Carrier           ATC           Weather         Security      
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:1         FALSE:1         FALSE:1         FALSE:1        
  Inbound       
 Mode :logical  
 FALSE:1        

In [13]:
dfc <- read.csv('Dataset/four_sessions.csv')

In [14]:
dfc

Page,Time
Page 1,164.0
Page 2,178.0
Page 3,175.0
Page 4,155.0
Page 1,
Page 2,191.0
Page 3,193.0
Page 4,166.0
Page 1,177.0
Page 2,


In [15]:
is.na(dfc)

Page,Time
False,False
False,False
False,False
False,False
False,True
False,False
False,False
False,False
False,False
False,True


In [16]:
summary(is.na(dfc))

    Page            Time        
 Mode :logical   Mode :logical  
 FALSE:20        FALSE:15       
                 TRUE :5        

# Impute missing values with statistical values

In [6]:
# Demo 1: a vector with two missing entries. mean() propagates NA, so the
# result printed for v is NA.
v <- c(100, 101, 102, 104, 106, 108, rep(NA_real_, 2))
mean(v)
# Demo 2: the same six observed values, with the two gaps replaced by large
# values (200 and 1000) to show how strongly they pull the mean upwards.
vc <- c(v[!is.na(v)], 200, 1000)
mean(vc)

# To ignore the missing entries instead, use: mean(v, na.rm = TRUE)
# To inspect the raw vector, use: print(v)

In [17]:
# Mean imputation: overwrite every missing Time with the mean of the observed
# Time values. TRUE is spelled out because T is an ordinary variable that can
# be reassigned, whereas TRUE is a reserved word.
dfc$Time[is.na(dfc$Time)] <- mean(dfc$Time, na.rm = TRUE)

In [18]:
dfc

Page,Time
Page 1,164.0
Page 2,178.0
Page 3,175.0
Page 4,155.0
Page 1,173.6667
Page 2,191.0
Page 3,193.0
Page 4,166.0
Page 1,177.0
Page 2,173.6667


In [19]:
as.data.frame(colSums(is.na(dfc)))

Unnamed: 0,colSums(is.na(dfc))
Page,0
Time,0


# Delete the missing values

In [20]:
library(tidyr)

"package 'tidyr' was built under R version 3.6.3"

In [21]:
# Drop every row of dfc that contains an NA (tidyr::drop_na).
# NOTE(review): dfc$Time was already mean-imputed in an earlier cell, so no
# rows are removed in this run; re-read the CSV first to see rows being dropped.
dfc <- drop_na(dfc)


In [22]:
is.na(dfc)

Page,Time
False,False
False,False
False,False
False,False
False,False
False,False
False,False
False,False
False,False
False,False


In [23]:
as.data.frame(colSums(is.na(dfc)))

Unnamed: 0,colSums(is.na(dfc))
Page,0
Time,0


# kNN Imputation

Another crucial way of imputing the missing values is through kNN imputation.

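In kNN imputation, each row that has a missing value is compared with the other rows, and its ‘k’ nearest neighbours are selected using a distance measure.

That is, the algorithm identifies the ‘k’ most similar rows based on the Euclidean distance formula and then fills each missing value with a value derived from those neighbours (for example, a distance-weighted average of their values).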

In the example below, we use the knnImputation() function from the ‘DMwR’ package to perform kNN imputation on the missing values.

In [24]:
##install.packages("DMwR")

package 'DMwR' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Microsoft\AppData\Local\Temp\RtmpiwDUti\downloaded_packages


In [25]:
# Attach DMwR so that knnImputation() is available in this session.
library(DMwR)
# NOTE(review): dfc$Time was mean-imputed in an earlier cell, so dfc has no
# NAs left at this point; that is why this cell's output reports
# "No case has missing values. Stopping as there is nothing to do."
knn_res <- knnImputation(dfc)  # kNN imputation with the function's defaults
# Check whether any NA remains in the result.
anyNA(knn_res)

"package 'DMwR' was built under R version 3.6.3"Loading required package: lattice
Loading required package: grid
Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
"No case has missing values. Stopping as there is nothing to do."

In [26]:
# Load the heart-disease data (biking, smoking, heart.disease) into dfd.
# NOTE(review): the other CSV files in this notebook are read from Dataset/;
# confirm that this bare relative path is intended.
dfd <- read.csv('heartdata.csv')

In [27]:
dfd

X,biking,smoking,heart.disease
1,30.801246,10.896608,11.769423
2,65.129215,2.219563,2.854081
3,1.959665,17.588331,17.177803
4,44.800196,2.802559,6.816647
5,69.428454,15.974505,4.062224
6,54.403626,29.333176,9.550046
7,49.056162,9.060846,7.624507
8,4.784604,12.835021,
9,65.730788,11.991297,3.067462
10,35.257449,23.277683,12.098484


In [28]:
is.na(dfd)

X,biking,smoking,heart.disease
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,TRUE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE


In [29]:
summary(is.na(dfd))

     X             biking         smoking        heart.disease  
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:498       FALSE:480       FALSE:482       FALSE:476      
                 TRUE :18        TRUE :16        TRUE :22       

In [30]:
# Mean imputation for biking: fill each NA with the mean of the observed
# values. TRUE is used instead of T, which is a reassignable variable.
dfd$biking[is.na(dfd$biking)] <- mean(dfd$biking, na.rm = TRUE)

In [31]:
is.na(dfd)

X,biking,smoking,heart.disease
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,TRUE
FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE


In [40]:
head(dfd,10)

X,biking,smoking,heart.disease
1,30.801246,10.896608,11.769423
2,65.129215,2.219563,2.854081
3,1.959665,17.588331,17.177803
4,44.800196,2.802559,6.816647
5,69.428454,15.974505,4.062224
6,54.403626,29.333176,9.550046
7,49.056162,9.060846,7.624507
8,4.784604,12.835021,10.107345
9,65.730788,11.991297,3.067462
10,35.257449,23.277683,12.098484


In [32]:
summary(is.na(dfd))

     X             biking         smoking        heart.disease  
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:498       FALSE:498       FALSE:482       FALSE:476      
                                 TRUE :16        TRUE :22       

In [33]:
# Mean imputation for smoking: fill each NA with the mean of the observed
# values. TRUE is used instead of T, which is a reassignable variable.
dfd$smoking[is.na(dfd$smoking)] <- mean(dfd$smoking, na.rm = TRUE)

In [34]:
# Mean imputation for heart.disease: fill each NA with the mean of the
# observed values. TRUE is used instead of T, which is a reassignable variable.
dfd$heart.disease[is.na(dfd$heart.disease)] <- mean(dfd$heart.disease, na.rm = TRUE)

In [35]:
summary(is.na(dfd))

     X             biking         smoking        heart.disease  
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:498       FALSE:498       FALSE:498       FALSE:498      

# kNN Imputation

In [41]:
head(dfd)

X,biking,smoking,heart.disease
1,30.801246,10.896608,11.769423
2,65.129215,2.219563,2.854081
3,1.959665,17.588331,17.177803
4,44.800196,2.802559,6.816647
5,69.428454,15.974505,4.062224
6,54.403626,29.333176,9.550046


In [36]:
# NOTE(review): biking, smoking and heart.disease were all mean-imputed in
# earlier cells, so dfd has no NAs left and this call has nothing to impute
# (see the "No case has missing values" message in this cell's output).
# The assignment also overwrites the knn_res produced earlier from dfc.
knn_res <- knnImputation(dfd)  # perform knn imputation.
# Check whether any NA remains in the result.
anyNA(knn_res)

"No case has missing values. Stopping as there is nothing to do."

In [37]:
anyNA(dfd)

In [38]:
as.data.frame(colSums(is.na(dfd)))

Unnamed: 0,colSums(is.na(dfd))
X,0
biking,0
smoking,0
heart.disease,0


In [42]:
head(dfd,16)

X,biking,smoking,heart.disease
1,30.801246,10.896608,11.769423
2,65.129215,2.219563,2.854081
3,1.959665,17.588331,17.177803
4,44.800196,2.802559,6.816647
5,69.428454,15.974505,4.062224
6,54.403626,29.333176,9.550046
7,49.056162,9.060846,7.624507
8,4.784604,12.835021,10.107345
9,65.730788,11.991297,3.067462
10,35.257449,23.277683,12.098484


In [43]:
dfv <- read.csv('Dataset/Loan_Modelling.csv')

In [44]:
head(dfv)

ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
1.0,25,1.0,49.0,91107.0,4,1.6,1,0,0,1.0,0,0,0
2.0,45,19.0,,90089.0,3,1.5,1,0,0,1.0,0,0,0
3.0,39,15.0,11.0,94720.0,1,1.0,1,0,0,,0,0,0
,35,9.0,100.0,,1,2.7,2,0,0,0.0,0,0,0
5.0,35,,45.0,91330.0,4,1.0,2,0,0,0.0,0,0,1
,37,13.0,29.0,,4,0.4,2,155,0,0.0,0,1,0


In [45]:
is.na(dfv)

ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE
TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,TRUE,TRUE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE


In [46]:
summary(is.na(dfv))

     ID             Age          Experience        Income       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:4998      FALSE:4999      FALSE:4997      FALSE:4999     
 TRUE :2         TRUE :1         TRUE :3         TRUE :1        
  ZIPCode          Family          CCAvg         Education      
 Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:4997      FALSE:4998      FALSE:4998      FALSE:5000     
 TRUE :3         TRUE :2         TRUE :2                        
  Mortgage       Personal_Loan   Securities_Account CD_Account     
 Mode :logical   Mode :logical   Mode :logical      Mode :logical  
 FALSE:4999      FALSE:5000      FALSE:4999         FALSE:5000     
 TRUE :1                         TRUE :1                           
   Online        CreditCard     
 Mode :logical   Mode :logical  
 FALSE:5000      FALSE:5000     
                                

In [47]:
# Fill the NAs in dfv with knnImputation() using its default settings and
# store the result separately in knn_rese (dfv itself is left unchanged).
# NOTE(review): every column is passed in, including ID and ZIPCode. The
# head(knn_rese) output further down shows ID filled with values such as
# 2320.558, so identifier-like columns are being averaged; consider excluding
# them before imputing.
knn_rese <- knnImputation(dfv)  # perform knn imputation.
# Check whether any NA remains in the imputed data frame.
anyNA(knn_rese)

In [48]:
is.na(knn_rese)

ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE
FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE


In [49]:
head(knn_rese)

ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
1.0,25,1.0,49.0,91107.0,4,1.6,1,0,0,1,0,0,0
2.0,45,19.0,54.13027,90089.0,3,1.5,1,0,0,1,0,0,0
3.0,39,15.0,11.0,94720.0,1,1.0,1,0,0,0,0,0,0
2320.558,35,9.0,100.0,92383.99,1,2.7,2,0,0,0,0,0,0
5.0,35,10.56222,45.0,91330.0,4,1.0,2,0,0,0,0,0,1
2122.2,37,13.0,29.0,93758.31,4,0.4,2,155,0,0,0,1,0
