# Eksempler på imputering

Laster først inn pakkene som skal bli brukt

In [None]:
library(tidyverse)
library(plotly)
library(dcmodify)
library(simputation)
library(lumberjack)

# Datasett som skal bli brukt i eksempel

Datasettet heter women og inneholder vekt og høyde for 15 kvinner

In [None]:
# Load data ----
data(women)

# Add metric columns next to the original ones: `hoyde` is `height` scaled by
# 2.54 (cm) and `vekt` is `weight` scaled by 0.453592 (kg), both rounded to
# one decimal.
women <- mutate(
  women,
  hoyde = round(height * 2.54, 1),
  vekt = round(weight * 0.453592, 1)
)
women

# Build the graphic ----
# Scatter plot of weight (y) against height (x). The hover template prints
# the axis titles followed by the y and x value of the point.
fig1 <- plot_ly(
  data = women,
  x = ~hoyde,
  y = ~vekt,
  type = "scatter",
  mode = "markers",
  hovertemplate = paste(
    "%{yaxis.title.text}: %{y:}<br>",
    "%{xaxis.title.text}: %{x:}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = "Datasettet Women",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "Vekt")
  )
fig1


Legger inn noen feil i datasettet og beregner KMI (BMI)

In [None]:
# Introduce errors ----
# Row 10: the answer is given in the original unit of the `height` column
# instead of cm.
women[10, "hoyde"] <- women[10, "height"]
# Row 8: the value was keyed in with a minus sign.
women[8, "hoyde"] <- -women[8, "hoyde"]

# KMI (BMI): weight in kg divided by the squared height in metres. It is
# computed after the errors above, so the faulty heights carry over.
women <- mutate(women, kmi = vekt / (hoyde / 100)^2)

# Blank out the KMI of rows 2 and 5.
women[c(2, 5), "kmi"] <- NA


# Build the graphics ----
# Weight against height for the data with the injected errors.
fig2 <- plot_ly(
  data = women,
  x = ~hoyde,
  y = ~vekt,
  type = "scatter",
  mode = "markers",
  hovertemplate = paste(
    "%{yaxis.title.text}: %{y:}<br>",
    "%{xaxis.title.text}: %{x:}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = "Datasettet Women",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "Vekt")
  )
fig2

# KMI against height for the data with the injected errors.
fig3 <- plot_ly(
  data = women,
  x = ~hoyde,
  y = ~kmi,
  type = "scatter",
  mode = "markers",
  hovertemplate = paste(
    "%{yaxis.title.text}: %{y:}<br>",
    "%{xaxis.title.text}: %{x:}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = "Datasettet Women",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI")
  )
fig3

#### Setter opp regelretting med pakken dcmodify

In [None]:
# Correction rules ----
# The rules are written in the order they are meant to take effect
# (NOTE(review): relies on dcmodify applying rules sequentially - confirm
# against the package documentation):
#   1. A negative height is treated as a sign error and made positive.
#   2. A height below 145 is taken to be in inches and converted to cm. The
#      result is rounded to one decimal, the same precision used when `hoyde`
#      was first derived, so the corrected value matches the other rows.
#   3. A missing KMI, or one above 40, is recomputed from weight and height.
m <- modifier(
  if (hoyde < 0) hoyde <- abs(hoyde),
  if (hoyde < 145) hoyde <- round(hoyde * 2.54, 1),
  if (is.na(kmi) | kmi > 40) kmi <- vekt / (hoyde / 100)^2
)

# Run the rule-based correction
women_out1 <- modify(women, m)



In [None]:
# Build the graphics ----
# Weight against height after the rule-based correction.
fig2 <- plot_ly(
  data = women_out1,
  x = ~hoyde,
  y = ~vekt,
  type = "scatter",
  mode = "markers",
  hovertemplate = paste(
    "%{yaxis.title.text}: %{y:}<br>",
    "%{xaxis.title.text}: %{x:}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = "Datasettet Women",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "Vekt")
  )
fig2

# KMI against height after the rule-based correction.
fig3 <- plot_ly(
  data = women_out1,
  x = ~hoyde,
  y = ~kmi,
  type = "scatter",
  mode = "markers",
  hovertemplate = paste(
    "%{yaxis.title.text}: %{y:}<br>",
    "%{xaxis.title.text}: %{x:}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = "Datasettet Women",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI")
  )
fig3

# Del 2: Imputering med donor og modeller med pakken simputation

In [None]:
# Load data ----
# Reload the untouched data set and derive the working columns:
#   id      - row number
#   hoyde   - height in cm, one decimal
#   vekt    - weight in kg, one decimal
#   kmi     - KMI (BMI), one decimal; some values are blanked out below
#   kmi_org - untouched copy of kmi, kept as the reference for the error
#             measure
women <- women_raw <- NULL
data(women)

women <- women %>%
  mutate(
    id = row_number(),
    hoyde = round(height * 2.54, 1),
    vekt = round(weight * 0.453592, 1),
    kmi = round(vekt / (hoyde / 100)^2, 1),
    kmi_org = kmi
  )

# Remove 4 values that are going to be imputed
women$kmi[c(2, 5, 8, 9)] <- NA

# Imputation flag: 2 where kmi is missing (to be imputed), 1 where observed
women <- women %>% mutate(imp = ifelse(is.na(kmi), 2, 1))

women

# KMI against height before imputation; rows with missing KMI have no
# y value to plot.
fig <- plot_ly(
  data = women,
  x = ~hoyde,
  y = ~kmi,
  type = "scatter",
  mode = "markers",
  hovertemplate = paste(
    "%{yaxis.title.text}: %{y:}<br>",
    "%{xaxis.title.text}: %{x:}<br>",
    "<extra></extra>"
  )
) %>%
  layout(
    title = "Datasettet Women",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI")
  )
fig

# Imputering med gjennomsnittet

In [None]:
# Mean imputation ----
# Every missing kmi is replaced by the mean of the observed kmi values.
# `feil` is the squared difference between the (imputed) kmi and the
# reference value kmi_org.
women_1 <- women %>%
  impute_proxy(kmi ~ mean(kmi, na.rm = TRUE)) %>%
  mutate(feil = (kmi - kmi_org)^2)

# Root mean squared error over all rows. The row count comes from the data
# (n()) instead of a hard-coded 15, and the result column is named.
rmse1 <- women_1 %>%
  summarize(rmse = round(sqrt(sum(feil) / n()), 2))
rmse1

# KMI against height, one trace per value of the imputation flag `imp`.
# The legend title describes that flag (it previously read 'Outlier:').
fig <- women_1 %>%
  plot_ly(
    x = ~hoyde,
    y = ~kmi,
    type = "scatter",
    mode = "markers",
    split = ~imp,
    hovertemplate = paste(
      "%{yaxis.title.text}: %{y:}<br>",
      "%{xaxis.title.text}: %{x:}<br>",
      "<extra></extra>"
    )
  ) %>%
  layout(
    title = "Gjennomsitts imputering",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI"),
    legend = list(title = list(text = "imp (1 = observert, 2 = imputert):"))
  )
fig

# Imputering med gjennomsnittet i gruppen

In [None]:
# Create the grouping variable ----
# Height classes (0, 155], (155, 165], (165, 175], (175, 190] labelled
# gr1..gr4. The column is referenced directly inside mutate() instead of
# through `women$hoyde`, so the expression always uses the piped data.
women <- women %>%
  mutate(
    gruppe = cut(
      hoyde,
      breaks = c(0, 155, 165, 175, 190),
      labels = c("gr1", "gr2", "gr3", "gr4")
    )
  )

# Missing kmi is replaced by the mean of the observed kmi within its group.
women_2 <- women %>%
  impute_proxy(kmi ~ mean(kmi, na.rm = TRUE) | gruppe) %>%
  mutate(feil = (kmi - kmi_org)^2)

# Root mean squared error over all rows (row count taken from the data).
rmse2 <- women_2 %>%
  summarize(rmse = round(sqrt(sum(feil) / n()), 2))
rmse2


# Describe a vertical line as a list in the form passed to
# layout(shapes = ...) further down in this file.
#
# x:     position of the line on the x axis (used for both end points).
# color: colour of the line.
#
# Returns a list with type "line" running from y0 = 0 to y1 = 1 with
# yref = "paper" (presumably the full height of the plotting area - see the
# plotly shapes reference).
vline <- function(x = 0, color = "red") {
  stroke <- list(color = color)
  list(
    type = "line",
    y0 = 0, y1 = 1, yref = "paper",
    x0 = x, x1 = x,
    line = stroke
  )
}

# KMI against height after group-wise mean imputation, one trace per value
# of the imputation flag `imp`. Vertical lines mark the group boundaries at
# 155, 165 and 175. The legend title describes the flag (it previously read
# 'Outlier:').
fig2 <- women_2 %>%
  plot_ly(
    x = ~hoyde,
    y = ~kmi,
    type = "scatter",
    mode = "markers",
    split = ~imp,
    hovertemplate = paste(
      "%{yaxis.title.text}: %{y:}<br>",
      "%{xaxis.title.text}: %{x:}<br>",
      "<extra></extra>"
    )
  ) %>%
  layout(
    title = "Gjennomsitts imputering per gruppe ",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI"),
    legend = list(title = list(text = "imp (1 = observert, 2 = imputert):")),
    shapes = list(vline(155), vline(165), vline(175))
  )
fig2
#layout(shapes = list(vline(4), hline(5)))

# Nærmeste nabo imputering



In [None]:

# Nearest-neighbour imputation ----
# Missing kmi is imputed with impute_knn using vekt and hoyde as matching
# variables and k = 1.
women_3 <- women %>%
  impute_knn(kmi ~ vekt + hoyde, k = 1) %>%
  mutate(feil = (kmi - kmi_org)^2)

# Root mean squared error over all rows (row count taken from the data
# instead of a hard-coded 15).
rmse3 <- women_3 %>%
  summarize(rmse = round(sqrt(sum(feil) / n()), 2))
rmse3

# KMI against height, one trace per value of the imputation flag `imp`.
# The legend title describes that flag (it previously read 'Outlier:').
fig3 <- women_3 %>%
  plot_ly(
    x = ~hoyde,
    y = ~kmi,
    type = "scatter",
    mode = "markers",
    split = ~imp,
    hovertemplate = paste(
      "%{yaxis.title.text}: %{y:}<br>",
      "%{xaxis.title.text}: %{x:}<br>",
      "<extra></extra>"
    )
  ) %>%
  layout(
    title = "Nærmeste nabo imputering",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI"),
    legend = list(title = list(text = "imp (1 = observert, 2 = imputert):"))
  )
fig3

# Imputering med lineær regresjon

In [None]:
# Check the model first ----
# Linear regression of kmi on hoyde.
fit <- lm(kmi ~ hoyde, data = women)
summary(fit)

# Observed points together with the fitted line. predict(fit) without new
# data is attached to the rows where kmi is observed, which is why the data
# are filtered on !is.na(kmi) first.
plot_reg <- women %>%
  filter(!is.na(kmi)) %>%
  mutate(pred = predict(fit)) %>%
  plot_ly(
    x = ~hoyde,
    y = ~kmi,
    type = "scatter",
    mode = "markers"
  ) %>%
  add_trace(y = ~pred, mode = "lines")
plot_reg


In [None]:
# Impute with the model ----
# Missing kmi is imputed with impute_lm using hoyde as the predictor.
women_4 <- women %>%
  impute_lm(kmi ~ hoyde) %>%
  mutate(feil = (kmi - kmi_org)^2)

# Root mean squared error over all rows (row count taken from the data
# instead of a hard-coded 15).
rmse4 <- women_4 %>%
  summarize(rmse = round(sqrt(sum(feil) / n()), 2))
rmse4

# KMI against height, one trace per value of the imputation flag `imp`.
# The legend title describes that flag (it previously read 'Outlier:').
fig4 <- women_4 %>%
  plot_ly(
    x = ~hoyde,
    y = ~kmi,
    type = "scatter",
    mode = "markers",
    split = ~imp,
    hovertemplate = paste(
      "%{yaxis.title.text}: %{y:}<br>",
      "%{xaxis.title.text}: %{x:}<br>",
      "<extra></extra>"
    )
  ) %>%
  layout(
    title = "Regresjons imputering",
    xaxis = list(title = "Høyde"),
    yaxis = list(title = "KMI"),
    legend = list(title = list(text = "imp (1 = observert, 2 = imputert):"))
  )
fig4


In [None]:
# Predictive mean matching (pmm, predictor = impute_lm) ----
women_6 <- women %>% impute_pmm(kmi ~ vekt + hoyde)
women_6$feil <- (women_6$kmi - women_6$kmi_org)^2
# Root mean squared error over all rows; the row count is taken from the
# data instead of a hard-coded 15.
rmse6 <- round(sqrt(sum(women_6$feil) / nrow(women_6)), 2)

# Points are coloured by the imputation flag (1 = observed, 2 = imputed).
# The x label is written with a proper "ø" (it was garbled to "h?yde").
plot(
  women_6$hoyde, women_6$kmi,
  col = women_6$imp, xlab = "høyde", ylab = "kmi", pch = 19,
  main = paste("predictiv mean matching. RMSE: ", rmse6)
)
legend(
  "topright",
  legend = c("Observert", "Imputert"),
  col = 1:2, pch = 19, bty = "n"
)

# Overlay the predictions of a linear model. The original line referred to
# `women_5` and `linear_model`, neither of which is defined in this file, so
# the call failed. The model is refit here with the same formula as the
# imputation - NOTE(review): confirm this is the model that was intended.
pmm_model <- lm(kmi ~ vekt + hoyde, data = women)
lines(women_6$hoyde, predict(pmm_model, newdata = women_6), col = "red")

# The original cell also opened the help page with `?impute_shd`; run that
# interactively if needed.

# Random hot deck ----
# Missing kmi is filled by impute_rhd with no grouping (kmi ~ 1). The donors
# are drawn at random, so the result depends on the RNG state; call
# set.seed() beforehand if a reproducible result is needed.
women_7 <- women %>% impute_rhd(kmi ~ 1, pool = "complete")
women_7$feil <- (women_7$kmi - women_7$kmi_org)^2
# Root mean squared error over all rows; the row count is taken from the
# data instead of a hard-coded 15.
rmse7 <- round(sqrt(sum(women_7$feil) / nrow(women_7)), 2)

# Points are coloured by the imputation flag (1 = observed, 2 = imputed).
# The x label is written with a proper "ø" (it was garbled to "h?yde").
plot(
  women_7$hoyde, women_7$kmi,
  col = women_7$imp, xlab = "høyde", ylab = "kmi", pch = 19,
  main = paste("Random hotdeck. RMSE: ", rmse7)
)
legend(
  "topright",
  legend = c("Observert", "Imputert"),
  col = 1:2, pch = 19, bty = "n"
)

# Random hot deck, grouped ----
# Same as the ungrouped random hot deck, but split by `gruppe` (kmi ~ 1 |
# gruppe). The draw is random, so the result depends on the RNG state.
women_8 <- women %>% impute_rhd(kmi ~ 1 | gruppe, pool = "complete")
women_8$feil <- (women_8$kmi - women_8$kmi_org)^2
# Root mean squared error over all rows; the row count is taken from the
# data instead of a hard-coded 15.
rmse8 <- round(sqrt(sum(women_8$feil) / nrow(women_8)), 2)

# Points are coloured by the imputation flag (1 = observed, 2 = imputed).
# The title now says "gruppert" so the plot can be told apart from the
# ungrouped one, and the x label is written with a proper "ø".
plot(
  women_8$hoyde, women_8$kmi,
  col = women_8$imp, xlab = "høyde", ylab = "kmi", pch = 19,
  main = paste("Random hotdeck gruppert. RMSE: ", rmse8)
)
legend(
  "topright",
  legend = c("Observert", "Imputert"),
  col = 1:2, pch = 19, bty = "n"
)
# Group boundaries
abline(v = c(155, 165, 175))
