In [1]:
library(dplyr, warn.conflicts = FALSE)
library(moments)
library(ggplot2)
library(repr)
source('multiplot.r')
options(repr.plot.width=10, repr.plot.height=2) 

In [2]:
# Read the whitespace-delimited housing file (it has no header row) and
# label its 14 columns.
housing_data <- read.table(file = '../data/housing.data', header = FALSE, sep = "")
names(housing_data) <- c('CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                         'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV')
# Drop MEDV and turn CHAS into a factor, so that CHAS is left out of the
# numeric-only feature table built on the last line.
housing_data[['MEDV']] <- NULL
housing_data[['CHAS']] <- factor(housing_data[['CHAS']])
housing_features <- housing_data[vapply(housing_data, is.numeric, logical(1))]

# Outlier Analysis (Extra Credit)

### Use Tukey's method to identify outliers for each feature

In [3]:
#' Print and return the rows of `df` that are Tukey outliers for one column.
#'
#' A value is flagged when it lies more than `param` times the inter-quartile
#' range below the first quartile or above the third quartile. The quartiles
#' and the dimensions of the result are printed as a side effect.
#'
#' @param col_ Name of the column to test.
#' @param df Data frame containing `col_`.
#' @param param Multiplier applied to the inter-quartile range (default 1.5).
#' @return A data frame with the outlier rows of `df`, all columns kept.
display_outliers <- function(col_, df, param=1.5){
    values <- df[[col_]]
    # Missing values are ignored when estimating the quartiles; without
    # na.rm = TRUE quantile() stops with an error if the column holds an NA.
    quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    Q1 <- quartiles[1]
    Q3 <- quartiles[2]
    cat("Q1: ", Q1, "and Q3: ", Q3, "\n")
    tukey_window <- param * (Q3 - Q1)
    is_outlier <- values < (Q1 - tukey_window) | values > (Q3 + tukey_window)
    # which() discards NA comparisons (no all-NA phantom rows), and
    # drop = FALSE keeps a data frame even when `df` has a single column.
    outliers <- df[which(is_outlier), , drop = FALSE]
    cat(col_,"::", dim(outliers),"\n", "\n", sep=" ")
    outliers
}

# Run the Tukey outlier report for every numeric feature and collect the
# results.
# NOTE(review): each call returns a data frame with the same number of
# columns, so sapply() presumably simplifies the results into a list-matrix
# before data.frame() wraps it -- confirm this shape is what is wanted.
outlier_df <- data.frame(
    sapply(
        X = names(housing_features),
        FUN = display_outliers,
        df = housing_features
    )
)

# for(i in colnames(housing_features)) {
#     x <- display_outliers(i, housing_features)
#     cat(i,"::", dim(x),"\n", "\n", sep=" ")
# }

# x

Q1:  0.082045 and Q3:  3.677083 
CRIM :: 66 12 
 
Q1:  0 and Q3:  12.5 
ZN :: 68 12 
 
Q1:  5.19 and Q3:  18.1 
INDUS :: 0 12 
 
Q1:  0.449 and Q3:  0.624 
NOX :: 0 12 
 
Q1:  5.8855 and Q3:  6.6235 
RM :: 30 12 
 
Q1:  45.025 and Q3:  94.075 
AGE :: 0 12 
 
Q1:  2.100175 and Q3:  5.188425 
DIS :: 5 12 
 
Q1:  4 and Q3:  24 
RAD :: 0 12 
 
Q1:  279 and Q3:  666 
TAX :: 0 12 
 
Q1:  17.4 and Q3:  20.2 
PTRATIO :: 15 12 
 
Q1:  375.3775 and Q3:  396.225 
B :: 77 12 
 
Q1:  6.95 and Q3:  16.955 
LSTAT :: 7 12 
 


### Identify each instance that is an outlier for more than one feature

In [4]:
#' Return the row names of `df` that are Tukey outliers for one column.
#'
#' A value is flagged when it lies more than `param` times the inter-quartile
#' range below the first quartile or above the third quartile. The quartiles
#' are printed as a side effect.
#'
#' @param col_ Name of the column to test.
#' @param df Data frame containing `col_`.
#' @param param Multiplier applied to the inter-quartile range (default 1.5).
#' @return Character vector of row names; empty when there are no outliers.
get_outlier_rows <- function(col_, df, param=1.5){
    values <- df[[col_]]
    # Missing values are ignored when estimating the quartiles; without
    # na.rm = TRUE quantile() stops with an error if the column holds an NA.
    quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    Q1 <- quartiles[1]
    Q3 <- quartiles[2]
    cat("Q1: ", Q1, "and Q3: ", Q3, "\n")
    tukey_window <- param * (Q3 - Q1)
    is_outlier <- values < (Q1 - tukey_window) | values > (Q3 + tukey_window)
    # Index the row names directly instead of subsetting `df` first: a
    # one-column data frame would collapse to a vector and lose its row names.
    rownames(df)[which(is_outlier)]
}

# Collect, per numeric feature, the row names flagged as outliers.
outlier_rows <- sapply(
    X = names(housing_features),
    FUN = get_outlier_rows,
    df = housing_features
)
# Tally how many features flag each row (Var1 = row name, Freq = count).
flagged_row_names <- unlist(outlier_rows)
row_df <- data.frame(table(flagged_row_names))
# Show the rows that are outliers for more than one feature.
row_df$Var1[row_df$Freq > 1]

Q1:  0.082045 and Q3:  3.677083 
Q1:  0 and Q3:  12.5 
Q1:  5.19 and Q3:  18.1 
Q1:  0.449 and Q3:  0.624 
Q1:  5.8855 and Q3:  6.6235 
Q1:  45.025 and Q3:  94.075 
Q1:  2.100175 and Q3:  5.188425 
Q1:  4 and Q3:  24 
Q1:  279 and Q3:  666 
Q1:  17.4 and Q3:  20.2 
Q1:  375.3775 and Q3:  396.225 
Q1:  6.95 and Q3:  16.955 


### Assess what percentage of the total data are outliers for:
    - one feature
    - two features
    - other

In [5]:
# Count the instances flagged as outliers for exactly one, exactly two, and
# three or more features, then print each count as a fraction of all rows.
# The denominator is taken from the data rather than hard-coded, so the
# figures stay correct if the input file changes size.
total_rows <- nrow(housing_features)
one_feature_outlier <- sum(row_df$Freq == 1)
two_feature_outlier <- sum(row_df$Freq == 2)
three_or_more_feature_outlier <- sum(row_df$Freq > 2)
cat("1 feature outlier", one_feature_outlier / total_rows, '\n')
cat("2 feature outlier", two_feature_outlier / total_rows, '\n')
cat("3 or more feature outlier", three_or_more_feature_outlier / total_rows, '\n')

1 feature outlier 0.3083004 
2 feature outlier 0.09090909 
3 or more feature outlier 0.01185771 


### Come up with a plan for handling outliers

    1. Imputation with mean / median / mode
    2. Capping: replace observations below the lower limit with the 5th percentile value,
        and observations above the upper limit with the 95th percentile value
    3. Replacing the value with NaN