## Load libraries

In [1]:
library(outlierensembles)
library(DDoutlier)

## Load data

In [2]:
# Locate the input matrix inside the "data" folder of the working directory.
data.dir <- file.path(getwd(), "data")
data.matrix.original <- file.path(data.dir, "data_matrix_original.csv")

## Format data

In [3]:
# Read the feature matrix, set the identifier columns aside for the export
# step, and reduce `data` to the columns that are handed to the detectors.
data <- read.csv(data.matrix.original)
rownames(data) <- data$micro_specimen_id

# Select the identifiers by name rather than by position: the export step
# labels these two columns "micro_specimen_id" and "person_id", in that
# order, so a positional pick would silently mislabel them if the column
# order of the CSV ever changed.
index <- data[, c("micro_specimen_id", "person_id")]

# Remove `X` (presumably the unnamed index column of the CSV) and the
# identifiers from the detector inputs.
data <- subset(data, select = -c(X, micro_specimen_id, person_id))

# Missing values are replaced with 0.
# NOTE(review): the features look standardised, so 0 presumably stands for
# the column mean - confirm this is the intended imputation.
data[is.na(data)] <- 0

# NOTE(review): the *_mortality outcome columns stay in `data` and therefore
# feed the outlier detectors - confirm that is intended.
head(data)

Unnamed: 0_level_0,seven_day_mortality,fourteen_day_mortality,twentyone_day_mortality,twentyeight_day_mortality,sixty_day_mortality,ninety_day_mortality,onetwenty_day_mortality,Ambulatory.Clinic...Center,Ambulatory.Surgical.Center,Emergency.Room...Hospital,⋯,bicarbonate_first,hemoglobin_first,creatinine_first,potassium_last,chloride_last,glucose_last,sodium_last,bicarbonate_last,hemoglobin_last,creatinine_last
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
9769904,0,0,0,0,0,0,0,0,0,1,⋯,0.232437,0.6725906,-0.8264525,-0.05648535,-0.2134402,-0.7724418,-0.6155762,0.04677541,0.3824809,-0.7101768
2985612,0,0,0,0,0,0,0,0,0,1,⋯,-0.9051665,-1.3038165,-0.6865942,-1.60893745,1.8592971,-0.3449661,1.0887199,-0.71820295,0.3230233,-0.4964765
2871076,0,0,0,0,0,0,0,0,0,1,⋯,0.232437,-0.3916286,-0.6865942,-0.74646406,1.1190338,-0.7724418,0.8993537,0.4292646,2.7013267,-0.6389433
6894504,0,0,0,0,1,1,1,0,0,1,⋯,-0.1467641,0.7232677,-0.7565233,0.97848271,-0.6575982,-0.3077943,-0.2368437,0.4292646,1.0959719,-0.7814102
8752252,1,1,1,1,1,1,1,0,1,0,⋯,1.3700405,-0.6956912,-0.4068774,-0.57396938,0.5268231,-0.9954726,0.331255,0.23802,-0.5093829,-0.7101768
231719,0,0,0,0,0,0,0,0,0,1,⋯,0.6116382,-0.8477225,0.1525561,-0.22898003,0.0826651,-0.7724418,0.1418888,0.23802,-0.4499253,0.4295579


## Unsupervised outlier detection algorithms

In [4]:
# Outlier score per row from DDoutlier's KNN_AGG detector, run with the
# package defaults (only the data is passed).
y1 <- DDoutlier::KNN_AGG(dataset = data)

In [5]:
# Outlier score per row from DDoutlier's LOF detector, run with the package
# defaults (only the data is passed).
y2 <- DDoutlier::LOF(dataset = data)

In [6]:
# y3 <- DDoutlier::COF(data, k=10)

In [7]:
# Outlier score per row from DDoutlier's INFLO detector, run with the package
# defaults (only the data is passed).
y4 <- DDoutlier::INFLO(dataset = data)

In [8]:
# Outlier score per row from DDoutlier's KDEOS detector, run with the package
# defaults (only the data is passed).
y5 <- DDoutlier::KDEOS(dataset = data)

In [9]:
# Result of DDoutlier's LDF detector, run with the package defaults (only the
# data is passed). The export step reads the scores from its `LDF` element.
y6 <- DDoutlier::LDF(dataset = data)

In [10]:
# y7 <- DDoutlier::LDOF(data, k=10)

In [20]:
# Score table passed to the ensemble: one column per detector, one row per
# specimen. Wider combinations that were tried earlier are kept for reference:
# Y <- cbind.data.frame(y1, y2, y3, y4, y5, y6$LDF, y7)
# Y <- cbind.data.frame(y1, y2, y4, y6$LDF)
Y <- data.frame(y1 = y1, y4 = y4, check.names = FALSE)

In [23]:
# Preview the first six rows of the detector score table.
head(Y, n = 6L)

Unnamed: 0_level_0,y1,y4
Unnamed: 0_level_1,<dbl>,<dbl>
1,273.5416,0.8958666
2,331.3549,1.0
3,318.0614,1.421791
4,401.9014,1.2877158
5,453.746,1.0
6,303.7111,0.9094021


## Item Response Theory (IRT) ensemble

In [22]:
# Fit the IRT ensemble on the detector scores in `Y`.
#
# The recorded run of this cell stopped with "missing value where TRUE/FALSE
# needed" after "no non-missing arguments to max/min" warnings, which is what
# a score column without any usable value produces. The scores are therefore
# repaired before fitting, without changing the number of rows (the export
# step binds the ensemble scores next to the original data):
#   * +Inf / -Inf become the largest / smallest finite score of the detector
#   * NA / NaN become the median of the detector's finite scores
#   * detectors left without any spread are excluded from the fit
# NOTE(review): non-finite scores are the presumed cause (e.g. a density-based
# detector dividing by a zero distance on duplicated rows) - confirm on the
# actual data, and confirm the replacement values suit the analysis.

# Replace non-finite entries of one detector's score vector; vectors that are
# entirely finite or entirely non-finite are returned unchanged.
repair_scores <- function(values) {
  usable <- is.finite(values)
  if (all(usable) || !any(usable)) {
    return(values)
  }
  values[which(values == Inf)] <- max(values[usable])
  values[which(values == -Inf)] <- min(values[usable])
  values[is.na(values)] <- median(values[usable])
  values
}

n_non_finite <- sum(!is.finite(as.matrix(Y)))
Y_clean <- Y
Y_clean[] <- lapply(Y, repair_scores)

# A detector is usable when all of its scores are finite and not constant;
# a constant column cannot be rescaled to a range.
has_spread <- vapply(
  Y_clean,
  function(values) all(is.finite(values)) && diff(range(values)) > 0,
  logical(1)
)

if (n_non_finite > 0) {
  warning(
    n_non_finite, " non-finite detector score(s) found in Y; ",
    "replaced wherever the detector also had finite scores.",
    call. = FALSE
  )
}
if (!all(has_spread)) {
  warning(
    "Detector(s) without usable spread left out of the ensemble: ",
    paste(names(Y_clean)[!has_spread], collapse = ", "),
    call. = FALSE
  )
}
if (!any(has_spread)) {
  stop("No detector produced usable scores for the ensemble.", call. = FALSE)
}

ens1 <- irt_ensemble(Y_clean[, has_spread, drop = FALSE])

“no non-missing arguments to max; returning -Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to max; returning -Inf”


ERROR: Error in while (abs(d) > converge && iter < max.EMCycle) {: missing value where TRUE/FALSE needed


## IRT ensemble scores

In [None]:
# Summary statistics of the ensemble anomaly scores.
summary(ens1[["scores"]])

In [None]:
# Histogram of the ensemble scores; the square root of the number of scores
# is passed as `breaks`.
hist(ens1$scores, breaks = sqrt(length(ens1$scores)))

In [None]:
# Box plot of the ensemble anomaly scores.
boxplot(ens1[["scores"]], ylab = "Ensemble anomaly scores")

## Export

In [None]:
# Assemble the export table column-wise, in this order: identifiers, detector
# inputs, the individual detector scores, and the ensemble score.
# y2, y5 and y6$LDF are exported although only y1 and y4 feed the ensemble.
df <- cbind.data.frame(index, data, y1, y2, y4, y5, y6$LDF, ens1$scores)

In [None]:
# Name the export columns: the two identifiers, the original feature names,
# one column per detector score, and the ensemble score.
colnames(df) <- c(
  "micro_specimen_id", "person_id",
  colnames(data),
  "y_knn_agg", "y_lof", "y_inflo", "y_kdeos", "y_ldf",
  "ensemble_scores"
)

In [None]:
# Preview the first six rows of the export table.
head(df, n = 6L)

In [None]:
# Write the export table next to the input matrix, without a row-name column.
data.dir <- file.path(getwd(), "data")
data.matrix.final <- file.path(data.dir, "data_matrix_final.csv")
write.csv(df, file = data.matrix.final, row.names = FALSE)