## Imports

In [1]:
# DO NOT CHANGE THESE LINES.
suppressWarnings(
  suppressMessages({
    library(tidyverse)
    library(jsonlite)
    library(fastDummies)
    library(nnet)
  })
)

## Paths

In [2]:
# DO NOT CHANGE THESE LINES.
# All paths are derived from the parent of the current working directory.
ROOT_DIR <- dirname(getwd())
MODEL_INPUTS_OUTPUTS <- file.path(ROOT_DIR, 'model_inputs_outputs')
# Inputs: schema JSON plus training/testing data folders.
INPUT_DIR <- file.path(MODEL_INPUTS_OUTPUTS, "inputs")
OUTPUT_DIR <- file.path(MODEL_INPUTS_OUTPUTS, "outputs")
INPUT_SCHEMA_DIR <- file.path(INPUT_DIR, "schema")
DATA_DIR <- file.path(INPUT_DIR, "data")
TRAIN_DIR <- file.path(DATA_DIR, "training")
TEST_DIR <- file.path(DATA_DIR, "testing")
# Model artifacts (.rds files) that this notebook reads back with readRDS().
MODEL_PATH <- file.path(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH <- file.path(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'ohe.rds')
PREDICTOR_DIR_PATH <- file.path(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH <- file.path(PREDICTOR_DIR_PATH, "predictor.rds")
IMPUTATION_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'imputation.rds')
# Destination of the predictions CSV written at the end of the notebook.
PREDICTIONS_DIR <- file.path(OUTPUT_DIR, 'predictions')
PREDICTIONS_FILE <- file.path(PREDICTIONS_DIR, 'predictions.csv')
LABEL_ENCODER_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'label_encoder.rds')
ENCODED_TARGET_FILE <- file.path(MODEL_ARTIFACTS_PATH, "encoded_target.rds")
TOP_3_CATEGORIES_MAP <- file.path(MODEL_ARTIFACTS_PATH, "top_3_map.rds")

# Create the predictions folder (and any missing parents) on first run.
if (!dir.exists(PREDICTIONS_DIR)) {
  dir.create(PREDICTIONS_DIR, recursive = TRUE)
}

### Reading the schema

In [3]:
# Locate the schema file. `pattern` is a regular expression (not a glob), so
# anchor on the literal ".json" extension instead of using "*.json".
file_name <- list.files(INPUT_SCHEMA_DIR, pattern = "\\.json$")[1]
if (is.na(file_name)) {
  stop("No .json schema file found in ", INPUT_SCHEMA_DIR, call. = FALSE)
}
schema <- fromJSON(file.path(INPUT_SCHEMA_DIR, file_name))
features <- schema$features

# Split the feature names by declared data type: everything that is not
# tagged CATEGORICAL is treated as numeric.
numeric_features <- features$name[features$dataType != "CATEGORICAL"]
categorical_features <- features$name[features$dataType == "CATEGORICAL"]

# Remaining schema fields used later in the notebook.
id_feature <- schema$id$name
target_feature <- schema$target$name
target_classes <- schema$target$classes
model_category <- schema$modelCategory

### Reading test data.

In [4]:
# Reading test data.
# `pattern` is a regular expression (not a glob), so match the literal ".csv"
# extension at the end of the file name.
file_name <- list.files(TEST_DIR, pattern = "\\.csv$", full.names = TRUE)[1]
if (is.na(file_name)) {
  stop("No .csv test file found in ", TEST_DIR, call. = FALSE)
}
# check.names = FALSE keeps the header names exactly as written in the file.
# Letting read.csv parse the header (instead of splitting the first line on
# ",") also handles quoted column names correctly.
df <- read.csv(file_name, check.names = FALSE)

## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [5]:
# Fill missing values with the per-column values saved in IMPUTATION_FILE.
imputation_values <- readRDS(IMPUTATION_FILE)

# vapply() always returns a logical vector, even for a zero-column frame.
columns_with_na <- names(df)[vapply(df, anyNA, logical(1))]

for (column in columns_with_na) {
  # Fail with a clear message rather than an opaque assignment error when no
  # value was saved for this column.
  if (!(column %in% names(imputation_values))) {
    stop(
      "No imputation value available for column '", column, "'",
      call. = FALSE
    )
  }
  df[[column]][is.na(df[[column]])] <- imputation_values[[column]]
}

In [6]:
# Saving the id column in a different variable and then dropping it.
# Guard first: without the id column `ids` would silently be NULL and the
# output frame built later would be mislabelled.
if (!(id_feature %in% names(df))) {
  stop("Id column '", id_feature, "' not found in test data", call. = FALSE)
}
ids <- df[[id_feature]]
df[[id_feature]] <- NULL
head(df)

Unnamed: 0_level_0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0.00028408,-0.00016291,-4.502e-05,0.01452343,-0.00433185,-0.00132287,-2.617e-05,-0.00061623,0.00010604,0.00877363,...,-0.00018344,-0.00014018,0.00033056,0.01521332,0.00062583,0.00890927,0.00015812,0.00211354,0.00020266,0.00218619
2,-6.228e-05,0.00043633,0.00016562,0.00139257,-0.00044223,0.00020754,0.00097774,0.00051748,-6.84e-06,-9.722e-05,...,0.00060851,0.00012892,0.00047084,0.00147577,0.00110626,0.00240156,0.00053823,0.00161534,0.00086654,0.00062205
3,0.00886443,-0.02939425,0.00264778,0.01571359,0.03180669,-0.00073439,0.00844243,0.0135821,-0.00118457,0.01208746,...,0.0096773,0.0003565,0.03081575,0.0354841,0.01603594,0.02922327,0.01310763,0.01207336,0.01651852,0.01059408
4,0.02145489,0.01743992,0.00379467,0.01339493,0.00076531,-0.0007695,0.0213114,0.01376909,0.00428827,0.00982688,...,0.00321702,-0.00296212,0.02790811,0.01343883,0.02573233,0.01053071,0.02243118,0.00862146,0.01722895,0.00445101
5,0.00147721,-0.00178811,-0.00497505,0.0011258,0.00857101,0.0084882,0.001201,-0.00063487,-0.00572473,0.00506914,...,0.00113801,-0.00024614,0.00548913,0.01211525,0.00588371,0.00949511,0.00331269,0.00152415,0.00206997,0.00133677
6,0.00552926,0.00437267,-0.00073831,0.02496931,-0.03973564,-0.00253208,0.00158065,0.00165054,-0.00021593,0.01683546,...,0.0007397,1.593e-05,0.00708788,0.04699787,0.00229551,0.03422393,0.00155,0.00759593,0.00073124,0.00145763


### Encoding
We encode the data using the same encoder that we saved during training.

In [7]:
# One-hot encode the categorical features so the test frame has the same
# encoded columns as the saved artifact lists.
if (length(categorical_features) > 0 && file.exists(OHE_ENCODER_FILE)) {
  top_3_map <- readRDS(TOP_3_CATEGORIES_MAP)
  # Read once; used below as the collection of expected encoded column names.
  encoded_columns <- readRDS(OHE_ENCODER_FILE)

  for (col in categorical_features) {
    # Use the saved top 3 categories to replace values outside these
    # categories with 'Other'.
    df[[col]][!(df[[col]] %in% top_3_map[[col]])] <- "Other"
  }

  test_df_encoded <- dummy_cols(
    df,
    select_columns = categorical_features,
    remove_selected_columns = TRUE
  )

  # Add missing columns with 0s (categories absent from the test data).
  for (col in encoded_columns) {
    if (!(col %in% colnames(test_df_encoded))) {
      test_df_encoded[[col]] <- 0
    }
  }

  # Remove extra columns: anything that is neither an original column nor an
  # expected encoded column.
  extra_cols <- setdiff(
    colnames(test_df_encoded),
    c(colnames(df), encoded_columns)
  )
  # drop = FALSE keeps a data frame even when a single column remains.
  df <- test_df_encoded[, !(names(test_df_encoded) %in% extra_cols), drop = FALSE]
}

### Making predictions
Using the model saved during training. Notice that the model outputs a 2D array with many rows and 5 columns. <br>
Each row in the array holds the prediction for one sample in the test data. Each of the 5 numbers in a row is the probability of one of the 5 classes in the original dataset.

In [8]:
# Choose the `type` argument passed to predict(): "response" for binary
# classification, "probs" otherwise. The condition is a scalar, so a plain
# if/else is used rather than the vectorised ifelse().
if (model_category == "binary_classification") {
  type <- "response"
} else {
  type <- "probs"
}

# Making predictions
model <- readRDS(PREDICTOR_FILE_PATH)
predictions <- predict(model, newdata = df, type = type)
head(predictions)

Unnamed: 0,0,1,2,3,4
1,0.246849623,0.06648546,0.22214313,0.16233296,0.3021888
2,0.5021940301,0.1492466,0.13519934,0.05779201,0.155568
3,0.0004435492,7.027803e-05,0.2164756,0.03380179,0.7492088
4,0.0193742233,0.001983382,0.06483584,0.02787079,0.8859358
5,0.0948142112,0.02619522,0.31862091,0.08144627,0.4789234
6,0.002909014,0.0009626001,0.02829774,0.47843613,0.4893945


### Getting the original labels and creating the Dataframe.

In [9]:
# Recover the original class labels: index the saved label encoder with the
# (zero-based) encoded training target, then keep the sorted distinct labels.
encoder <- readRDS(LABEL_ENCODER_FILE)
target <- readRDS(ENCODED_TARGET_FILE)
class_names <- encoder[target + 1]
unique_classes <- sort(unique(class_names))

# Binary models return a single probability vector, so the complementary
# column is derived from it; otherwise the prediction matrix is used as is.
predictions_df <- if (model_category == "binary_classification") {
  data.frame(Prediction2 = 1 - predictions, Prediction1 = predictions)
} else {
  predictions
}

# NOTE(review): labelling assumes the prediction columns are ordered like the
# sorted class labels -- confirm against the training notebook.
colnames(predictions_df) <- unique_classes

# Put the ids in front and name that column after the schema's id field.
predictions_df <- bind_cols(tibble(ids = ids), predictions_df)
names(predictions_df)[1] <- id_feature

write.csv(predictions_df, file = PREDICTIONS_FILE, row.names = FALSE)
head(predictions_df)


Id,D,H,P,R,S
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1197,0.246849623,0.06648546,0.22214313,0.16233296,0.3021888
1745,0.5021940301,0.1492466,0.13519934,0.05779201,0.155568
7459,0.0004435492,7.027803e-05,0.2164756,0.03380179,0.7492088
2263,0.0193742233,0.001983382,0.06483584,0.02787079,0.8859358
5940,0.0948142112,0.02619522,0.31862091,0.08144627,0.4789234
905,0.002909014,0.0009626001,0.02829774,0.47843613,0.4893945
