## Imports

In [1]:
# DO NOT CHANGE THESE LINES.
suppressWarnings(
  suppressMessages({
    library(jsonlite)
    library(dplyr)
    library(tidyr)
    library(caret)
    library(readr)
    library(data.table)
    library(fastDummies)
    library(nnet)
  })
)

## Paths

In [2]:
# DO NOT CHANGE THESE LINES.
ROOT_DIR <- dirname(getwd())
MODEL_INPUTS_OUTPUTS <- file.path(ROOT_DIR, 'model_inputs_outputs')
INPUT_DIR <- file.path(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR <- file.path(INPUT_DIR, "schema")
DATA_DIR <- file.path(INPUT_DIR, "data")
TRAIN_DIR <- file.path(DATA_DIR, "training")
MODEL_ARTIFACTS_PATH <- file.path(MODEL_INPUTS_OUTPUTS, "model", "artifacts")
OHE_ENCODER_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'ohe.rds')
PREDICTOR_FILE_PATH <- file.path(MODEL_ARTIFACTS_PATH, "predictor", "predictor.rds")
IMPUTATION_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'imputation.rds')
LABEL_ENCODER_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'label_encoder.rds')
ENCODED_TARGET_FILE <- file.path(MODEL_ARTIFACTS_PATH, "encoded_target.rds")
TOP_3_CATEGORIES_MAP <- file.path(MODEL_ARTIFACTS_PATH, "top_3_map.rds")

if (!dir.exists(MODEL_ARTIFACTS_PATH)) {
    dir.create(MODEL_ARTIFACTS_PATH, recursive = TRUE)
}
if (!dir.exists(file.path(MODEL_ARTIFACTS_PATH, "predictor"))) {
    dir.create(file.path(MODEL_ARTIFACTS_PATH, "predictor"))
}

### Reading the schema
The schema contains metadata about the datasets. We will use the scehma to get information about the type of each feature (NUMERIC or CATEGORICAL) and the id and target features, this will be helpful in preprocessing stage.

In [3]:
file_name <- list.files(INPUT_SCHEMA_DIR, pattern = "*.json")[1]
schema <- fromJSON(file.path(INPUT_SCHEMA_DIR, file_name))
features <- schema$features

numeric_features <- features$name[features$dataType == "NUMERIC"]
categorical_features <- features$name[features$dataType == "CATEGORICAL"]
id_feature <- schema$id$name
target_feature <- schema$target$name
model_category <- schema$modelCategory

### Reading training data

In [4]:
# Reading training data
file_name <- list.files(TRAIN_DIR, pattern = "*.csv")[1]
# Read the first line to get column names
header_line <- readLines(file.path(TRAIN_DIR, file_name), n = 1)
col_names <- unlist(strsplit(header_line, split = ",")) # assuming ',' is the delimiter
# Read the CSV with the exact column names
df <- read.csv(file.path(TRAIN_DIR, file_name), skip = 0, col.names = col_names, check.names=FALSE)
head(df)

Unnamed: 0_level_0,Id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X24,X25,X26,X27,X28,X29,X30,X31,X32,Phase
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1776,-0.00097329,0.00162618,-7.159e-05,-0.00042025,0.00306038,-0.00028111,0.00046286,0.00156423,-0.00029951,...,-4.82e-06,0.00189654,0.00310186,0.00165854,0.00301216,0.0003166,0.00245034,0.00030301,0.00013135,P
2,5522,0.00612409,-0.03046733,0.00240245,0.00416802,-0.00092154,-0.00027998,0.0036581,-0.02180706,0.00182192,...,-1.38e-06,0.03116945,0.00427785,0.02218668,0.00244711,0.00097349,0.00010083,0.00132872,4.721e-05,P
3,7047,0.01072138,0.00584966,0.031741,0.01925152,0.01778552,0.06017001,0.0101969,0.0029227,0.0109322,...,0.00326259,0.03400967,0.0656306,0.01523259,0.03947077,0.00202505,0.00429311,0.00082388,0.00366965,S
4,7152,0.00619969,0.01544912,-0.00824005,-0.00014001,-0.00109091,-0.00179021,0.00424658,0.00812888,-0.00615179,...,-0.00189031,0.01857444,0.00210108,0.0110434,0.00137522,0.00260543,0.00181556,0.00053719,0.00198435,S
5,2480,0.01239417,0.00511193,0.00054728,0.01487226,0.00439047,-0.0005211,0.00124945,0.00393094,5.018e-05,...,0.00016431,0.01341815,0.01551554,0.00412504,0.00946664,0.00204508,0.00106448,0.00090394,0.00074077,S
6,2980,0.00018631,0.00043444,1.06e-06,-0.00019567,0.00086708,1.689e-05,-0.00104761,-0.00301388,-0.0004044,...,-4.11e-06,0.00047271,0.00088904,0.00321629,0.00017729,5.416e-05,7.364e-05,0.00291751,3.49e-05,H


## Data Preprocessing
Data preprocessing is very important before training the model, as the data may contain missing values in some cells. Moreover, most of the learning algorithms cannot work with categorical data, thus the data has to be encoded.

In this section we will impute the missing values and encode the categorical features. Afterwards the data will be ready to train the model.

##### Imputing missing data
> The median value will be used to impute missing values of the numeric features and the mode will be used to impute categorical features.

##### You can add your own preprocessing steps such as:
<ul>
<li>Normalization</li> <br>
<li>Outlier removal</li><br>
<li>Handling imbalanced classes</li><br>
<li>Dropping or adding features</li><br>
</ul>

### Important note:
<p> 
Saving the values used for imputation during training step is crucial. These values will be used to impute missing data in the testing set. This is very important to avoid the well known problem of data leakage. During testing, you should not make any assumptions about the data in hand, alternatively anything needed during the testing phase should be learned from the training phase. This is why we are creating a dictionary of values used during training to reuse these values during testing.
</p>


In [5]:
imputation_values <- list()

columns_with_missing_values <- colnames(df)[apply(df, 2, anyNA)]
for (column in columns_with_missing_values) {
    if (column %in% numeric_features) {
        value <- median(df[, column], na.rm = TRUE)
    } else {
        value <- as.character(df[, column] %>% tidyr::replace_na())
        value <- value[1]
    }
    df[, column][is.na(df[, column])] <- value
    imputation_values[column] <- value
}
saveRDS(imputation_values, IMPUTATION_FILE)
head(df)

Unnamed: 0_level_0,Id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X24,X25,X26,X27,X28,X29,X30,X31,X32,Phase
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1776,-0.00097329,0.00162618,-7.159e-05,-0.00042025,0.00306038,-0.00028111,0.00046286,0.00156423,-0.00029951,...,-4.82e-06,0.00189654,0.00310186,0.00165854,0.00301216,0.0003166,0.00245034,0.00030301,0.00013135,P
2,5522,0.00612409,-0.03046733,0.00240245,0.00416802,-0.00092154,-0.00027998,0.0036581,-0.02180706,0.00182192,...,-1.38e-06,0.03116945,0.00427785,0.02218668,0.00244711,0.00097349,0.00010083,0.00132872,4.721e-05,P
3,7047,0.01072138,0.00584966,0.031741,0.01925152,0.01778552,0.06017001,0.0101969,0.0029227,0.0109322,...,0.00326259,0.03400967,0.0656306,0.01523259,0.03947077,0.00202505,0.00429311,0.00082388,0.00366965,S
4,7152,0.00619969,0.01544912,-0.00824005,-0.00014001,-0.00109091,-0.00179021,0.00424658,0.00812888,-0.00615179,...,-0.00189031,0.01857444,0.00210108,0.0110434,0.00137522,0.00260543,0.00181556,0.00053719,0.00198435,S
5,2480,0.01239417,0.00511193,0.00054728,0.01487226,0.00439047,-0.0005211,0.00124945,0.00393094,5.018e-05,...,0.00016431,0.01341815,0.01551554,0.00412504,0.00946664,0.00204508,0.00106448,0.00090394,0.00074077,S
6,2980,0.00018631,0.00043444,1.06e-06,-0.00019567,0.00086708,1.689e-05,-0.00104761,-0.00301388,-0.0004044,...,-4.11e-06,0.00047271,0.00088904,0.00321629,0.00017729,5.416e-05,7.364e-05,0.00291751,3.49e-05,H


##### Encoding Categorical features
<p>
The id column is just an identifier for the training example, so we will exclude it during the encoding phase.<br>
Target feature will be label encoded in the next step.
</p>


In [6]:
# Encoding Categorical features
ids <- df[, id_feature]
target <- df[, target_feature]
df <- df %>% select(-all_of(c(id_feature, target_feature)))


# One Hot Encoding
if(length(categorical_features) > 0){
    top_3_map <- list()
    for(col in categorical_features) {
        # Get the top 3 categories for the column
        top_3_categories <- names(sort(table(df[[col]]), decreasing = TRUE)[1:3])

        # Save the top 3 categories for this column
        top_3_map[[col]] <- top_3_categories
        # Replace categories outside the top 3 with "Other"
        df[[col]][!(df[[col]] %in% top_3_categories)] <- "Other"
    }

    df_encoded <- dummy_cols(df, select_columns = categorical_features, remove_selected_columns = TRUE)
    encoded_columns <- setdiff(colnames(df_encoded), colnames(df))
    saveRDS(encoded_columns, OHE_ENCODER_FILE)
    saveRDS(top_3_map, TOP_3_CATEGORIES_MAP)
    df <- df_encoded
}
head(df)


Unnamed: 0_level_0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,-0.00097329,0.00162618,-7.159e-05,-0.00042025,0.00306038,-0.00028111,0.00046286,0.00156423,-0.00029951,-0.00037538,...,0.00012813,-4.82e-06,0.00189654,0.00310186,0.00165854,0.00301216,0.0003166,0.00245034,0.00030301,0.00013135
2,0.00612409,-0.03046733,0.00240245,0.00416802,-0.00092154,-0.00027998,0.0036581,-0.02180706,0.00182192,0.00235699,...,3.784e-05,-1.38e-06,0.03116945,0.00427785,0.02218668,0.00244711,0.00097349,0.00010083,0.00132872,4.721e-05
3,0.01072138,0.00584966,0.031741,0.01925152,0.01778552,0.06017001,0.0101969,0.0029227,0.0109322,0.00317718,...,0.00161313,0.00326259,0.03400967,0.0656306,0.01523259,0.03947077,0.00202505,0.00429311,0.00082388,0.00366965
4,0.00619969,0.01544912,-0.00824005,-0.00014001,-0.00109091,-0.00179021,0.00424658,0.00812888,-0.00615179,0.0004056,...,0.00019363,-0.00189031,0.01857444,0.00210108,0.0110434,0.00137522,0.00260543,0.00181556,0.00053719,0.00198435
5,0.01239417,0.00511193,0.00054728,0.01487226,0.00439047,-0.0005211,0.00124945,0.00393094,5.018e-05,0.00916923,...,-0.0005105,0.00016431,0.01341815,0.01551554,0.00412504,0.00946664,0.00204508,0.00106448,0.00090394,0.00074077
6,0.00018631,0.00043444,1.06e-06,-0.00019567,0.00086708,1.689e-05,-0.00104761,-0.00301388,-0.0004044,0.0001631,...,4.41e-06,-4.11e-06,0.00047271,0.00088904,0.00321629,0.00017729,5.416e-05,7.364e-05,0.00291751,3.49e-05


#### Encoding the target feature

In [7]:
# Encoding the target feature
levels_target <- levels(factor(target))
encoded_target <- as.integer(factor(target, levels = levels_target)) - 1
saveRDS(levels_target, LABEL_ENCODER_FILE)
saveRDS(encoded_target, ENCODED_TARGET_FILE)

### Training the Classifier
We choose Logistic Regression Classifier, but feel free to try your own and compare the results.

In [8]:
# Train the Classifier
if (model_category == 'binary_classification'){
    model <- glm(encoded_target ~ ., family = binomial(link = "logit"), data = df)

} else if (model_category == "multiclass_classification") {
   model <- multinom(encoded_target ~ ., data = df, MaxNWts = 10000)
}
saveRDS(model, PREDICTOR_FILE_PATH)


# weights:  170 (132 variable)
initial  value 12709.731194 
iter  10 value 10385.186059
iter  20 value 10119.446171
iter  30 value 10074.539703
iter  40 value 10043.855719
iter  50 value 10021.855944
iter  60 value 10010.549433
iter  70 value 10001.560270
iter  80 value 9995.803108
iter  90 value 9991.641535
iter 100 value 9987.710916
final  value 9987.710916 
stopped after 100 iterations
