## Imports

In [1]:
# DO NOT CHANGE THESE LINES.
suppressWarnings(
  suppressMessages({
    library(jsonlite)
    library(dplyr)
    library(tidyr)
    library(caret)
    library(readr)
    library(data.table)
    library(fastDummies)
    library(nnet)
  })
)

## Paths

In [2]:
# DO NOT CHANGE THESE LINES.
# All paths are derived from the parent of the current working directory.
ROOT_DIR <- dirname(getwd())
MODEL_INPUTS_OUTPUTS <- file.path(ROOT_DIR, 'model_inputs_outputs')
# Input locations: schema JSON and training CSV.
INPUT_DIR <- file.path(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR <- file.path(INPUT_DIR, "schema")
DATA_DIR <- file.path(INPUT_DIR, "data")
TRAIN_DIR <- file.path(DATA_DIR, "training")
# Output locations: everything learned during training is written under the
# artifacts directory as .rds files.
MODEL_ARTIFACTS_PATH <- file.path(MODEL_INPUTS_OUTPUTS, "model", "artifacts")
# Names of the one-hot encoded columns produced during training.
OHE_ENCODER_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'ohe.rds')
# Fitted model object.
PREDICTOR_FILE_PATH <- file.path(MODEL_ARTIFACTS_PATH, "predictor", "predictor.rds")
# Per-column values used to fill missing data.
IMPUTATION_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'imputation.rds')
# NOTE(review): the next two paths are not written in the visible part of this
# notebook; presumably kept for parity with other templates -- confirm.
LABEL_ENCODER_FILE <- file.path(MODEL_ARTIFACTS_PATH, 'label_encoder.rds')
ENCODED_TARGET_FILE <- file.path(MODEL_ARTIFACTS_PATH, "encoded_target.rds")
# Most frequent categories kept per categorical column before encoding.
TOP_3_CATEGORIES_MAP <- file.path(MODEL_ARTIFACTS_PATH, "top_3_map.rds")

# Create the artifacts directory (and any missing parents) on first run.
if (!dir.exists(MODEL_ARTIFACTS_PATH)) {
    dir.create(MODEL_ARTIFACTS_PATH, recursive = TRUE)
}
# The predictor sub-directory holds the serialized model; its parent is
# guaranteed to exist at this point, so a non-recursive create is enough.
if (!dir.exists(file.path(MODEL_ARTIFACTS_PATH, "predictor"))) {
    dir.create(file.path(MODEL_ARTIFACTS_PATH, "predictor"))
}

### Reading the schema
The schema contains metadata about the dataset. We will use the schema to get the type of each feature (NUMERIC or CATEGORICAL) as well as the names of the id and target columns. This information is needed in the preprocessing stage.

In [3]:
# Locate the schema file. `pattern` is a regular expression (not a shell glob),
# so anchor on the ".json" extension; "*.json" would also match names such as
# "myjson.txt" or "schema.json.bak".
schema_files <- list.files(INPUT_SCHEMA_DIR, pattern = "\\.json$")
if (length(schema_files) == 0) {
    stop("No JSON schema file found in ", INPUT_SCHEMA_DIR, call. = FALSE)
}
# Only the first schema file is used.
file_name <- schema_files[1]
schema <- fromJSON(file.path(INPUT_SCHEMA_DIR, file_name))
features <- schema$features

# Split feature names by declared data type.
numeric_features <- features$name[features$dataType == "NUMERIC"]
categorical_features <- features$name[features$dataType == "CATEGORICAL"]
id_feature <- schema$id$name
target_feature <- schema$target$name
model_category <- schema$modelCategory
# `%in% TRUE` never yields NA, so a feature with a missing `nullable` flag is
# treated as non-nullable instead of injecting an NA name into the vector.
nullable_features <- features$name[features$nullable %in% TRUE]


### Reading training data

In [4]:
# Reading training data
# `pattern` is a regular expression (not a shell glob), so anchor on ".csv".
train_files <- list.files(TRAIN_DIR, pattern = "\\.csv$")
if (length(train_files) == 0) {
    stop("No CSV training file found in ", TRAIN_DIR, call. = FALSE)
}
# Only the first training file is used.
file_name <- train_files[1]
# Let read.csv parse the header itself: it honours quoting, so header fields
# that contain commas or are wrapped in quotes are handled correctly, which a
# manual strsplit on "," is not. check.names = FALSE keeps the column names
# exactly as written in the file (e.g. "tuition_(instate)").
df <- read.csv(file.path(TRAIN_DIR, file_name), check.names = FALSE)
# Kept for backward compatibility with code that reads `col_names`.
col_names <- colnames(df)
head(df)

Unnamed: 0_level_0,unit_id,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange,admission_rate,agege24,average_cost_academic_year,average_cost_program_year,carnegie_basic_classification,⋯,sat_math_midrange,sat_total_average,sat_verbal_midrange,sat_writing_midrange,spend_per_student,state,tuition_(instate),tuition_(out_of_state),undergrad_size,percent_pell_grant
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,123341,,,,,,0.48,13157.0,,Associate\s--Public Suburban-serving Multicampus',⋯,,,,,3674,CA,1150.0,6190.0,17195,0.3146
2,172495,,,,,,0.46,,17013.0,,⋯,,,,,2015,MI,,,111,0.652
3,419457,,,,,,0.62,23317.0,,Special Focus Institutions--Other health professions schools,⋯,,,,,6234,VA,13663.0,13663.0,126,0.5495
4,455725,,,,,,,,15564.0,,⋯,,,,,3366,AZ,,,363,0.8227
5,107831,,,,,,0.54,,18967.0,,⋯,,,,,971,AR,,,104,0.4621
6,210304,,,,,0.5274,0.53,30830.0,,Baccalaureate Colleges--Diverse Fields,⋯,,,,,10245,OR,19770.0,19770.0,501,0.5336


## Data Preprocessing
Data preprocessing is very important before training the model, as the data may contain missing values in some cells. Moreover, most of the learning algorithms cannot work with categorical data, thus the data has to be encoded.

In this section we will impute the missing values and encode the categorical features. Afterwards the data will be ready to train the model.

##### Imputing missing data
> The median value will be used to impute missing values of the numeric features and the mode will be used to impute categorical features.

##### You can add your own preprocessing steps such as:
<ul>
<li>Normalization</li> <br>
<li>Outlier removal</li><br>
<li>Dropping or adding features</li><br>
</ul>

### Important note:
<p> 
Saving the values used for imputation during the training step is crucial. These values will be used to impute missing data in the testing set, which avoids the well-known problem of data leakage. During testing, you should not make any assumptions about the data at hand; instead, anything needed during the testing phase should be learned during the training phase. This is why we create a dictionary of the values used during training and reuse it during testing.
</p>


In [5]:
# Learn one fill value per nullable column from the training data, apply it to
# the missing cells, and persist the values so the same ones can be reused
# at prediction time.
imputation_values <- list()

for (column in nullable_features) {
    column_values <- df[, column]
    if (column %in% numeric_features) {
        # Numeric features: median of the observed values.
        value <- median(column_values, na.rm = TRUE)
    } else {
        # Non-numeric features: mode (most frequent observed value). table()
        # ignores NA by default; ties resolve to the first level in table order.
        level_counts <- table(as.character(column_values))
        if (length(level_counts) > 0) {
            value <- names(level_counts)[which.max(level_counts)]
        } else {
            # Column has no observed values at all; nothing can be learned.
            value <- NA_character_
        }
    }
    df[, column][is.na(column_values)] <- value
    # `[[<-` stores the scalar under the column name.
    imputation_values[[column]] <- value
}
saveRDS(imputation_values, IMPUTATION_FILE)
head(df)

Unnamed: 0_level_0,unit_id,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange,admission_rate,agege24,average_cost_academic_year,average_cost_program_year,carnegie_basic_classification,⋯,sat_math_midrange,sat_total_average,sat_verbal_midrange,sat_writing_midrange,spend_per_student,state,tuition_(instate),tuition_(out_of_state),undergrad_size,percent_pell_grant
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,123341,23,22,22,7,0.6976,0.48,13157,17831,Associate\s--Public Suburban-serving Multicampus',⋯,520,1035.5,514,510,3674,CA,1150,6190,17195,0.3146
2,172495,23,22,22,7,0.6976,0.46,22933,17013,,⋯,520,1035.5,514,510,2015,MI,11736,14498,111,0.652
3,419457,23,22,22,7,0.6976,0.62,23317,17831,Special Focus Institutions--Other health professions schools,⋯,520,1035.5,514,510,6234,VA,13663,13663,126,0.5495
4,455725,23,22,22,7,0.6976,0.46,22933,15564,,⋯,520,1035.5,514,510,3366,AZ,11736,14498,363,0.8227
5,107831,23,22,22,7,0.6976,0.54,22933,18967,,⋯,520,1035.5,514,510,971,AR,11736,14498,104,0.4621
6,210304,23,22,22,7,0.5274,0.53,30830,17831,Baccalaureate Colleges--Diverse Fields,⋯,520,1035.5,514,510,10245,OR,19770,19770,501,0.5336


##### Encoding Categorical features
<p>
The id column is just an identifier for the training example, so we will exclude it during the encoding phase.<br>
Target feature will be label encoded in the next step.
</p>


In [6]:
# Encoding Categorical features
# Set the id and target columns aside; neither is used as a predictor.
ids <- df[, id_feature]
target <- df[, target_feature]
df <- df %>% select(-all_of(c(id_feature, target_feature)))


# One Hot Encoding
# Each categorical column is reduced to its (up to) three most frequent values
# plus an "Other" bucket before being expanded into dummy columns.
if (length(categorical_features) > 0) {
    top_3_map <- list()
    for (col in categorical_features) {
        # Frequencies of the observed values, most frequent first.
        category_counts <- sort(table(df[[col]]), decreasing = TRUE)
        # head() returns fewer than three entries when the column has fewer
        # than three distinct values, instead of padding the result with NA
        # as `[1:3]` does. as.character() keeps an empty result a
        # zero-length character vector so the list entry is still created.
        top_3_categories <- as.character(names(head(category_counts, 3)))

        # Save the top categories for this column
        top_3_map[[col]] <- top_3_categories
        # Replace categories outside the top 3 with "Other"
        df[[col]][!(df[[col]] %in% top_3_categories)] <- "Other"
    }

    df_encoded <- dummy_cols(
        df,
        select_columns = categorical_features,
        remove_selected_columns = TRUE
    )
    # The dummy columns are exactly those that did not exist before encoding.
    encoded_columns <- setdiff(colnames(df_encoded), colnames(df))
    saveRDS(encoded_columns, OHE_ENCODER_FILE)
    saveRDS(top_3_map, TOP_3_CATEGORIES_MAP)
    df <- df_encoded
}
head(df)


Unnamed: 0_level_0,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange,admission_rate,agege24,average_cost_academic_year,average_cost_program_year,completion_rate,faculty_salary,⋯,region_Other,"region_Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)",religious_affiliation_,religious_affiliation_Other,religious_affiliation_Roman Catholic,religious_affiliation_United Methodist,state_CA,state_NY,state_Other,state_TX
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,23,22,22,7,0.6976,0.48,13157,17831,0.4638,7123,⋯,1,0,1,0,0,0,1,0,0,0
2,23,22,22,7,0.6976,0.46,22933,17013,0.4638,5751,⋯,0,0,1,0,0,0,0,0,1,0
3,23,22,22,7,0.6976,0.62,23317,17831,0.5,6281,⋯,0,1,1,0,0,0,0,0,1,0
4,23,22,22,7,0.6976,0.46,22933,15564,0.4638,5751,⋯,1,0,1,0,0,0,0,0,1,0
5,23,22,22,7,0.6976,0.54,22933,18967,0.4638,5751,⋯,0,1,1,0,0,0,0,0,1,0
6,23,22,22,7,0.5274,0.53,30830,17831,0.3152,6560,⋯,1,0,0,1,0,0,0,0,1,0


### Training the model
We choose Linear Regression, but feel free to try your own and compare the results.

In [7]:
# Train the model
# Fit a linear regression with lm(). `target` was removed from `df` in the
# encoding step and lives as a separate vector, so the response is taken from
# the surrounding environment while `.` expands to every column of `df`.
model <- lm(target ~ ., data = df)
# Persist the fitted model object to the predictor artifact path.
saveRDS(model, PREDICTOR_FILE_PATH)