## Imports

In [1]:
# DO NOT CHANGE THESE LINES.
import os
import pandas as pd
import json
import warnings
from joblib import load
warnings.filterwarnings('ignore')

## Paths

In [2]:
# DO NOT CHANGE THESE LINES.
# Every location below is derived from the parent of the current working
# directory, so the notebook must be started from a folder one level below
# the project root.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
# Inputs: the schema JSON and the training/testing data folders.
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
OUTPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "outputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
# Model artifacts that this notebook loads with joblib: the optional one-hot
# encoder, the predictor, the imputation values and the label encoder.
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
# Destination of the predictions CSV written at the end of the notebook.
PREDICTIONS_DIR = os.path.join(OUTPUT_DIR, 'predictions')
PREDICTIONS_FILE = os.path.join(PREDICTIONS_DIR, 'predictions.csv')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')

# Create the predictions folder if it is missing so the final CSV write has
# a directory to land in.
if not os.path.exists(PREDICTIONS_DIR):
    os.makedirs(PREDICTIONS_DIR)

### Reading the schema

In [3]:
# Locate the schema: use the first *.json entry listed in the schema folder.
json_entries = [entry for entry in os.listdir(INPUT_SCHEMA_DIR) if entry.endswith('.json')]
file_name = json_entries[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as schema_file:
    schema = json.load(schema_file)
features = schema['features']

# Split feature names by declared type: anything that is not CATEGORICAL is
# treated as numeric.
numeric_features = []
categorical_features = []
for feature in features:
    is_categorical = feature['dataType'] == 'CATEGORICAL'
    bucket = categorical_features if is_categorical else numeric_features
    bucket.append(feature['name'])

# Names of the id and target columns, plus the list of target classes.
id_feature = schema['id']['name']
target_spec = schema['target']
target_feature = target_spec['name']
target_classes = target_spec['classes']

### Reading test data.

In [4]:
# Load the test set: use the first *.csv entry listed in the testing folder.
csv_entries = [entry for entry in os.listdir(TEST_DIR) if entry.endswith('.csv')]
file_name = csv_entries[0]
file_path = os.path.join(TEST_DIR, file_name)
df = pd.read_csv(file_path)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32
0,8961,-0.022152,0.007143,-0.001517,0.012274,-0.008348,0.008369,-0.014637,0.001891,-0.003228,...,0.00164,0.000365,0.023325,0.01704,0.015107,0.012112,0.005601,0.004108,0.004532,0.001828
1,1197,0.000284,-0.000163,-4.5e-05,0.014523,-0.004332,-0.001323,-2.6e-05,-0.000616,0.000106,...,-0.000183,-0.00014,0.000331,0.015213,0.000626,0.008909,0.000158,0.002114,0.000203,0.002186
2,1745,-6.2e-05,0.000436,0.000166,0.001393,-0.000442,0.000208,0.000978,0.000517,-7e-06,...,0.000609,0.000129,0.000471,0.001476,0.001106,0.002402,0.000538,0.001615,0.000867,0.000622
3,7459,0.008864,-0.029394,0.002648,0.015714,0.031807,-0.000734,0.008442,0.013582,-0.001185,...,0.009677,0.000356,0.030816,0.035484,0.016036,0.029223,0.013108,0.012073,0.016519,0.010594
4,2263,0.021455,0.01744,0.003795,0.013395,0.000765,-0.00077,0.021311,0.013769,0.004288,...,0.003217,-0.002962,0.027908,0.013439,0.025732,0.010531,0.022431,0.008621,0.017229,0.004451


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [5]:
# Columns of the test frame that contain at least one missing value.
columns_with_missing_values = df.columns[df.isna().any()]
# Fill values learned during training, looked up by column name.
imputation_values = load(IMPUTATION_FILE)
for column in columns_with_missing_values:
    # Assign the filled column back to the frame. Calling
    # `df[column].fillna(..., inplace=True)` is chained in-place assignment:
    # with pandas Copy-on-Write it modifies a temporary object and leaves `df`
    # unchanged, so the missing values would silently remain.
    df[column] = df[column].fillna(imputation_values[column])


In [6]:
# Display the first rows of the frame after the imputation step.
df.head()

Unnamed: 0,Id,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32
0,8961,-0.022152,0.007143,-0.001517,0.012274,-0.008348,0.008369,-0.014637,0.001891,-0.003228,...,0.00164,0.000365,0.023325,0.01704,0.015107,0.012112,0.005601,0.004108,0.004532,0.001828
1,1197,0.000284,-0.000163,-4.5e-05,0.014523,-0.004332,-0.001323,-2.6e-05,-0.000616,0.000106,...,-0.000183,-0.00014,0.000331,0.015213,0.000626,0.008909,0.000158,0.002114,0.000203,0.002186
2,1745,-6.2e-05,0.000436,0.000166,0.001393,-0.000442,0.000208,0.000978,0.000517,-7e-06,...,0.000609,0.000129,0.000471,0.001476,0.001106,0.002402,0.000538,0.001615,0.000867,0.000622
3,7459,0.008864,-0.029394,0.002648,0.015714,0.031807,-0.000734,0.008442,0.013582,-0.001185,...,0.009677,0.000356,0.030816,0.035484,0.016036,0.029223,0.013108,0.012073,0.016519,0.010594
4,2263,0.021455,0.01744,0.003795,0.013395,0.000765,-0.00077,0.021311,0.013769,0.004288,...,0.003217,-0.002962,0.027908,0.013439,0.025732,0.010531,0.022431,0.008621,0.017229,0.004451


### Encoding
We encode the data using the same encoder that we saved during training.

In [7]:
# Take the id column out of the frame and keep it aside; it is re-attached to
# the predictions table later and must not reach the encoder or the model.
ids = df.pop(id_feature)

# The one-hot encoder artifact is optional: apply it only when the file exists.
if os.path.exists(OHE_ENCODER_FILE):
    encoder = load(OHE_ENCODER_FILE)
    df = encoder.transform(df)


### Making predictions
Using the model saved during training. Notice that the model outputs a 2D array with many rows and 5 columns.<br>
Each row holds the answer for one sample in the test data, and each of the 5 numbers in a row is the predicted probability of one of the 5 classes in the original dataset.

In [8]:
# Load the predictor that was saved under the artifacts directory.
model = load(PREDICTOR_FILE_PATH)
# Class-probability matrix: one row per test sample, one column per class.
predictions = model.predict_proba(df)

# Show the probability array as the cell output.
predictions

array([[0.21285967, 0.09296525, 0.21117845, 0.12846568, 0.35453096],
       [0.28795724, 0.10484084, 0.20251848, 0.11720958, 0.28747386],
       [0.33851028, 0.11038317, 0.2016238 , 0.10010355, 0.24937919],
       ...,
       [0.32594845, 0.1097305 , 0.19493983, 0.10886509, 0.26051613],
       [0.34231458, 0.11075176, 0.19777342, 0.10132491, 0.24783532],
       [0.20526114, 0.09295789, 0.19622232, 0.13958613, 0.36597252]])

### Getting the original labels.

In [9]:
# Map the encoded class indices back to the original label names.
# NOTE(review): assumes the label encoder maps the classes to 0..n-1 and that
# the predict_proba columns follow that same order — confirm against training.
encoder = load(LABEL_ENCODER_FILE)
class_names = encoder.inverse_transform(range(len(target_classes)))

# Label the probability columns and put the id column first.
predictions = pd.DataFrame(predictions, columns=class_names)
predictions.insert(0, id_feature, ids)

# index=False keeps the DataFrame's row index out of the file; without it the
# CSV gets an extra unnamed leading column next to the id and class columns.
predictions.to_csv(PREDICTIONS_FILE, index=False)
predictions.head()


Unnamed: 0,Id,D,H,P,R,S
0,8961,0.21286,0.092965,0.211178,0.128466,0.354531
1,1197,0.287957,0.104841,0.202518,0.11721,0.287474
2,1745,0.33851,0.110383,0.201624,0.100104,0.249379
3,7459,0.121595,0.067954,0.279215,0.099889,0.431347
4,2263,0.186435,0.087492,0.253446,0.108488,0.36414
