## Imports

In [24]:
# DO NOT CHANGE THESE LINES.
import os
import pandas as pd
import json
import warnings
from joblib import load
warnings.filterwarnings('ignore')

## Paths

In [25]:
# DO NOT CHANGE THESE LINES.
# All locations are derived from the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
# Inputs: schema directory plus training/testing data directories.
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
OUTPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "outputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
# Model artifacts (joblib files) live under model/artifacts.
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
# Destination of the predictions CSV.
PREDICTIONS_DIR = os.path.join(OUTPUT_DIR, 'predictions')
PREDICTIONS_FILE = os.path.join(PREDICTIONS_DIR, 'predictions.csv')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')

# Create the predictions directory (and any missing parents) if it is absent.
if not os.path.exists(PREDICTIONS_DIR):
    os.makedirs(PREDICTIONS_DIR)

### Reading the schema

In [26]:
# Locate the schema file. os.listdir returns entries in arbitrary order, so the
# candidates are sorted to make the choice deterministic when more than one
# .json file is present, and a missing schema is reported explicitly instead of
# surfacing as a bare IndexError.
schema_files = sorted(
    entry for entry in os.listdir(INPUT_SCHEMA_DIR) if entry.endswith('.json')
)
if not schema_files:
    raise FileNotFoundError(f"No .json schema file found in {INPUT_SCHEMA_DIR}")
file_name = schema_files[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    schema = json.load(file)
features = schema['features']

# Split the feature names by declared data type: anything that is not
# CATEGORICAL is treated as numeric. Nullable features are tracked separately
# (a feature can be in nullable_features and in one of the other two lists).
categorical_features = [
    feature['name'] for feature in features if feature['dataType'] == 'CATEGORICAL'
]
numeric_features = [
    feature['name'] for feature in features if feature['dataType'] != 'CATEGORICAL'
]
nullable_features = [feature['name'] for feature in features if feature['nullable']]

# Names of the id and target columns as declared in the schema.
id_feature = schema['id']['name']
target_feature = schema['target']['name']

### Reading test data.

In [27]:
# Locate the test CSV. os.listdir returns entries in arbitrary order, so the
# candidates are sorted to make the choice deterministic when more than one
# .csv file is present, and an empty directory is reported explicitly instead
# of surfacing as a bare IndexError.
test_files = sorted(
    entry for entry in os.listdir(TEST_DIR) if entry.endswith('.csv')
)
if not test_files:
    raise FileNotFoundError(f"No .csv test file found in {TEST_DIR}")
file_name = test_files[0]
file_path = os.path.join(TEST_DIR, file_name)
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,unit_id,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange,admission_rate,agege24,average_cost_academic_year,average_cost_program_year,carnegie_basic_classification,...,religious_affiliation,sat_math_midrange,sat_total_average,sat_verbal_midrange,sat_writing_midrange,spend_per_student,state,tuition_(instate),tuition_(out_of_state),undergrad_size
0,422695,,,,,,0.71,,21205.0,Associate\s--Private For-profit',...,,,,,,6669.0,CA,,,277.0
1,363712,,,,,,0.0,17300.0,,Special Focus Institutions--Theological semina...,...,Jewish,,,,,8516.0,FL,8400.0,8400.0,51.0
2,119331,,,,,,0.47,15178.0,,Associate\s--Public Urban-serving Multicampus',...,,,,,,5065.0,CA,1142.0,5942.0,5707.0
3,476142,,,,,,,,24908.0,,...,,,,,,7178.0,TX,,,29.0
4,194259,,,,,,0.64,,21500.0,,...,,,,,,5155.0,NY,,,37.0


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [28]:
# Fill missing values in every nullable feature with the per-column value
# loaded from the imputation artifact.
imputation_values = load(IMPUTATION_FILE)
for column in nullable_features:
    # Assign the filled column back rather than calling
    # df[column].fillna(..., inplace=True): that form mutates an intermediate
    # Series, is deprecated by pandas, and leaves df untouched under
    # Copy-on-Write. Warnings are silenced in this notebook, so the failure
    # would otherwise go unnoticed.
    df[column] = df[column].fillna(imputation_values[column])


In [29]:
# Preview the first rows of df to inspect the current state of the data.
df.head()

Unnamed: 0,unit_id,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange,admission_rate,agege24,average_cost_academic_year,average_cost_program_year,carnegie_basic_classification,...,religious_affiliation,sat_math_midrange,sat_total_average,sat_verbal_midrange,sat_writing_midrange,spend_per_student,state,tuition_(instate),tuition_(out_of_state),undergrad_size
0,422695,23.0,22.0,22.0,7.0,0.6976,0.71,22933.0,21205.0,Associate\s--Private For-profit',...,Roman Catholic,520.0,1035.5,514.0,510.0,6669.0,CA,11736.0,14498.0,277.0
1,363712,23.0,22.0,22.0,7.0,0.6976,0.0,17300.0,17828.5,Special Focus Institutions--Theological semina...,...,Jewish,520.0,1035.5,514.0,510.0,8516.0,FL,8400.0,8400.0,51.0
2,119331,23.0,22.0,22.0,7.0,0.6976,0.47,15178.0,17828.5,Associate\s--Public Urban-serving Multicampus',...,Roman Catholic,520.0,1035.5,514.0,510.0,5065.0,CA,1142.0,5942.0,5707.0
3,476142,23.0,22.0,22.0,7.0,0.6976,0.46,22933.0,24908.0,Associate\s--Private For-profit',...,Roman Catholic,520.0,1035.5,514.0,510.0,7178.0,TX,11736.0,14498.0,29.0
4,194259,23.0,22.0,22.0,7.0,0.6976,0.64,22933.0,21500.0,Associate\s--Private For-profit',...,Roman Catholic,520.0,1035.5,514.0,510.0,5155.0,NY,11736.0,14498.0,37.0


### Encoding
We encode the data using the same encoder that we saved during training.

In [30]:
# Keep the id column aside so it can be attached to the predictions later,
# then continue with a frame that no longer contains it.
ids = df[id_feature]
df = df.drop(columns=[id_feature])

# Apply the saved encoder only when an encoder artifact exists on disk;
# otherwise the features are passed on unchanged.
if os.path.exists(OHE_ENCODER_FILE):
    encoder = load(OHE_ENCODER_FILE)
    df = encoder.transform(df)


### Making predictions
Using the model saved during training. Notice that the model outputs a 1D array with one value per row of the test data.<br>
Each value in the array is the model's prediction for the corresponding sample in the test data.

In [31]:
# Load the predictor artifact from disk.
model = load(PREDICTOR_FILE_PATH)
# Score the preprocessed test features held in df.
predictions = model.predict(df)

# Display the raw predictions returned by the model.
predictions

array([0.52659527, 0.52489189, 0.3330418 , ..., 0.53994467, 0.58948223,
       0.59384768])

### Creating predictions DataFrame.

In [32]:
# Build the output frame in one step: one row per test sample, holding the id
# saved before encoding and the model's prediction for that sample.
predictions_df = pd.DataFrame({id_feature: ids, 'prediction': predictions})
# index=False keeps the RangeIndex out of the file; without it the CSV gains an
# extra unnamed first column in addition to the id and prediction columns.
predictions_df.to_csv(PREDICTIONS_FILE, index=False)
predictions_df.head()


Unnamed: 0,unit_id,prediction
0,422695,0.526595
1,363712,0.524892
2,119331,0.333042
3,476142,0.586913
4,194259,0.461986
