Pipeline
====

This notebook is a pipeline that runs the steps required to preprocess the data, train your model and make predictions with it. You should not need to modify this file; instead, make changes to `Model.py`, then run this notebook to generate a set of predictions.

In [32]:
import importlib
import shutil
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd

import Model

In [33]:
# Re-import Model.py and build a fresh model instance from it, so that edits to the module are
# picked up without restarting the kernel - re-run this cell after every change to the model!
Model = importlib.reload(Model)
model = Model.Model()

In [34]:
# Metadata and utilities.
# Input files per dataset split: the 10cm scan file and the file listing the rows to train on
# or predict for. Only the training split has a path for its preprocessed output.
data = {
    "train": {
        "scan_10cm": "train_scan.csv",
        "targets": "train_assay.csv",
        "preprocessed": "./data-preprocessed/train_preprocessed.csv",
    },
    "public": {"scan_10cm": "public_scan.csv", "targets": "public_template.csv"},
    "private": {"scan_10cm": "private_scan.csv", "targets": "private_template.csv"},
}

# Column roles used throughout the pipeline.
metadata = {
    # The columns to predict: "Element 1" through "Element 11".
    "targets": [f"Element {number}" for number in range(1, 12)],
    # Columns that are dropped before the model is trained or asked for predictions.
    "forbidden_features": ["from", "HolNum", "to", "ID"],
    # Identifier column that is joined onto the training rows together with the targets.
    "y_identifier": "ID",
}


def gather_10cm_data(scan_10cm, holNum, fromDepth, toDepth):
    """Select the scan rows of one hole that lie inside a depth interval.

    Args:
        scan_10cm: DataFrame with "HolNum", "from" and "to" columns.
        holNum: Value the "HolNum" column must equal.
        fromDepth: Inclusive lower bound for the "from" column.
        toDepth: Inclusive upper bound for the "to" column.

    Returns:
        The rows of ``scan_10cm`` satisfying all three conditions, keeping their original
        index labels. Empty when nothing matches.
    """
    same_hole = scan_10cm["HolNum"] == holNum
    starts_inside = scan_10cm["from"] >= fromDepth
    ends_inside = scan_10cm["to"] <= toDepth

    return scan_10cm[same_hole & starts_inside & ends_inside]

## Create training set

In [35]:
# Create a training set by preprocessing and joining targets.
def create_training_dataset():
    """Build the preprocessed training frame from the training assays and 10cm scans.

    Reads the two training files named in ``data["train"]``. For every assay row, the scan
    rows of the same hole inside the assay's depth interval (a "synthetic core") are passed
    to ``model.preprocess``; the assay's target values and identifier are then attached to
    every row the model returns.

    Returns:
        One DataFrame holding the preprocessed rows of all synthetic cores, with a fresh
        0..n-1 index. An empty DataFrame when no assay row has any matching scan data.
    """
    train_targets = pd.read_csv(data["train"]["targets"])
    train_scan_10 = pd.read_csv(data["train"]["scan_10cm"])

    # Columns copied from each assay row onto the rows of its synthetic core.
    joined_columns = metadata["targets"] + [metadata["y_identifier"]]

    # Collect the per-core frames and concatenate once at the end; concatenating inside the
    # loop would copy every accumulated row again on each iteration.
    per_core_frames = []

    for _, train_target in train_targets.iterrows():
        # Create a range of data points between the depths that an assay was created for, call this a
        # "synthetic core".
        synthetic_core = gather_10cm_data(
            train_scan_10,
            train_target["HolNum"],
            train_target["from"],
            train_target["to"],
        )

        # Do not train when the synthetic cores have no 10cm scan data points.
        if len(synthetic_core) == 0:
            continue

        # Create a preprocessed dataset from the model, the model may return any number of rows from this
        # function or engineer any number of features based on an aggregation of the rows.
        preprocessed_synthetic_core = model.preprocess(synthetic_core.copy())

        # Join the identifier and each of the targets against every row returned from the model
        # preprocessing.
        join_from_train_target = train_target[joined_columns].to_dict()
        per_core_frames.append(
            preprocessed_synthetic_core.assign(**join_from_train_target)
        )

    # pd.concat raises on an empty list, so keep returning an empty frame in that case.
    if not per_core_frames:
        return pd.DataFrame()

    # Single frame representing the preprocessed data for each synthetic core in the dataset.
    return pd.concat(per_core_frames, ignore_index=True)


# Create and dump the training dataset to disk.
training_dataset = create_training_dataset()
# to_csv does not create missing folders, so make sure the output folder exists first.
Path(data["train"]["preprocessed"]).parent.mkdir(parents=True, exist_ok=True)
training_dataset.to_csv(data["train"]["preprocessed"], index=False)

## Train model

In [36]:
# Read the preprocessed training set back from disk.
train_preprocessed = pd.read_csv(data["train"]["preprocessed"])

# Create X and y from the training set and train the model. X is every column that is neither
# a target nor a forbidden feature; listed columns missing from the frame are skipped.
non_feature_columns = metadata["targets"] + metadata["forbidden_features"]
X_train = train_preprocessed.drop(columns=non_feature_columns, errors="ignore")
y_train = train_preprocessed[metadata["targets"]]
model.train(X_train, y_train)

# Replace any previously saved model folder with the newly trained model.
shutil.rmtree("./model", ignore_errors=True)
mlflow.pyfunc.save_model(path="model", python_model=model)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7852
[LightGBM] [Info] Number of data points in the train set: 1483, number of used features: 48
[LightGBM] [Info] Start training from score 0.139764
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7852
[LightGBM] [Info] Number of data points in the train set: 1483, number of used features: 48
[LightGBM] [Info] Start training from score 2185.925152
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7852
[LightGBM] [Info] Number of data points in the train set: 1483, number of used features: 48
[LightGBM] [Info] Start tra

## Generate predictions

In [37]:
# Load the dumped model back off disk. The loaded pyfunc wrapper is used below both for
# `predict` and, through `unwrap_python_model()`, to call the model's own preprocess and
# postprocess steps.
loaded_model = mlflow.pyfunc.load_model("./model")


def create_predictions(data_package):
    """Create one core-level prediction row for every template row that has scan data.

    Args:
        data_package: Mapping with a "targets" entry (path of the CSV listing the rows to
            predict, with "HolNum", "from" and "to" columns) and a "scan_10cm" entry (path
            of the CSV holding the scan rows).

    Returns:
        A DataFrame of the template rows with the ``metadata["targets"]`` entries filled in
        from the post-processed predictions, with a fresh 0..n-1 index. Template rows
        without any matching scan data are left out; the frame is empty when no row could
        be predicted.
    """
    targets = pd.read_csv(data_package["targets"])
    scan_10 = pd.read_csv(data_package["scan_10cm"])

    # The unwrapped model is the same for every row, so fetch it once.
    python_model = loaded_model.unwrap_python_model()

    # Collect the single-row frames and concatenate once at the end; concatenating inside
    # the loop would copy every accumulated row again on each iteration.
    prediction_rows = []

    for _, target in targets.iterrows():
        synthetic_core = gather_10cm_data(
            scan_10, target["HolNum"], target["from"], target["to"]
        )

        # Do not create predictions where there is no synthetic core.
        if len(synthetic_core) == 0:
            continue

        # Create predictions for each row. The model gets its own copy of the core, exactly
        # as in the training path, so preprocessing never works on a slice of the scan frame.
        preprocessed_synthetic_core = python_model.preprocess(synthetic_core.copy())
        predictions = loaded_model.predict(
            preprocessed_synthetic_core.drop(
                columns=metadata["forbidden_features"], errors="ignore"
            )
        )

        # Use post-process to squash predictions down into a single core-level prediction.
        postprocessed_core_level_prediction = python_model.postprocess(
            predictions, preprocessed_synthetic_core
        )

        # Fill the predictions into a copy of the row instead of the object handed out by
        # iterrows, then keep it as a one-row frame.
        prediction_row = target.copy()
        prediction_row[metadata["targets"]] = postprocessed_core_level_prediction
        prediction_rows.append(prediction_row.to_frame().T)

    # pd.concat raises on an empty list, so keep returning an empty frame in that case.
    if not prediction_rows:
        return pd.DataFrame()

    return pd.concat(prediction_rows, ignore_index=True)


# Predict the public leaderboard first, then the private one, and write both sets to a
# single submission file.
public_predictions, private_predictions = (
    create_predictions(data[leaderboard]) for leaderboard in ("public", "private")
)

all_predictions = pd.concat(
    [public_predictions, private_predictions], ignore_index=True
)
all_predictions.to_csv("predictions.csv", index=False)



## Score predictions

Now that a set of predictions has been made, you can push your model and predictions to GitLab using the following commands:

```bash
git add .
git commit -m 'New submission'
git push origin main
```