# Inference Code

Download dataset and pretrained models

In [17]:
# Download the dataset archive (saved as tabular-playground-series-aug-2022.zip).
! gdown "https://drive.google.com/uc?id=1bLxeZUA6Xsk7V6ktU4jGINm1O7hq2hHJ&confirm=t"
# Download the pretrained model (saved as "trained_model").
# Comment out the following line to use the "trained_model" produced by the training code instead.
! gdown "https://drive.google.com/uc?id=1kE_1AFNtRi-5lOEveMdhAdt-QOjEWrFR&confirm=t"
# Extract the archive quietly (-q), overwriting existing files without prompting (-o).
! unzip -o -q tabular-playground-series-aug-2022.zip

Downloading...
From: https://drive.google.com/uc?id=1bLxeZUA6Xsk7V6ktU4jGINm1O7hq2hHJ&confirm=t
To: /home/poyehchen/Courses/introduction-to-machine-learning-2022/final/tabular-playground-series-aug-2022.zip
100%|██████████████████████████████████████| 2.38M/2.38M [00:00<00:00, 55.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kE_1AFNtRi-5lOEveMdhAdt-QOjEWrFR&confirm=t
To: /home/poyehchen/Courses/introduction-to-machine-learning-2022/final/trained_model
100%|██████████████████████████████████████| 19.3k/19.3k [00:00<00:00, 42.5MB/s]


In [18]:
import numpy as np
import pandas as pd
import os
import joblib
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import rankdata

# Lift pandas' display limits so frames are shown with every column and every row.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

Set file paths and load pretrained model.

In [19]:
# All inputs are resolved against the directory the notebook is run from.
BASE = os.getcwd()
test = pd.read_csv(f"{BASE}/test.csv")
submission = pd.read_csv(f"{BASE}/sample_submission.csv")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a "trained_model" obtained from a source you trust.
models = joblib.load("trained_model")

Preprocess `test` the same way we preprocessed `train`.

In [20]:
# ``data`` is an alias of ``test`` (not a copy), so the edits below modify ``test`` itself.
# NOTE: this cell is not idempotent -- running it a second time applies log1p to an
# already log-transformed "loading" column. Re-run the cell that reads test.csv first.
data = test
data["loading"] = np.log1p(data["loading"])

# Columns to impute: "loading" plus every column whose name starts with "measurement".
feature = [
    column
    for column in data.columns
    if column == "loading" or column.startswith("measurement")
]

# Fit a separate 5-neighbour KNN imputer on the rows of each product code and
# write the imputed values back into those rows.
for code in data.product_code.unique():
    print(f"KNN imputing code {code}")
    in_code = data.product_code == code
    imputer = KNNImputer(n_neighbors=5)
    data.loc[in_code, feature] = imputer.fit_transform(data.loc[in_code, feature])

test = data

KNN imputing code F
KNN imputing code G
KNN imputing code H
KNN imputing code I


In [21]:
def scale_data(data, feats):
    """Return a copy of ``data`` in which the ``feats`` columns are standardized.

    A fresh ``StandardScaler`` is fitted on ``data[feats]`` itself, and the
    scaled values are written into a copy of ``data``; all other columns are
    carried over unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed in ``feats``.
    feats : list of str
        Names of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of rows as ``data``.
    """
    standardized = StandardScaler().fit_transform(data[feats])

    result = data.copy()
    result[feats] = standardized

    # Sanity check: scaling must not add or drop rows.
    assert len(result) == len(data)
    return result

In [22]:
def predict_model(cur, select_feature, n_folds=5):
    """Average the fold models' predicted probabilities into ``submission[cur]``.

    Reads the notebook-level ``test`` frame and ``models`` object, and writes
    the result as a new (or overwritten) column of the notebook-level
    ``submission`` frame.

    Parameters
    ----------
    cur : str
        Key used to look up the fold models in ``models``; also the name of
        the column written to ``submission``.
    select_feature : list of str
        Columns of ``test`` that are standardized and fed to the models.
    n_folds : int, default 5
        Number of models to average, taken as ``models[cur][0]`` through
        ``models[cur][n_folds - 1]``.

    Returns
    -------
    None
    """
    # The scaled features depend only on ``test`` and ``select_feature``, so
    # compute them once rather than once per fold. ``scale_data`` works on its
    # own copy, so ``test`` is left untouched.
    scaled_features = scale_data(test, select_feature)[select_feature]

    lr_test = np.zeros(len(test))
    for i in range(n_folds):
        # Column 1 of predict_proba is the second class -- presumably "failure";
        # confirm against the training notebook.
        lr_test += models[cur][i].predict_proba(scaled_features)[:, 1] / n_folds
    submission[cur] = lr_test

Use the pretrained models to predict on `test`, using the same `select_feature` lists that the `fs0`–`fs3` models were trained with.

In [23]:
# Each feature set fsN pairs one distinguishing measurement with the two
# columns shared by all sets ('loading' and 'measurement_17').
for set_index, leading_measurement in enumerate(
    ['measurement_2', 'measurement_7', 'measurement_5', 'measurement_8']
):
    predict_model(f'fs{set_index}', [leading_measurement, 'loading', 'measurement_17'])

Produce prediction.

In [24]:
# Blend weights for fs0..fs3, in that order.
weights = (0.4, 0.3, 0.2, 0.1)

# Convert each feature set's averaged probabilities to ranks.
for idx in range(len(weights)):
    submission[f'rank{idx}'] = rankdata(submission[f'fs{idx}'])

# Weighted sum of the ranks, accumulated left to right (rank0 first).
blend = submission['rank0'] * weights[0]
for idx in range(1, len(weights)):
    blend = blend + submission[f'rank{idx}'] * weights[idx]
submission['failure'] = blend

# Only the id and the blended score go into the submission file.
submission[['id', 'failure']].to_csv('submission.csv', index=False)