In [1]:
import pandas as pd
from joblib import load

# Test-set frame that every later cell reads from (identifier column, presence/status
# flags and model inputs). The path is relative to the notebook's working directory.
test_df = pd.read_parquet("data/test.parquet")

In [2]:
# One persisted model per predicted score column (CH, CN, MT, LC, REDACAO).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code,
# so these artefacts must come from a trusted source.
ch_model = load("models/model_CH.joblib")
cn_model = load("models/model_CN.joblib")
mt_model = load("models/model_MT.joblib")
lc_model = load("models/model_LC.joblib")
redacao_model = load("models/model_REDACAO.joblib")

In [3]:
def presence_filter(df, key):
    """Split ``df`` by the presence/status flag column that belongs to ``key``.

    The flag column is the first column (in ``df`` column order) whose name
    matches the regular expression ``PRESENCA_<key>|STATUS_<key>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least one matching flag column.
    key : str
        Suffix identifying the flag column (e.g. ``"CN"`` or ``"REDACAO"``).
        It is interpolated into the regular expression unescaped.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(rows where flag == 1, all remaining rows)``. Rows whose flag is
        missing (NaN) compare unequal to 1 and therefore land in the second
        frame. Both frames keep the original index and all columns.

    Raises
    ------
    IndexError
        If no column of ``df`` matches the pattern for ``key``.
    """
    flags = df.filter(regex=f"PRESENCA_{key}|STATUS_{key}")
    if flags.shape[1] == 0:
        # Same exception type the bare ``.iloc[:, 0]`` would raise, but with a
        # message that names the missing flag instead of a positional-indexer error.
        raise IndexError(
            f"no column matching 'PRESENCA_{key}' or 'STATUS_{key}' found in df"
        )
    cond = flags.iloc[:, 0] == 1
    return df.loc[cond, :], df.loc[~cond, :]

# Subject key -> model whose predictions fill the NU_NOTA_<key> column.
models = {"CN": cn_model,
          "CH": ch_model,
          "LC": lc_model,
          "MT": mt_model,
          "REDACAO": redacao_model}

# Start from the identifier column and give every candidate a default score of 0.0.
# The default is a float (not the int 0) so the columns are float64 from the start:
# writing float predictions into an int64 column relies on silent upcasting, which
# pandas >= 2.1 deprecates. NOTE(review): assumes the models predict floats.
submission = test_df[["NU_INSCRICAO"]].assign(
    NU_NOTA_CN=0.0,
    NU_NOTA_CH=0.0,
    NU_NOTA_MT=0.0,
    NU_NOTA_LC=0.0,
    NU_NOTA_REDACAO=0.0,
)

for key, model in models.items():
    # Only rows whose presence/status flag equals 1 are scored; the rest keep 0.0.
    presente, _ = presence_filter(test_df, key)
    y_hat = model.predict(presente)
    # Map the predictions back by NU_INSCRICAO. The mask selects rows in the same
    # order as `presente`, so y_hat lines up positionally.
    # NOTE(review): this relies on NU_INSCRICAO being unique in test_df — confirm.
    presentes = submission["NU_INSCRICAO"].isin(presente["NU_INSCRICAO"])
    submission.loc[presentes, f"NU_NOTA_{key}"] = y_hat

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    5.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    4.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.3s
[Parallel(n_job

In [10]:
# Write the submission file: identifier first, then the five score columns in a
# fixed order, without the DataFrame index.
submission.filter(
    items=[
        "NU_INSCRICAO",
        "NU_NOTA_CN",
        "NU_NOTA_CH",
        "NU_NOTA_LC",
        "NU_NOTA_MT",
        "NU_NOTA_REDACAO",
    ]
).to_csv("first_sub.csv", index=False)