In [1]:
import sys

from os import getcwd
from os.path import dirname, abspath

# Append the root directory to recognize the modules
sys.path.append(dirname((abspath(getcwd()))))

In [2]:
import numpy as np
import pandas as pd

from joblib import load

from school_budget.config import CATEGORIES
from school_budget.data import multilabel_sample_dataframe, multilabel_train_test_split
from school_budget.models import predict, top_n_predictions_ids, format_predictions

In [3]:
df = pd.read_csv("../datasets/TrainingData.csv", index_col=0)
NON_LABELS = [c for c in df.columns if c not in CATEGORIES]
SAMPLE_SIZE = 40000
sampling = multilabel_sample_dataframe(
    df,
    pd.get_dummies(df[CATEGORIES]),
    size=SAMPLE_SIZE,
    min_count=25,
    seed=43
)
dummy_labels = pd.get_dummies(sampling[CATEGORIES])
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    sampling[NON_LABELS],
    dummy_labels,
    0.2,
    min_count=3,
    seed=43
)

In [4]:
model = load("model-1.1.0.joblib")
predictions = predict(model, X_test)
# Format correctly in new DataFrame: prediction_df
prediction_df = pd.DataFrame(
    columns=pd.get_dummies(df[CATEGORIES]).columns,
    index=X_test.index,
    data=predictions
)

In [5]:
top_n_pred = top_n_predictions_ids(predictions)
#predicted_sorted = predictions[np.arange(top_n_pred.shape[0]),top_n_pred.T].T
formatted_predictions = format_predictions(X_test, top_n_pred, predictions, prediction_df.columns, model.classes_)
print(formatted_predictions)
print(prediction_df.columns)

[OrderedDict([('label', 'Position_Type_Substitute'), ('probability', '100%')]), OrderedDict([('label', 'Operating_Status_PreK-12 Operating'), ('probability', '100%')]), OrderedDict([('label', 'Object_Type_Benefits'), ('probability', '100%')]), OrderedDict([('label', 'Position_Type_Substitute'), ('probability', '100%')]), OrderedDict([('label', 'Operating_Status_PreK-12 Operating'), ('probability', '100%')]), OrderedDict([('label', 'Pre_K_NO_LABEL'), ('probability', '100%')]), OrderedDict([('label', 'Object_Type_Benefits'), ('probability', '99%')]), OrderedDict([('label', 'Operating_Status_PreK-12 Operating'), ('probability', '99%')]), OrderedDict([('label', 'Pre_K_NO_LABEL'), ('probability', '100%')]), OrderedDict([('label', 'Reporting_Non-School'), ('probability', '100%')]), OrderedDict([('label', 'Object_Type_Other Compensation/Stipend'), ('probability', '100%')]), OrderedDict([('label', 'Sharing_Shared Services'), ('probability', '100%')]), OrderedDict([('label', 'Reporting_Non-Scho

In [7]:
lst = [lsta for lsta in prediction_df.columns]
print(lst)

['Function_Aides Compensation', 'Function_Career & Academic Counseling', 'Function_Communications', 'Function_Curriculum Development', 'Function_Data Processing & Information Services', 'Function_Development & Fundraising', 'Function_Enrichment', 'Function_Extended Time & Tutoring', 'Function_Facilities & Maintenance', 'Function_Facilities Planning', 'Function_Finance, Budget, Purchasing & Distribution', 'Function_Food Services', 'Function_Governance', 'Function_Human Resources', 'Function_Instructional Materials & Supplies', 'Function_Insurance', 'Function_Legal', 'Function_Library & Media', 'Function_NO_LABEL', 'Function_Other Compensation', 'Function_Other Non-Compensation', 'Function_Parent & Community Relations', 'Function_Physical Health & Services', 'Function_Professional Development', 'Function_Recruitment', 'Function_Research & Accountability', 'Function_School Administration', 'Function_School Supervision', 'Function_Security & Safety', 'Function_Social & Emotional', 'Functio